From f015556562ffa291f41076f49938111a674f3a60 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 21 Nov 2022 14:56:14 -0800 Subject: [PATCH] Adds a test to cover this edge case --- .../tests/samples/single-page-mixed.pdf | Bin 0 -> 21901 bytes src/paperless_tesseract/tests/test_parser.py | 62 +++++++++++++++--- 2 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 src/paperless_tesseract/tests/samples/single-page-mixed.pdf diff --git a/src/paperless_tesseract/tests/samples/single-page-mixed.pdf b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2281fd389180b8aac2e85c4211dafb891d946434 GIT binary patch literal 21901 zcmcG!18`+ux9HokZSOc8+jhERt798Gwr!go+qP|WoOEnEFaPg5_nq_ZxpnK+tNW^E z?KRfebB?vf7;{bhYLd%~iqkRCv%-?kpC0ePG65I?wgwikyu1v`=0GcB0F{aj@E^el zU}S6PWNmB%{0_???_g`>WN7RFpi;6l*Jt=L0kCFbVgyLo+M0f~5w$gR1SlHY7#TYl zJACKohy4=&G0O6v#troWOt1{XG5`iyTL)`>tN*OT_MerM-Rz734C-02>P!4((Z}`=;*w>v1EAo zOs6JzE<2_(HDu9Yhy%YY{~!9%^lx=MOx`!x9SGyF-F&-W)@^q>4L(J4cs;G|&h~oH z(9uup;7hz51$ZyVcbx-(qKc6n)u zP{5ma2bU8=8xB$RZ7)5*`8461iujXJ0o;x$MaBf|uh1ZC%~Z6~h=Ih;*P@cv-F8PG2UIMA@uGoJ9lg z5ObOr|!DPL)@ zq<9&zCZ%E&=3Z|Al)&3nzb^>b%)R@=*=nufa1*N4twK}+rJQUIf2UHhEb;)Bj_XbY zLzkeJo6Hma=V6Wyy=Frwnr@81$J0!nh|AfEpIS`|D0+@aVj6~aqlL=lW~V(I?DM+g zwDo4kYKD$oJsllg7Ly_5$zL7c^n9|zsVuE#n~elH{;=^Y$MIx3*SjsB&nORjEM^I} znI#htyU&kzFepT2W#!xk&zGuS?XlQunvl-@i->QEN=p63d-m1cAKc;A92cwgP*scF zKA&({^y<4VTV5eWIGlFdrAozm;v9!>jR7{k$?r!6G4A&#gKhnVj1_8D(<&>T6`%w| z+1bc$+Q#~bl0q^Hx(m3E59fcd$fa!VsOp33tpBjz@u>4#*5~WE&U}SrhQti30nrM= zl;7(GDBP4SSEn;_^0?(=N)vFsH%yAw0umW4>z8?bhe4y-f*FC!hqkz*hk5}e;qYg{ z!p~A2!f0-0X4@&e=tUj02B~5ox~Jj_nCSP3paLeh+U~-zo-IBUg^gE`5vYyhE{0hym66yI8kQ+W7)&OMJLw!^o`4%T(oR#k1R@!zanw0Nc^8PhglOz> znB{^<#J?UKe8i;!hUxoVk5wB?q}EH|c^^$@@4osgS71AE%Yff)ZEbaR@g{)?dz!6J z_%qyJi54#vkHq%V((yQF6W>TE;Ur$4zR{<`<8@`6Go4Il^LVft z8;B?WDNi<_STFdM$QVMtv3BHDMrMV z-1+hGaeIcoZGQ;3Ct7qoq3P@(Jp0~1w%zZS-Jjw(-|<<^kZBwarwGjc(o}c&!-+@X z27xXtFH<&m330f7JRkA^j~H#%Im-usGISb+Co8e#8jq(iTrO7=6;_s&!FIQw9H5pB zUG4PSlSaX_HkY4_(5L|#VW^!;!v^F!?; zdZGpAHUh2owPiG|#}152W)!Y%4i2i*hi?qT3cKvCriM3 zw`KJq2-#0^m`q?}=cI{{ml9xur%U`^fKaAJWX}|J4AkwREa`+4V|4p`xZ{nwH7ufjRqaFYqe{rgVbWTxaX82|=trEe?NS zgE5cRtWe-8sXBg%5ZGtsw2#j$xsoV#Js;pHCqLwU+j&61V+RFh7TA$aYNpcJLvP^40oTjZL@ zat%*Y9VUKPdg2dKAh|S{zdKt^$oB1{{nduZZ`wyRqp5hI?%Oj=OQcfX*Nl~_Xg|&sCC1yvDjvcQa01qGOZ~%uuLfE#)fIj`PQ^rNJ6~%o|T+hmhFnx z#meX7?b$TjZAjRqzAV_~=hUi`I$^arwAm(v4zUJ9AZW9W1`Vp}m=c0qU+Qgnb(e)~ zknUA+5)L1P2tnio-_Bl?6##q!OV<;n3N!?!S|0@Vp*EJt=M4ojL%^rY5@C8_GpB+F z78&auV1)c5PzGF%gMeYrHA(zj8eJP(yzk-ncv_%wqvw$WVMk9EGT`j8!BevmjV)Sd z-DRSLi8;_XbPq)d;pnVVtC==V)_C&do?(DV_R0V}wD&EG$Hf6N_X!`7kT-NYj64Y#ztVB1>-`MDP@^hT7TivfxhSyl@K+Mv`(iR^v&G(Uy~Uo-{VtTJ3Aqe% z08UP~+q=_+JX~h99z8l#?>8a=cMsh!v50UbglSR6kGtb}8P*IHHpcb)=&JqJvS?x| zn?NMbx9j1gq@>KMN9DZ$Bz`;F(N&SJ5OTCQ_5XB8o7I7q+46p+8_Ka`pMt&?Z-L4r z!)KpoNzT45|waov`3m9FZEc-4R;@D|ImqrKga40_p}h!o7U5fig zP=NBL@$U0-f2xMsP4oTE(*je?QM;_Htnbl2G4pbpjg3vmmQJ0mE`3C0&O{}u#`a(W zs>)yW0G(SXWw?b^8KD{jRte$q#Zwuqxmvxpk2m48(Cg){77xdvKVA?3K)cZY1L{dj zQ!JNLRZ_CO%fJKICfE$Gp%Soo7$_hlg~_0^J~Pwvc_YS;f6kccet8TD8F+OfZ<#-| zgLW8-`ft}C7X*@)^z|9NGq@a!mbta{CzPPq>Mf8X-%vS_#q8(Cle(>9DTw#G95t zgTH)@^i-6^cR!hb>a9z~SKw-7QICnRaRM2{*@Vm6%HmW{YZzfFNf$$Jln}tCntVP# zZfex(O4i46mIJS5^UVP1zGnE}!`t(pWSEJ=SO?YMB7IWG9?&~rGvtverr*8=8Guuotg(VL+MtZeoc=R_fL5k3!p51~!*(%HkR&9B&0_6rw6f zEHkRDWH1j9-b3IjF3g&~5y8QzUE~j`+;c9jG+JVLew#Yj?CWL6<8xzM&m1QG`hCd5 zD8S7dT-kaKYiiJ>95dvQ<^Ou^8h$M;Ez3XqPb&11yn+0d{|uE;W*LU0rceZYWcr7d z?akfa7QZ<;Ic-ArmDqa_5?W-q@|lUB5-6Cg#-VOw#{bccYq8njE^_s-KM&2BRI24k zML&>g5m{^!($q7GaelPUA1iSk{Y0X6z|z(>TxBcmump~0ZX3czLeAUoRrB^IPnnpVNL#9Zyw$Rr(CYOaAs{LNY>h>n_|EyL(T>ZinZ| zbXvQx2{pG%&hc0UxFb-lX)5OQO3mn6P!E^$#W8SQUgsSlKBr-LDQak~z+_8ApOtsI*K? zP0iSMXyUrBt*Sb-=(*4rO(RT71#UwI=iw{=C*TL;FQvAA1AH!6C<%o*+@N1>LEuw(1 zK7KF%h>n+O^fj|IIr6}Eq)_6wyIk7nbB+BO?yKA*#0Jb3th7GS9wB-&(F|tY)Cc&X z1@sr&7r(9N5}>+_P4gKkL|Ook=OCP&*>{_A2nGQGVRNdJYQFf&!J4R6qtpHVYYzi=RhwCc zVvIR}lAH#!7PmR*SjXFfUS-L9#}JO*QFVPgEl^(vM5^vCUg1{QbJ0x2qq}-O%sF%$ zV5llmuv9^qCG@5KS`|r0FRN~oq+|(9q7gcqoy0E{NI3c&b=54P3wM}xJZ^e4Ha5zV?X~NK!W{0S;PO91u(PzKTv?G z!q0w2Wbf(fk$S!4pBL^hq1y}RZi+K5PGI`sx1Ry#Jk8Eeqv4_aslsX(f4sIf$X)0( zbgPu8IVO8It0&9tsVndInz2XOmnYYqkus9fCJKIara{UnYuqTVP8MjS4}yTz8tP&q z5W*cM>uY9=M&eaiB(J%fn-L`$PKL3XXjRJ?M$ngtM~^NcJxsio95#6~j~c_M8<>QC<*TACgv)rEh2Lvx7z7LU9IimrVnwfH#q zvj-vq4b|_Y?)Q&THCl@03K_t@u02~Syzd}i!W z&8VdIm~qmQo^xx1t@%RgV%XmWpw6>rJ74Ye-|DQ!^HRi<=Yyj}tt~G(*^2?4PV}T8 zHz*xis-XX#ZO;F9*=A;E`LEe-Oz8aq#)KSvd5i9dDb11~MFH}?#lWKq-1A!_M4tco zYk*W!8(=CA3T*}9Gw1zdB9Q#fdQ;KEOy85MkCHP=qv2n$d^ zN%)3o=NRFk#xk&qV%N`V_ZGqXZ8x6t+FD+8s!Gyeitorji@Jh+&gC~h`YE9qR~Z^z z8Y_B*#F3_51umIUMYZdp*9_Z|i`o82Sn1(ZuryWxN!)S=Hd4P?ZmQ&uCnf{j;I^JL zSK?`9FjCZ%iluFD9pG>}c!cU})?J_#%j3n9xwk z7zoh%7a?R&Hg*L982$zCg#Y6z@*h{JU$6{fu0RPT;1`Mer>O)B;2&D3_{9bP2LSv> zt;9=MV{BxuFKp`y&|>^jvNJIQIJlU_U9k>{+mCJ|AfcsE1Jx(3_s0{906MYWLFXJkMoMZ{u9l95#Il)wTM1Y-^$kX zpQo9A<@|pN9@BrJ$^XWxOaNwfW)AlMJ$hwgWn%?o zrEFV*tP6ku+K3pO#2f&^TE{?nN7?`N+)i=sW`lt>{8e*}*sW-5w; z-;9DKFQfpaj~}eS+Ip$%xm_khxP3`0jpA zkOskCSW%TY6GsiWEkT?&SffZa08x>ib$4$EE<9Wh5nIUOGa6EIN(eC8ZwEGW)&1Z{ z(D0qBg3`Wz#HKTDN5Gm(FNGV4^p_Sw+!PXK4P{;@GKuNI{lZw5uDNAHYz za$UEjmnPx+b}?7$gpbquHi2oA^Ms<{xNF6!McnfcnM`kNB-#@d-4iN4TQuI;wjgb9 ziXcQ6xOh)csG&Z|u2G422ez(i4xoY*&z_WDs_=dBnlw)ODdep|GZb)tpM^M(6?85CO8A92)t#l+6m0Gs=t2KE$31@{QdA ztR5RO5aOyr^pr$t zo}L1SF9w0%?c$@t2e@cNdbItqyq^tto5B&jvX(yROTui9zc<^9dN+556nSzld5~k& z93WrEuXQG~5GESv7GIx;>~3qbUrlM>RHrpP2m2i2K_FmersI z#t;)!%WF_O{YC%9jIbU#xetTNJ}T9jpSI^oBlVGa0B|Sd($oZy?$NH7XFoJ_9Tf zu`!U9{Nb@1{hTBD(ynN4fQB!u_8{;}_>+?8S}C*Fr&$_JyT{AdUNS~dEokqjh!1vK z<33UieG=C3X(6Q^U4(>MuP3HljxU>f7yh`h;kwf=Copx^_MU*LayN8d18^s-lAHpb zp|>fn6Q<-T&u|`^26BQP!z=VOFu7hESHH{wMJ;{GhCZU+AkAlfb(m(uan!0Hfxf7b zJs7H43_G6dZTHa-DET@fU$4|hk``R9zhpM|a4nEa(6$DpY$I5l$bnzyF+V2M|nEnpXZBKzzc zEG>QXeNHV(PZ(n>6nU~_C$#tQ4*)d!Vg696{#Qw!D1-B;D=zzlq&~Zfzr<`+7rolv zB_bLd;O&x3I8rQZjVESKM4Yuz80x2dGoU-y28Ra)MiDqqe#%>} zgUa;@ghs)W?Tro^v1>>z#p}j;jPN`7;unY8eU3-Z?so#aaieVhK3OW{>)E_WSCZuDrQj!;VR$?|(zhB8q*bGZ#&9qkC zPE93GsoonlF!NF=c|+%JV#l~wfH*zFHWYYAIebts?XJ5?(c=iy52U4uAH)wDK}_x6 z8=*b3UB1rmyJh}%8HG8k4|{RTd7U%hj*K3BL|v1_nvc3;U;D9@X9}*t2#wiGybntn z#GR+0POVM6{(T^5IS?sGE^xjN>8UJV2R>B8Kr}L(bkKa4`P%#1@!C}wE~$^jj)o4M@6PFkd+e4q z^?PIyRp}j5U33*mC977wFs?LI0+XF$RIWCMbs+Bm$}DH9xy$#Qvp(5%KqS^DeRs zn-T0|?|h#@9c*@9k{O|AbtleWS9qZ;$OK7{XH~)RlxU&v)F}L7eC#nFIMI9DI#4cg zX}h#77_g&1p705KRc5su(-LX8MreT1E!jnJs=2#+R}1Sdp4aTll? zOXi)oPd07HkPQ(BbRCUnBoF4~0jk|(NAQ8S2&=~YX2swzEO>n_mFpb z>;WX{tf%y`Z|WaF(A|Ayg$&M}Y}4_?vfg*NH@UY?ywEApzw|q9smJeC@Q$L3L|x0P zafII}iblrVaYgENy;oopykpeLig~!dy|UB*LvTQM!>9D=yVik-0{XWpo1J9_+}41Y z5(n2e!rxCv)$SxvB19!2aaCJg<<9w$4~hjryubc^JWs%yJl~+9M4du3HiiVlIrh$3 z-gD+6c=*EEwW%`V0XoG=U@nID;>=pnDRnj3eDGPbiv+9+c!<4MYT;gXW+zX!xn|`+3;*lAOv1_I zs7!X^th6u)|D=mmj9$)CB-J4a+MLd=x-QW2KPEFsNC>tIkD~$B?qKndg}YTV`PKt$ zfp-3F)KJzUmWzQ|;T|FwN%nFj{`Ua-KpH%gq2Z|Ns){RPlcl@Sh=d&$;rTseT+C)s zH%#oQjIf{md(5^JoFP$=Hn2A1mbyNU8KCp01w(0hX$Xyr_IY{1jrvgr?HKmUNfPU( z0qU*RuJ>yfrTqF{zddAQA(pFk@gi+;bT|0e67*#dmi&y(T}9ttP*(0HIm1-Y0#@?UI)f3G&GxSdwcoXs5)yyOTanXqKs^pm5GujVFsNx)3!~!!34o7FUruO4sOf zN9QJjAR){|8OT-+v6BYFVMPqQXMr?e_q2dc4xbi{o(MY)UanSytaVl8uY#e;+kWW= z`yqJrr0YJ!N@iGCbK{~%mXJWN(0_+c5F@_V+A1x&zcZqAwuekef05AK5fv;PkqB_- z^kPIjY|1l{=a=;{p)kCXv1Uq*_H(^`IuYfBp7)|d9*3q(4YEz_ew8Kw|R;D-|HYQ~@Zwsdy2QM!Nn{tYKX!1EI=ilU| zu4`8oy%GuN_b|K|L3K?uEO5$rdj{B_Wb_Lxe~jpUkjD`BqS*iTRtB}aOH2DS$YZ}J zqZ#b1AB^9KL<)a^N4h|{nC07Z2jPj-GF~h!JzYNaD9kG%$K@3`&Ku}2JekR`rd%3cI0qv$Z_gULRtGyB;9y6uczSQ3*=J-P3!4)eCGi0c;J*s@&XG$ zDxOLTJqm&@9`M6|;1(O9EvG~`d0L};odqyj=j)@s2{1HA5jpexMiD@W%7H&BVU*|7 z?fsl{ie;QVY{2{``mTA-b-L_?lsj3m7_LcIz7SL0FXp(Qd zUip@3y=HIjk(DPRi&yri#(Q-3y_U6VC!I<+0=%lKfUE-N_Mzv6`yK7RT~8~a+M_SQ z9KLgBfA0wrXHCOwkxPvUF$d3z(o!VtzC&*-q3!e#K7GpQ-@dBj`d{j6RvZQxBWqo8 z$cG}!h6cpdSX1{;9_$APHleGJf{jJ%8(?h)8-K&?w zd3YYrwGE4g54Qv+f;pR+NWYL?*+Fn{4l4xeJpf>yI5^3XK>W5+Iqu}9XWC;=M5z1e z(^ge)H}F^st4EyjVXyNTRj*hr&oh z?h=i$;O@^ord}f2pk9P+wz@95yzFIG#UBReKD5tIsT@|wQ$U6jJ* z+3M$2mmoX+_5uVyQe^1dvB6IxuQ)UVqr&J{ixouI(^MSaOr8)92gBqPO!50 zg=joo&~Y<-dc@!u1DL%a_CR#&UQuc38tdt3bgNCeP(g5N%3K9ysb z8X;eAnHWmNqw8}M%ie=LtQk2ynzoH2@E4daqWPE|T=&#TE(5N%$s^~2KQR~J++l=6 zkH}p_voSrw0)d{uzeh!ReVFqD=uuMUKTDAy%c9cj#@BGctsUiPcAZ9{VhG6=i1JYy<+)%-t%a~8LKK5b za^dDWk90&H5${B>L^u$$k_pXvP1cS+l1FNK=Q%`EoD(aM@djK)#(#D!_}d!EHl0B5TVq zX`Fyxsfj<-V@UrFvIeSa_le>v4_Djjr*6avs~0O|G~*^oY=vkOJ)^!q<-s@ta5%M= zWD3>xVNrx9QL7RU7c%;AD8v>unG*XFPguXiAz5MAyr|7jD`(kUqcdJqe2cu4J=$N< z^+2wvxxaw9@m093unOD9ru~E6Nb7Jhk3-0Gm$-vNsax>5H-Nbf*`!*+!D3-+B&)7) zdO_i;Tsn)SVu9{S9LRrDKm;OPl}VkjnLQGQeO0a>D~%a6FJD+`liEV%04eB3|f5d z@UBn@T~YQ9=#ZJPkz6c=NS%QnIs>q@q5pIXO9>aytZ>=B^|Sd)$Y{)ge|MI4$PE}E z>g5y+5FL}Ey(P(oR$%qGmR3}EAjK&v2SRybTBFgp z#BQ*5jWpl<$>?WO6219@Ly()&{CZSw+*R|`7XPHYrb7BJeA9Rz+VA8~$e-q|y$k}ZWW zB1}K^gpDlp(3jM9atK4&^s{(C|9<=gH4f7rteA)v6gU%R6*@35^>}qF$b;9}*^g&u z=1Hd9`mQ!^_!q>%FO$-A420{~!>DtmOu5;lW`SzV@K4?>^Y4V>WN-|nt#3x zQ^$`8+&OCI_4rWt2z)B#|F#1em=C1ZWV6RE4R4 zmchj&!mhMPScJKbH&f9jbg{o@7q+SFUE%z-EUu^Mgr}gffJ@X)v;h;SzP!F`=v~d$ zys+T=r?q4Kmkpi)9f86%ntaZ{(dzI0So8&A#kR1JP;3l`L8cJiB` zTHb?k6T*Z30^8I>xKki+j-+~*wajV~#-F-+H$pY!bk}SOwHlA?PW=fq#9Q7ela=w? zGbi?X-fLpsh^-h6<~LrrYyTY6Ycz&w=D$|SZ$`;@?RD!TTBswdydR>3^E=9f^IvV> z)pCaPXQ@{CTTI01+C8g~RM_*&W40PRn|L;P`LNg73n<)Z2ugGK4cTw;aQE2qhQgmB z<;Cp&Y_H`=RvC{5h+zE5ZE{e&Z(36~6dT`O*&dS@5u1F7+YA5U-r!`YrX{xLonL2P zQAe`0G!ma9Nv}Ujs-Z<*QHlC>zaqJbX@^6k)+qzmZxOsq@pitDE8$l8XuJgPNF5Pu zw?mM5+_lyWSD7^2gIpPgC(^%50i-RRp5YhKK&|?oY-FN~9WQlJ4rTh{95);<1uL#4 z(hq0@YRog+sZEtizOn5DDEv&Rg^|;1I6fqlE{`0>rA|-YEhWz8sioH8_qMoa*BDxH zXD@upRnF{{UoC2)BzZO12Vj1O{)MBVjT13{9FOBO@o6mO!>I)^dDnVQmMoMktQ3Vj zU6L-UfFb=u=kJBER=72i3>#+;1uD5rDR=xvv^X&eB>O?4RvWDp`JsXNoLu4niA?GD zWA_y!v33)y@2Vq&AmDw3Q2^v{lOnBKED;{ha+TFzp=+Zy=nII>7l+**jDTGn0u;Hq zP?Km(Mp9}$p$3uJBs zY~ri?jwquW>^dPjA=>}IGvW_0fWFq^EjA|EB}QTLe?l7ZLpER-5kEsf8PJYa7*UWD zA-oel>qm((34KNuJA?8A2t9bLSXSZ<)kF|s zS2{I(1saP~2e$D9i!ox)szf9cETv>vkA8uaFJ{lvOi=sVJktkrs$Wa&9!YAcH)2E{ ziapGPu+?r0x=DFXFLP=@3;qF#PP8k!Y7g%KjULEuk{!;j3cgyO@PJAU^tFY01)-lM zC%XvRu7!HID;=%U>f_tr&}Bz$u$1zg9)ucDdpd2u7Q`yi4wNcIf#CZZ)N2a$ASMn{ zZ}{e(#n9$Hwop7hwq5ao{wuf^%xm@*#OoBhPIqXVLszH@p$@bql1+$H#(SFq5j&*y z&=#1ba-!Z&JG5OdJC@yQ&P}9d5}ROzP%S;uU9eE1zJM#Zfsk$Xfsmb)(C*+ri%8oj zEh1h-gh4zgL;>x{M2An{>rqc+>j6)AEm+szqviU&LbZ0(LhW0K?D2Mm%={)p|LDUR z(D0UYK~49FoaE?34V&)h53oP6?aI1>+hYwMZZi!W?%3b|dZOd?d*X?1-;o+{enQy2 z-1$C$wQW9trC-Q!=8aR;EDQ7&=VnBY#quv zw7Oq*;2j^Shkk%#+igIg*LIO(N9OCQb;UE#tk1E_GgH+Ca_R8K(&76=*b(ZA`~c{h zz6NuJxfXH7xh8!IaQ}vA&kJ8&^15TV67dAn5&rajnDh#FS79aO38y3ai3?v|H^dOX z+m{2mJ5ZqXb(_W&?V15Ure7;mx1Z<4(hhV}jCb_9`pR^`cAIaYyH{vH3I7w3sHD2j z{0aG*=n7@GIn-l^hcnO>5|Pv!iHP*0rxuJ~Mb}T3)4!xoVc_%R?(>!Vz4~*9`@P&p zquA$~h5J2%==jc1pkQ9$GfwvNOn2s3q7C>GPssf~pEG-$-(xme9KYv6iUL=W$!xp1MWQ3)1A+5 zd)0$B%~%mL;PLNG0zk~8-77}}StW=~>M$qKs`K!U_%yFMNM`*C^ZaE+A`{OR3sxG2 z@6z(}y#-e|_So<3%JXMNB|UNKHEAWEqr^q)M+Xe$hIla}SaHw`;_^lVkJu~>D1OET zFB+RTnM=x}&C5p(*F+@7v2903D?~5QMepXuHydG7;cfUSsa;T$t7ft&LOChKOHROR zkW0}I9n%zUH6waC&i%^SXTYe&tz*8r8|j>{NiqJGA6XwOw80~!8q-*ZGt*SM)l|1L zZjId!P1sKXVkm2EI=~+GzQR~kPfg*Xna;Wz2vey_L)jI5j1Ex>-%bH$8QQ8I90{MC zzQbYoBRDRKy&#Goi$$?>90Q(8^+y26zT1b^EA~ae+n&|8zwnRfcbA8M#M^OM*Q^pb zu;(=@SQn>uQhSrdDNR>2@XOmF%4OkwWyN!*%($)D1I@vQ0D1yXKsY*P@yZttC1oyeK<|HiAv(OXRL;j(-G`yd!eR!>KGcypE5Kq-$Jh3(UH+$wQPs-vu3(*y7af-J&VD2GLW`IXL{X1!p&#*9b8tTb6>Q!}_kml{ zYJ%)T_w(zs%X`YY6T`0q+O#GS#z%S!lm(v@JacpR$hIOBxGDq(#XE+DBAk&0XY9J8L-3&!;(3GB{E>GF0kK>fOqp z12q_%OHBjxULx%viH&rHtPGv58nqxex+T5r`M zE_m!}M`jkCN+xEpo*+anA%BAy$`udnXm$O-j#wEJ|FKI8XKJ!%aA?g!Vxm{OO0q{2 z9AoDcJGq3N!Ai!jN8*cqspcHMvub^}o61EAi9(JVc^6?7W%fgXra65Q=ujx*MJblK z)|(!EXBx(&1X*@A_n_GJJ2A0s<=a7R*)Po{1+|r|q-mFU&#vkHTLKCJkLCv5e2`M@ z{P?Vh=B&SQt~jF;2a3#f#F94b;G-ii)AD`F?0KkLf`uPo?M>iaG&LC0$({aq;6F=v zys4AamwK;`6xPwkV{+$CLkXqKdaLRpm?C{($o<0eq6ZTOk9v1&$b#xf2F0YwYnofA zDDyCb<(1)NQ|5PScz)%D0!B^3jr2jr$r2~mxm-?fNi)PTrg=Cc&{eD255n(SUhBX( zsyqxb^Bs-(E?s1{>T8-FA>ApE?o1Jdkcnr_S!N!H_chDPdqr|yPAn|h0z5;1%0z) zOs(bUY1OsJ5q<+#HTBSwFtZ07BR1V;0F#gk>+$u2}YoYA(`V9v_z1o_b(&Cq+;i}>X zc(Sgr)l_Vax=GEh%{ou*`K%w6$<>DC&!A)MqF{mFh{;T%^C_a3YDt-rBjC5oIEDfv z2U4LE6r@K&Qx+ws9(h{u|D)7Bj*|gW#+BLY4eGKn5?Yt^9XTSW2@HI6XS3=5rXkPQ z`)f6mJx^mfgX(`;fhnbAhBNOcL<(EbO{$$lZxbbq4Hnwi%P7vGVkP{n!CYOr*pDZb zrb64LOtk(faDCT1JM5bd`nf);j{rdb7 zai=6*FD>zVI?HtAd%ZBe|7xy!Pe?_h|E8bjbQW_b_BCyWunY0tA;+`>eQ;&6a&WeA zY=01-f(oiaw05ZCe#xX9JCtb@VIY-O%UW7Y^DCE{L)1)M&24VD<-I*mW^63{22Yk% zn3Z8esG=3w428eh=@Hncbn#LkwVEpIY#hstn>%#*$;#_S_6;j&M{qpf?Yt;V$(Aru zaJqy>bm&>KBlGPi*7z2U55y%{^pB4of?8~t2r*N?If;%(GlCtuy5Tr3%?0u)g;2v` znqsm91>w>V<17i&>=?u`ng;qsc~rT1tgbg#m3OoFct~7vj3%}B&JVC<~A)qe0=;3s6N;KR7s#-Z)TC7=alEZh-n4gc9|I!&MK1SSe>Don5SJ} z!)-upS|uw!k+cFWn0tqJw7N8(T9NY@{Ad!&{`kmin%&I%xNg#%2KNcD_G|9#7S`P1 zl#y%4utSX){fm28;X>~KE8A^cP$!lbPV(~Rj^jK_xkA0eL$E^$IYbSCbw^-cf@$~3 zH9FXvR7V>3lx(Y3g{?VFWCs_LcG{6b(WGCW#!*`a=|uZ*Hn$GXUvnvm~sjl_lu_+u5*k~?2xT^350}p-fPdf<;$aa%iQ}qUcT=k#nDBY zP*8v~9ixXo)^j{aYXj`d7SqB+Ytbp8IaSvlnpjcPI)NW>t|0&3m@B9cqJHjBlu(;( zB>q%Hu3blSd77mQ%mzFlZBm%r^fP&lr{i$A;1}*iffcW}+}JY@-pkX#Ap2oURZqom zcPsg8xlB^HtTTE(kXlVkiP)Phb->Wi@g(anqeM_{%}=A<(%!6Fc1+*3NS(6XGY|5e z6VCP*u^k|w|F)FzpWR(XgNj<47Zn11H{q!*H}^!GH^`4bqvubfH|zr<3*|PWG+q2@ zek4*KK*~_?$u%QImW!;^k(&rC$2wb2KOd}IrESV4W-uS#bw4Po%$npRt6-7!TpEkX zGwbFXudEDN&psTb9|@cnjX9h=I=+CTA45bl`+K9}!t34AYG$=-XhVAE`UTQ9pM`~of+QBvHv2g=u z=>-Nyp?tQ*VAbY!-I0%iwd(zwNI~HJfQWuDiKD`?Mg^$0hYO_{JePFnC`(wVuiLXErauQSkDn z10@(GEvb{zSoVZZB0G+&z{N|)akDqmFikf^c3=IM?C7j`bK}EJ@u(+_s=up|KAt^G zLq7;_Q^TwZli1#))?Jc@cfsZ{6+D)+K!aBT_mCIf_)c+i%^$BGF?6*Fl}iifFt-s@ zZmt+hL#*YYDtha)-L;j!`{rk`ewK#GKMi&Q*`1jzlPYn^rV5u{(pIeKxqi}-!E1of zOaHt0ooyt)CyQV%l0ee_O|CUFzjB3JUc~GYr_-UJNv2(D8=Tf`vfHrQX%`c=IuyXQVgHd-|*qG&b@vJ zmg3|ATM}cUO_`BN7miQH$3O2h+2{^iW?S!T5e(fUs^^;D{N7!(&i13Twg<@n+F`() z$qyi$)Tq-c!?sJ4#5lC@iqTgtnl-}Vf?{@yH{sn4Eo56t*ZvvdOc_6z#a|HSyoRTI5!u*9EvQAJzH{xkCb%N=KEA)L>Hg&5ubf&V zeFn!xGV^3rw5?6K$5U6d`JG>*noNh2%CKVz!BXv~D#4A)w=`bdTX{abt{#P5s?!7) zSb;*sI2hps649o{-z?#{EAX)R$6ukyIRw3V0o7)OW-+Yg##6*&*%PA~2w((ovTbs-q@P=*80l!nJ4KB8h|JHv z1|q(xwS)t>@U7jixXz*QBZ8s2P2R@5T0!odOL*n{b80V z)n%v3^k1gpWKMbLy##Ec0;@3QQr1O<3+Z%X>Pn*S7B6q#`%*|czLW6TN5oI`L+gfL z6+x5YVUkLJ@5>~i{Z2yH7()HMFPWs`OZ*+4gs%S+WeL5Pp?DSDLJ8brPGDFZPg+d> zI@DG4vG&wV{O)aK72p#gEff(c zQl+DmAT5AWl&UlV>4Hd+UIZz=6W`sn?E8Il<&Qb{nbV$`=UkI3`Q0y)jVe`1t*L{_ zozTa{Mej=*ug$7Yu&*~?{ZLLCYB^P)jT^kvnS88>zd+Fnj@2JjZ&~@z;AY&97U?S5>u_^vWt&|`Ct>bTOAiuJLRqJvfhK4q=+2r1{5tQQ{%5<%UBs8@n zQzNTl_b=4D^vWKjdX)?l&gR|L(2X9kK|RE&IO-N)`~vvKq(ukYWsjhron#qGhPe8^ zcw?)_Q_csqSw5aG`b>f@K9%7GpyzVS{3vCEfUj+?Mdj|nTt?T-`f_|bY??OL?=Jk7_wH@Czovhc^}s-6nY3t zWcXu%RzGexp=pg?8?v*zmqNKHkZcjXE-;}%i1E5-ke(#g-swP&Q7%*Nu=38-x6LCN zRyrc7l}EIa3OcaQX=h5#pmBH$<1yeThD3}HPj6ZNd(XV64t&>uvsMWGlBjJ}Rho(dlV%thWe zy;=wg?lXMp!GYxP>_&Eb3g_gFe{~T$FQu&f+aV>tGoL5a8|%?EYl?1W>%Hgi$VDtV z;eF4xZXg|vuktllc=~*Nd1AY!^1jW{ZpSAz8%^3z78*MW=MzQj(hT}R0YkjT4hCN& zKT2J?cUd6x5CPOb$fI@2l{4cHZF9~co*q`x?<=Nbm32nxwEoL;x$Et{9q^8e$I7R+ z-USffDS7x${|TSCyQ|SOp2Fu5+>p)g$w>}~VSCTmV(=k*Ja0|6(E`b=Tl;9F+C9GRTl2s$ghXpPxk$E4uP7gOP@$TVuUUwvg zs_$@!aWE@H&s#N#?&?PwYA{C|-3rmJYY4L7NlA(fvtbU2jy}f{NrK%!9LWDpNBL+? zJ^cf|6F7R1{jtI3Fub)T0u<<9qZ0JmB~{@TMa-pZ_pxMA#nQx7j);9kYKXQS;%>M* zEwY-oq53ICmMVxj;nfnJfHh3zf7z#adb(_2j*ep~JEM?#%g z2AG;44tBHYiE|V<4$InkuN}<7kJ@DS4d(H&Sk6|Cm!?h>Uqopn}-5oK_E9mW=b^M*$bfw#5-elNh*QDNL z!UX4-fqWUo(cICzpaAmCt!aY5VI_TCZ@NBBeKMb2gpa(PE3B6PqLA-Dw0WVUo>r!A zb`Rs5Dbt?M$)?3NASH}7lR_SlLP|;@bwc911RI@rGjUX`WxCngK@rx@H|XJ3a7OWI zpB=eT+yd`h{M5kgz#>H^`KEp9$4GUSYp844AD>C?W8E8lCMGHGb>CD7s=`IOc>%=y>>A2zro1+nt;1 z>HEeV@vfvZWw{Ev;vvZ(>6s(N`T~2@uKLC6_>(Js+%({iLJ22!ZYC#@xfCW&DMV_M z@Co@hz=K-&%tVV1?gK?Jmnf|@b;VamwCY3gz&FBm#(Bol>XeF`$DXz>Ik`&cTm~;b zZW{_w_|LXzMSdP-e5Z~7|0@j5Z*2%XcMHP=1inTthIE`WM)qX z+>Gjwc_g`=s=`p7lgMvMyN>k0rGIKL^G4ma)sl7-=d{)p%{nCu=DQGeA<@+QUB5!I zbbvUjL&Jysp()R>P?GiQ$_L*j))No5$su`u^1Xa$R68EqH`Dgnw065t%H7P2`c;9H z>^~CKbZ=E)&FVfJOCp$y3oV^q&f*prXP4Sv{PELuXNh(4_bLd(!)`mYotu z_LfB~#Uzjk`zOJ*>cXjWJlO>RxDUeO&1tlTVm z+o%f`N7?3*ORMfWhQ>ircTQZXmY{i1yvgW=OElr_v5o}-RywR{lF_+OEmtIkd6spW zb?%(0gGjHZb4GI&UZIv^E4#9bd#NWY=f{@IKQZIBFS>euXL(t)Cj8z+Xs)jXi3 zS)uSc22s=~!}Laxdso+PX*#a&lTVqPDpbQUYKzxePJxt|3(9-?^^ezGO}+79Fzz(M z5L(KY9~UO*KR5Pv`--|OkMYns;#|L<_E!q<=6G3MyH@GdS0y&k4i`4}LNSL<$RE`? zdfsvywL z#u^bH@S+#tD6>5(df8|}bC2Qm+)Vb|Qe}j+dMxgxQeJ%rtUaNQ`mm<6lA5=XKR*}# zwm^U5x4D|eV#6x?0{lSqnKhddeZ$AHdC%9wvhEv}$kTf2rUoO;9NI{+Ow*Uk#Yzmu zmCky`MCG^Xo3@!hjH>1H>uB7{nW&wW7R@-?yM8l@Jwru>wJ35R_SF(6qu3;{+%&=5EX27y38PzW$G28o4Z0MM4+0)XjlVW4mL zCmq%Oi=~DB;Q#kuW|)5Me_MQCG3Z-J0HRgJ(%qJR!M_*&j}hQm&2#iC=$!%RDo`>Y zs%xG*Z|MqP#WukD2_8P*WSoyXuy{=uJ0cwz*0;4M06qXdAW3Dwme3aPGt?siR)qu? zdywSc%>WVS>FVlaOLsg3LFi<$5gjQuMk67n0GbR%gJ5Vh7y}2Q1_S{FgMyGq6c~zx z!4be2$QT8M0JT7*O+hFO$oP9D1cU@a_PzZ5?0bjbweM#z7z_+Wp|Nx*7Z~rK5(W+j zLx8&iH3eayAPoG6t}(zo;D1+t8}@$fiT^j#d8!3;%!i2rw-2 zM_SjE(&`cjVNw0Obx$UX%ofuC>t%dI=}aAT#sTjM1z`=8}eMczI{p7lB|;TGf}luHQ& zU02c4)9?w7vDFAV(;^tDYyZioAbEGA63-#Z4!pn-N*JW_axAn2n>BP7m{tH%LJ3zg z`7X3&@oVMcBub+~j@#}oxqf`!y8EC@6}4pe()Q|p4T1Y<|K16H2=}S7fNYwwBAL`%pL!x28>VN7FNNFTMuK%JTF~5!r zgF>J`%+G^l32b{Lx^*g!a2<#$U`DD>BzgebBR#yp)=J-h=tw+0)b1XZB#&=_h5?%f LV81G+s;BlJD0G@z literal 0 HcmV?d00001 diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 858cc7701..67c1ad859 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -37,6 +37,9 @@ class FakeImageFile(ContextManager): class TestParser(DirectoriesMixin, TestCase): + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + def assertContainsStrings(self, content, strings): # Asserts that all strings appear in content, in the given order. indices = [] @@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase): self.fail(f"'{s}' is not in '{content}'") self.assertListEqual(indices, sorted(indices)) - text_cases = [ - ("simple string", "simple string"), - ("simple newline\n testing string", "simple newline\ntesting string"), - ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце"), - ] - def test_post_process_text(self): - for source, result in self.text_cases: + + text_cases = [ + ("simple string", "simple string"), + ("simple newline\n testing string", "simple newline\ntesting string"), + ( + "utf-8 строка с пробелами в конце ", + "utf-8 строка с пробелами в конце", + ), + ] + + for source, result in text_cases: actual_result = post_process_text(source) self.assertEqual( result, @@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase): ), ) - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - def test_get_text_from_pdf(self): parser = RasterisedDocumentParser(uuid.uuid4()) text = parser.extract_text( @@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase): self.assertIn("[OCR skipped on page(s) 4-6]", sidecar) + @override_settings(OCR_MODE="redo") + def test_single_page_mixed(self): + """ + GIVEN: + - File with some text contained in images and some in text layer + - Text and images are mixed on the same page + - OCR mode set to redo + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - Full content of the file is parsed (not just the image text) + - An archive file is created with the OCRd text and the original text + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( + parser.get_text().lower(), + [ + "this is some normal text, present on page 1 of the document.", + "this is some text, but in an image, also on page 1.", + "this is further text on page 1.", + ], + ) + + with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: + sidecar = f.read().lower() + + self.assertIn("this is some text, but in an image, also on page 1.", sidecar) + self.assertNotIn( + "this is some normal text, present on page 1 of the document.", + sidecar, + ) + @override_settings(OCR_MODE="skip_noarchive") def test_multi_page_mixed_no_archive(self): """