From 1655d85a533d5778ac2a8c648e30d3a936baa837 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 19 Nov 2020 20:28:41 +0100 Subject: [PATCH] testing the tesseract parser --- .../tests/samples/simple.pdf | Bin 0 -> 22926 bytes .../tests/samples/simple.png | Bin 0 -> 7913 bytes src/paperless_tesseract/tests/test_parser.py | 221 ++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 src/paperless_tesseract/tests/samples/simple.pdf create mode 100644 src/paperless_tesseract/tests/samples/simple.png create mode 100644 src/paperless_tesseract/tests/test_parser.py diff --git a/src/paperless_tesseract/tests/samples/simple.pdf b/src/paperless_tesseract/tests/samples/simple.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e450de48269ce43785b8344c63e233a1794abae6 GIT binary patch literal 22926 zcmeFZ1ymeg@;^!l!6mrE;Lb3(yL)iAVQ_bM2@U~*y9EgZNJ4OTf?IHc27(3mH{|=> z-S7V7z4P{+J?EYO***+?`*z*Bb*rj-daCNvG^&!)EFe~HWSZ{c?w0P)-Fe9D05*W5 znGLd_AW#wFVCiNB;DGk10i~_&+#oJMX**Llh$IB;Xbuq;Ms{^`ftcDOdu6l44>H?6H;@?p^vKWPs#P#ZGbM{;Bcf{BGr+ELKNlvYYRNcNgtX+7@aLXT zMQN!S?3XnMOGXzd6?Y;rsx^sOB+DXSS48V9%8C_*Nre0Ge09*fJ;tB)nym>uKSOw1 z2i!o0IGFz_FtqiwM&zfZJvBhw+)rnJ_woEU1@Qha3iwk&AOMJsm!0je>e%A*b|c=( zS#^}IgXn*zCa)u*nWXP(wA20EkL1j%VyEC?M)$S`K=_(QzmCRCLHrFiECjSQtYn`iKg zf}%nOaWK%_&+Ku&A#j>Q@-?@j>#2p9dZv4QKhun z=@em(Dge&env$D{x9Q_-*cI_>U>>Rgrg4#rb67eijW{P8;mu->2nuC92$yD~)|^om zof)g{JNi%po%qS2uXL^$$;LVc720v6ksjPB{pbm!yHQ(d{s&oogF>puBi3^YH8K8~ ztf=^&Z>QNYfr%PP%}Ba_X=avrD9bVAkH*pka_wzWhja;v5}TSXTYZnCH!OGA` z3&Wr_z7-7B5)oa9ALHmvT?5AkgZZZC23wcJ>T-OElbRVKU0r;Baq_`7Pq-kT3Z{JZ znzD>tk*w~s6hTUEMXXn7y`Gwr?fjkxs;nIJ_~l-gs<>$-h<Ro53Nw-;(BpU?f_z^C3`oT3wrR`6@gqyKrECgMXzc67xJHqs zAT-Dx8^>$LdmKT)E37b`Q9HMosc9RLS$SU}H%%K8sPn~!;@wJl8+r3Ni~|WNKE`!R z=<|F3*4*42fu`oqj{85Inim%J8su5@xg8h26nNi<@6U2i3+$78s?@4}*VfUtWWkk6 zeAe{ldtn!>Qb4X=udArd3&K0rj2b-D!=Pmdh8w@li!_F%!*}lAmIHJV5!u@`n11Hu z#F}Fagcv7kuQ4S`TnuB$$LG*7l+ct^QSXK;nPa<~;%{0m9&|Yq?448ky<0xS-kd^R z=`@)^j=-TZt1p0$iI!&iVt%=D96Ou<>au%fn$^mpvHOmuK3obBkAk|UuHVvh2G0bh zVd#_TTdGX6Jv;Kv{=TJ2sA4{=8zx zVGa>A?xEGeV@B7Swbkd+ z`Yz5K(Xo}_Tt4>o8W?%ftQ37A^?FYCHR{9eQ0jBvmGBcLV z7USBIYAT_SguJkPyK>eTf=DHgI?IA7lk=OMias-*WM_{oKsStX;f1tbxPT*rG)H@JdR-qiMbg%YftI!VPiy zZR^}EJtn@&S8k+jFr_tRn+KzvT{naNdgcjyzj@^-Cw4W{vM74GX3aG zA8%4J&>|DQ4h1z-uCB}oY#P?Uy|GYrt+1K%w)kn}x`2wFUsXPfXf!%W(eEb!UUN;={~aG}&ptOzqXF$UaFB(W2)RzJSXYod?!X>MwuK0cJ@kv?_Z)Wq0~* zGOg&X#OHioX*4tz8_S6BMI3fc-aPx9SV>#!LJ6SP0Y&o|8J_vzMoHtuuMdn&y(V1R zK3q=dZ`GNZv1=&=LdVU94vAbHoVU;;EGI@!=NH-SOr^m`dwB(Y2hn07Nsh@#q^8!b zog){pP5B33E|Gl)J?KO2>I$`2g4eMdGHjsV|+;o9-(THn7OA24?NE{GgWdW$|4i-A%#Om9y@vU~Gu zf_#`FM|CtNfv^t=Vv#jFC!namky9zp<6{Wl8^lNw%}gptv8L=)vGr7JU$w5d0xfO@ zO`Hb6y3uS@5GCb|O^vME)$Um$SdSk5l-cGS^vgLtmnCt;I?6gFaT1e^Kycs3X~0)8 z#@Ld>x3EadXZ=fh*Sy_b)t4{;X-ds?7e@fOpdJ~0__})@Tj!i~EmyhR zMIaQ*Gq&r}C!;53!hbq4PU6b(^$S5J$HvCwPj~NadHT7-)7`vvRWj>x(94OQT=S)QiT2GGZDghdV$l(WmRmJFIsV5<7Q&=*@b_>z0*3@5vvn##f4+iAtctFB4n-0 zwal!;jo%)3jY*cxR)?YS9BGm&4jLFzMgE%Zds9|GHgwt$G;dYa(PPb(`E&Rb*J(?S z_{*t4;H1me92Saibz9)2`y4aeaEOjeuoRE9t$Nj#&&W$5r|$}8Eg86;nv zY>xf(Dh_F-t`;Xnc;xxNV!5UqHMfq0Mn~fae3Tz`4iS{D8W|NQbS!2j1 zFH<*9e-3L`+3Q8VSR14DPu+Z%TC5kTag`HZQN$w}xA&Ek)xR!ydk{s_4Go>SZMbzn zL_!NQZ`ynqXsi}XRqLZv8&^~H(aUdJUdVX!Wb3r=2iHsE=MP+Ky3f0aysXerYl3sR z5~I>gd=d9wF?6mJ6Nf#spfYIGs^W}}v(3s@?XPuWV*1IXJ)gFtnL^COB6#`zXTs8I zjVGs@;mP!J+c-_!s)%4fjqBG1$)mOSoCb_f^J1>2>yVNogSmjmRHb3NgQCNlWix+| z$sg8^D~-jq)%N)JT%m5PZNtW+B61^Nnib_);Fa7z$&cGqY6z0urs1<5oo6tjMwHBh zLT5Uxy+ebokmIfyM`}Yfy!<3ZTCpOuLq1}?{DPAe)JsR}5mWMY)a^u$BWv&snUh|Z z@w5RenTyjt7*A*MW61mAPy2v&IL4vgK6m^sl*=XlPajm@ruzpjPB@b8&6E8!ZhOcJ zGVt6uW;rN|yGwpNKIh=k;PasC_;_3KzGd&CgJ!hc@&E-d)yGrgbLMkskg>WWifmaok5-jbv%Y8R!_ZR!a*c(d+@u|ReL8tA^_wR4s z(=t^yBG?}G5mo>c1}UuHA&WLhsuxu1Sd>%h>@wQo;x$#s>+9K^UCE#8x!0)JX4ePV z1sCD*+67yq^suZo1ogxA!0I2<=;p7$hP*h#OSg2PPx(e#C$Lc>?`kEZI6BfLi$SJO zjJt)G59v2|fqDU8FJ>>^UfrMHs7BPnfo3i=cG%(VWP5TAq^)XV21 z>6;rtTl(aT+79zB=gbYc&^^nVu<_A&2Xe&RJh8r#PMxAtj2=F z)%fPs)dVAg;B8O)`^^>5hk(N#67s$PyzgN`w1&2-_?TSfoYwM!!g0AwmnPyNNUxEU zGdjQ_KG?fTY-8g)N^Je5hwqTkvrHD?oUyNz02zSybtl5ozu44-iuMSpv>)lG6f4(H zCxLhD77fEc*}vi;X4N!6E_&~1A$gs;Yve||em_#RSFR3h6Yjd>=9CVaFK(>8{5wA! zKjaDD@8=nN@71b1-k<$AYee6E82M|*{myK$S7Dr2yYe_m6F!X?Kf@arw%S@Lck%O4NiFh)aHP0`uSyE} zzVqrj;R7Q2We;?xXxoRme}!`>0F&jz_Z z+4Z{~oZjNYI|?`TY}u5vk?2_o@$&Ar9*%ca``_lrUe4KB*;HE_UKP0{o(jKXVAguS zm??JaKRl`d!4R~I&*nvK@8{E#-!86{)6FQCt^CTf1*~W8O+HjJB&3v1?$@@eqAvj# zm4QzUQCjSS%UFWQaZ+DoVm5ZeGN8b*u$csVpm6H0!J?!S61UqK9D1U)Ta4)gZU=`a zU77grJahcUgf4TnJ1nQvW4@gP6LXRM6#^`!0#*5iP7e2R2vXlZDxQ&NV}PPF(@o>dC__(T55|`~t+14O#brDBA9x;pGlC zIt@R7J(&;skAWXW9<%BL2Mzt0YUGt3VXHfjly1aZ{T4%F3{r4IP9N?n$87sn305g* z7{!MgE!4V!RL?bXL?rn!f&C2#-is$IQOC-DS+C0ASC!-G1LXpdJ-XDFi0=5hTEAkP zv}<{5TvjT1XZ;GuxUBxJsX~b71ikv8*)L#9xQV1jh`h9Y+0p zbeVCkVQ>QJ4%^~<31eR8ncCGnt2xE^h0nnDQmmJWO-5eB|}CM)u=DRO#rwrrNJlnQ3GDQG*v`~ES$4E(xmH{pAOhk;6c~hK@@-&o1!BJ87^L2x~HgP5i7gB1bZi0mJ$O00#VQur~ zfSb@JH+r2cP*wTMj95A|U)|XFO)E;A=)!tYHa4t`*Tv&{yordhX1yQmk|wAt5%I)d zBNU>~X;9)}m=@!^yXE2bz)a03_R(GxS~8NNg@;U2MYQs82FveyQQ|F1oD#;+u#KBA z{v3$%W5!b|uu(*W3r*+MV&dlKlDiFqC__R}Vrk6=VpQPNtRU>V_{t^UiL*!jZ!czE zq|C;eS?-P3oeq1I-=8E-Ccu4S=bMN-NSK^es(qNGzG-kr5pu3dAP&XG3D%?F$ly(j zMqvh1=gQC0P+_~*_I#=DUshU>3ya;z>AIE2q7Xlk#yW~gzl26iMH!1Y|86GU+i>iS* zEN(?Bh%T-N$i;cDD8GDF`|8Xd?hKb$mrDJLDU4l=mf|JCiZNuP@~W92WyFODR1NPD zn7gh0EiRuXSd+h(mvd4M@1Cr`(<~?2;(QA*I4(8L*1O#G-@OI zDRO9j@%?0Hyb(7=Rmp^duPm~ZB^O*!*FUID4S!fc!V^pU0dY|%+!^y&JGCszlWIu2 zUh3S%QDHY?`YuOPV<_fF$wP_^dJsDNE=kmsd zBmG7+@)hcp)4s7n$s>+9{)o(NZfRrZEFbt^=*&#zyT8;qZmsoD5o9%3)&*@6r^ zm%bGXWtg1My4uO;e(bc1|6!wEj?}Bpzk&3b{9y3HGWM%}Q=yo}*OM1qdkZ`CPkW*O%DR3o%(%G_(6lTi~BdHmdw zjg=M}x*YEEBctv534OShxFd$-QFhG91mK3co1$-;JgM~o967uFLhrH;@SPz6OY&P` zr_AG;t`E5xE5Na@ltOWsJ?hN4)Rhp@VN;T8q1}S4+=8iguB9Lf|4}aewQu~e6tFKN zoYkbO*60LG(^0#$>IkX*6b5X2J&C`c86sp2at_%`-{AG|Q20|F=4knSURw^mWbvvz zR#uoU3d3Gie&Go)iXj?X$DUUzXOS&AI(Y5*y;;J$Iq9&gL7}sOBHrN$kWJ!3phqwv6>k} zay@rFY5dW+v!nd=>zH+E9`7C8>Z=W)}?i- zd;v3|?}A;cuAVvPhv+fq)dJdjD$9IfM+pYV^=bGypcaT7$$t`|JF@zEJzdHWM^gX z>vwzR5s>Bk=~F$pRddetz{x{!?!=O-|ny^XIrdD z2V@1aoNqLZ<`NW#H<+*9@NUDc$#j{E-$+Iek;fj6izKqU{`ox@mtp@Y{%wRL` z`oPTn8N)gHpf`d!JFi=6m&Uukv#8}xTxAj~k4Gs^#v_jt-^JS980*?D>Xp7`MVN8&@n+X(`cc64x)gt&rT ztexB(p_w1(p|Ytx1Sl@{v@t1Kn?YPm-K-rQ)FCd`7RXPjAxl>PC$s^j6c=~&(r4k~ z^wXGK3qV&`N3 zf5=@KCJj4u1E9GpzZb)9nAb_X7=v{ww6&X3a{A>c;~5#g z2n27yt~OOjgO%i(C}J0VHW*W0r^uTxs{5ju(-z)2C0J=o>_}p@9tgf5AX-d-FuZBd z$qLEa`kvp9BKUf<*Y>#2XzTmNeBYN=zRMTS=K<=kG}R=?Zw1{C8;JoFb9EE7c3my8 zZ$&z9e~ND7So+~|*`ynkb$DJN7b;H4zjZf6`fy{+!Zh#lW5#l)h%;uAlVw(OGux|P z!U%MfG}p6;>mrt2Hb6XUKA=CVnn${WvMI!|g)zeQi;Cl1urBS4ZZt1(Y{YUib4+Ne z=&+%GXZio4Gx? zj)sdnJUN4+>7Q*77cx-W+T6{`Ri6jyINVUXaA#VCWXl50t0r7G({3ri} z5Ksf+(6f|uM zlz>VMozfNXTj;-DKb-#%P#M7Sw+t|d>$hzG_bK4_nVK3}hPQ`1dMZ4EZl2IT&jvSH z&xW9nAppi4O8Kb#pdfnW*CEj%h>Y(L8R<;`@cHx~yxTp0p4?+xJld@y>_5cWjyBE; zRojxW!dxwP`*nj&`C+=KzMK7wJ@?8nH1Gxu2f@4xLL~@}ZE&Mtn4iG_LC1j~#vV;p^V6MRt~AjvP#y%j1EWc96Oe>Y zI!&G5bYebw%Co3(=P z^oDtP3WF+hcj>pu2cyA_aW?fh1GoPye-mchbEiF1p~05##4< zqOdrk&rv&`-F5m?kbRNl(3I{-81)mq>n+|<9tpAaLj9Is%_mh}BEiL@G8URg2^Vu*f1zhM!SL3?Q-x zj{sZ;u<9c000INg_~H1(SQD8x(Y?hZDI5pze?`I;0O;Y6Ln-rR>4~DlzZbyKBk{wQ zh|3hX(PIxqxD;%t3cJBFhDGO3se&|+QzdX!aW$ULh@GoGcY9_NqL&{tPV}U zkB`J&H|NC_Mz-wwb`0XhU=32~DqA=Ef?6F^xvuwx%pr()-QtSU52+2+vrBv3=nFYn zkYi`}F`^*yYGnU9(iKP$O(fKEJ?+@m3o`%#*v)h-bH#Co`+)A)wRnu)f^tM<0*4$d zwT4LzM;h|1Gt5NF3GfB81@T!JTS!Ers4Rs!CNd%6POV_jY z*G^(zs9IhMBL+&oq{P7tel6WYfrTma()u;3BpxMxQY5`74#g-y9y9edQID?V^Fqvt z5Gx5c06(TSrvK~x*IBPAdJxTUPGC+DPY6J9UJ>aME#l0SD-alfONSQ~zB+klu0h0z zofyV)o!&H`22};_Ong&FQ*=`VktkZhVg6wOSs_`Gg=+mf?RQesST--t zkH! zuxCm4xZ#)YMJeS*r{ci_prnNj3E$2+81xuUz<>?QVaYL zGY|Pmf!4c7H0C;u;5UlzZBmZ+AnmzW*MF3}r8ZbNS4Z^IXR zySAP0VP-k}#VtK5>qMoN=XHI#jGftUe_5u$?gP zE`cthbtco~<48nIkR(=8@PPCt4Kg;(YZpn}LcEDYE9H+g{Fp+o1A1PX;edkErAKJD zu~jgKqdxVV_Go>_H3K>a@rt*oWK#=MwNbXwRAbFWW%YAv_rKPr=qT18O-+-iCikt_Z_-+guRMNQ)`rSGsX7T>uBbS&*m@FWlJdv% z9~?grtRqpAK<4ZjQ6pm8bW;P9<}`J*--7I#w+=kpWOHG@M&wJ*mSLq#OR^or zAF$tKbrJDS6qFB;%%y0jZl|ev)BdJmpc#icT(m%Kp1uVKGa5%KsZea9Ed7-o!zd0= z9)>0xOGd81{M9dpGKSO?A9;?F&`Alx{8-gK1{HeO6saMEA^aiQEg)$Kx{3@mK)z9e zU65R=UN}|Ekzb!*U*=kJT7Xg-Q>aj=P$nR)EvYT7Ei)(U8Fk3GjMjwNgy5#`=IR#X z#^ko|rtj8#M(;x@L?wjK3e!r^O47>M%G!$5iq?wViti8bhx3Q^C-P^#m%CBB(Yukp zQ8?57fv_vH+yA5Nhw*rOiE+_t{-e|j4dzsFNWm{vsds7`G!=L=uWh7+B+v72)Vs@1 zra5iUKPBqPzc1ldTPzov&YEtXmYFV_)}E%>quaySBZr539t_<=#e&4L#d5`RI)gex zI)giNIuj^jDdH(oDIyCJ3sMT=3!(}_3)0r{y6L;&x?Lk)L|~xqqw1sH#u7*)Q=w1+ zi|C6`i;#<0ix`SnikONpz=&XGFfy1MOaW#9vw*R|2w?IJzy`qvxKFo_C$=(nl^~Eq zOwo&$5gQ>HCK)anHkme=A{jRsI~ia7r8cEn($Yh-d{XoP>Hbi`w%dZcZ{bmU}2CL;lqXI5?QVB!F78xrjB zOY;u7<~yZ4Wjy8CCdGq8NeWL2kC953%9IM1O6!g7&F#(U&7%pSNumj*$ty`Od0P^- z#XijS*7Pj`^AvN9&RgBK4|esiS|m)VsiB&Qn$emWnvpXpGx0MyGkIJgTuEHX=Ww=2 zwk)xWj0+m6qjpS$Bt=xRCYPwQf8hwE+Yh-()=G}qnMMb^{R zS=Qy503jm7D_twS!X3gx!h^#75ApXLx3ssy2=Sr8p)nF+5)l${-9g>a-5K4HRIyaK zR5?_6MPWtpMd3v;;2?0y2B;q;o+chhvY7pnE;u!7g^>0oEEd?yeckD+av&VSx`Q~^7T@Ia18{ZjZ8&De58SEKE7;qa@8w40684MbH zGmtV!HP~t7|5EL|yzSF-p1sd9M^~-n&=I#|@qK$8Z$oQs_B;31=bej9^zZRo${VB5 z`(^cp!`9(;Td#kDZ>DdqZ}_dir7;;QMrvfTOqfi(OlW_4e|&!qT_RlsUFr_63P^=i z1x1Btj3xJ0a7wVoTgyA(_~(h4H=$Rfw{;70%R-Aq3wn!D3q{LG3t`KPmaP`FmV=f# zFBva7uN|*6uQ~6)Q;KVPzn^~Me(ZjGe$sxQ{SN(t{Xl*)*T1giulKGOu6(W|uE|cm z{%YO0?YfL1A5>Yl+`<2zwm!GUv)!<9xrwozu%WU8Tm$Ynt{tr%?X-74ra#g=B0MOlK(lsF~2B3!6eM2&ZK&cuv4#7tCOsgqBAn2K13m8KSV&>ilX9W!%HRESF+jz zY6DUO#`N;^n)E8~Glx|@PceNBPus_T;L zBESfij;noZzwA$r71lqb15k(ktn}WmPy)3>Ph-Y zMo5uGwY>BO{!%j6#sLm7GXfY{Dp+kO_W(XRoCch_XX#e8ySft!g>#Ze(aKS6c!t=^ zg`J$paiO&G>iIt=nXRiois~`5glfrF7IIBBSxbDB`N+56yTG%MOCT1jsi>-`rKm8X zI3hQq!mP)v^fOE+T&GAUZJwppxmLFpQtN2u``ybK&soe_*O|rH)!EWH@Py#R?L_>9 z@r3ro;e_Hu<~y)Q5cL2S305IiG?ow6Ec9!w6AKCJ8&(<3sJy|O-86#K^tU=G zSCq9>QAy@0o5?9}rxQ6--BQkyW8Vrg>H>6WzA<}i{`$OGyva1kHORtHjMIwKkP?n%tyZa~KZ$7VW9?v#WX)v_{vk0bJo#!;rUa$9 zvRJ>^w^*jQtk_r~@XZ5D=v&j2^|Yk76DgoH=Om<*sN}&nd@Qb7rH0#0BSIOL>UF6AnQQD+vTM0APVbV0F7^Jsm(q(953TGsJHKbjkYNNHM z*;L6{bf#r3VQrjIHB!P{P*rVL@w1BLU3PwUr9p+CSfl>+=U*wm3V${Isz!blE+M5T zrP(XlD@P+rV?-lUBDvKfpEhKl%8)9R>Yi$-S;!b!@uecOqP(KsCf}xT4>ZF*BRxYl zBTz9_;i%bO{){V{C?PW?voy0Lvsv$*Ui1ReC!0?(HupA{GZr&i)1Rj&_7wNT_mKDC z_b&F@_CR~OdzRD0(*iS*GiB5GGwl^e)`?tSxTv`}xtO_@xVkvixR5z*I9aWi#tG)U z%1>TxG|(z1DNt#q2ZU9}y({TWX>MoK|aL3dbZSdK}R z>DB%ht#GZFS+4iKdi8otdW#DL_IUP~_7HopJ>7xm0ri34f&78iLBPT70`TM9$H4tH zw|Tc7w`sQ#Hv@NZcQrR;Hyif_cZWU2-Im$w!um3W*{#{s*_qi-vqDwGC0XUZ(w8j` z4Tq%5gv*fSJRf=?S|L**8X>R{T`QoKy4A1Mrj@((S!<-vsgLoQ$NAum;LYR>|IO!{ zl^f8_x0{lixSQ4+@J;;<(@i_76Z}v3@9=x@c<^NKnD9T~%g~5Wo6w|DEl}U0aidK{ zFh>+dR76-rI7FC6SVu@joJZtDG@zBCrJ)8A6~yYqO2%o#jl?d+cEwJ{Dih{#?6DJp zGC_LW3+#Oa^+Z##GjS}jLa|h_9I;~rc|3<8GOhrJ7Mr_8IL9t)O6x*%CwAD$DQClw-gLxi`NzrB@&(@2!;rw}P{Rl7g**Bh-`z!^p!iZ+TegSXOlc z8pQQ#btbB&-!s2&sWPi_sTz9Es)wedq{E`;-yp5Wp+8kM^ZtDmhJL@!j-IwogYN3b z+>Zga_ucUJ-8NJXMb=>O1{mAXb*XfrwA*69V!`5@#Uh^Nur-r?rR}~6u~n9xzLmb? zBE)yjW7gyAMGa|h{Hk*2==|sm>k?~s^?dbU_3B~4a>z1|Z?RCl&{w1Gm4%gl;T7Sr zw!VkDd;8n63#JPJbZP`61aAZ?1pG+0NViC{NZQDN$VhY)^i_-k^jb13kz)6Oi^Ru* zVu=@#qhx#hoIX~E<7E=Q@ftD|GPp9XGQfU2Me@FElDN);ildbzgm(eQSv^5IPvTUGT$n zfBne0r)XfKMJBIbB0(yFJXc3WS%pyLxeET6(-`xZ%NX|<#Telj?pV8#n~}It1@smv zY!qem!f2<(r-jU`_2MRgz(0NC!Nu9ynYUfD<#SMDt zleMXiPaXGwK0qztAYVd^LX1#sODrwK0YM1kh6EL%9!>#%A3*>?25tvV`-L$Q8Cn2t z3uXxVI%*OM2x7-4Fw#AR}D` ze+AQ?(X{yVn3XtZ;79q7?rx%PHg0lm`e#mOc4t1Q>=9JbtbNpdOtci+=8E8vxO6&D zCFr(xq!z38rnb6vqSm|iyw<-KyLPx%!1>I%(0Rc*-g(`b&zZ}4*%{;eN{@Yi@<#Xu z*9Pf#iH-X2mEQ$6j(UMTjx>o_aM;t>Ke5EI*s&S0tErb1bY$lhscDgEE9e$zBB(8B zwP-zPR}>6nm*gpFkZ7!_`KZHbAvBtF?leNOKFMcH8E=^1RHgn*CrQfAGDurWzN7bM zxE{ldpfZy=i~m9vCHy9^cwP1}shp~k0T50j1ub~!Wul=dp)cWsux)t{`7ko3GHE`2 z(TQ-ed}97Y<<99Yd~QhuQeso0`bxpV%tH5``#qgK<$?dg;j7Z3H!^4mJ|uA>KEJG{ zCOV785~|CJrW&VGr`}IJpPHItniQBiDr=W}%n{2I8y^4a^!4>1Z;~?w7_ed8x1(rI zt6geVs%ar=;bS3XVYz)j(vZ`rptcTW0Pg zr@hgB>&bUyw35}5;}_u<+-XdAfcn3pd=;UWGQ)*lsyU8Zax2y!ud z=&^WKBKlFZrsF-J4f=%r$)tEqv(tj2QT3y0&G>t^wqp%b+jZG>zx8doA-M&)*5pGv zY~@^G?~~1(14qDQ0u%KV<;fGLOZUL%5W zgGC)Epn<39QCL-O_vI(#n+0EqLDkZSrOG>5Z(VQ8Yp?5DWYx}B1v;kcn0n4tg#~8$ z-@SVqfnAZ8F}h|JdgP{~j$<7dWj#{G)Kilquy&F7k9m2H~Xl~NVw6+)HC zl?s(L8VTC%HP5)RxrVqrxYoJ6x#GAaxPZ3Ywm)oXY~pPFZ4+j-W-exxX96m^r%&dr zXB($`XL72WHF&fxs~s+x7j?tyxh*Be$V7Y}ey)8>v*g+Tu}`z7wCBG+wQsw3yH~b9 zH?}^Clrtr?6Ttez;78-AE5_J+P5_Y?(KHbWu^CY`(E?E)(F*s^TuXr(uayhGtLwW{ zr5(q-K>vr!7o{k5_{=~!VXD^`9hSFKS?)U)PG zn~Nzc8dKA&XCJMLGUgR+x$Q0-gcl5!m*xbg?mmT&Q!bc|yL_Q}Us6$018Lf_Wh&78 zIFmoq%=P|L)2ABu((`?(?@|j=-7&)W2}V_o^MNNndL8XIxof#MxZ6L+es20qY+r9L z!a8C2rp4TY;aKgW#5K*q(8t@lIj7dJbh%Eb_}2YzS+R^Cq63XSTGnU6kaFtL*a}ULK7dI_{G11uj4C@GrWj?#It< zRO1^TwFrCP52JOVEud`>KjU$6$ZiMuflvAlVfup>f=)%dM89>+1k*hy<{9Lnam-vr zZOrhjzuw;cTB3i2-jl&X3R68`lpO6&r9J7FXjN{kJ`K4*DDBWMrwZsJ5vn zsIIBrQ(a06FNrNs9s!7X`b<{S}vR-n)7RlwRk z_%iD#VI}Yp=3MkRxbt~C!Z!?7!7rz4%e;c4BGm$iLb-e^Y{G&M+Z0{{x2ih@bKPf= z=NOS>tinryJ*Tp_NjpVF38TV9ejksm97BA?ZM$5*@AiC)>Wm7HYL&~CJCLgxJR7_j zEEwz^yzg&Ga3>=V$UeDR+E`3jRTfkCQ}n$`iGQX8r$Vxr*_MFRhsig1@Et8 zPcpu9^jk)aG3K6&_@3D9@ZGwe?;MUz<>vVdpZPiCjr{t$_w{CUdxZOI{MYIc(;X{U z3)ceIa92{-WLIP3@zz9t(@V+=sUI;b_+!R*K7$u(2PKQc(`lB*=&gRI$UeP~Lu;vz zmEamZVI_-sY=Yv|Hjho?OStRnC^i)9rixGyRDM3&-x2TtDBfgX9L0 z6;P{sz|OD3f#wCz(4!URPK6K!X$3I_y0;5yv}xRL%b^eY@xyY%Gs8S-_$v}eFII@# zF#Pf!A0}_3E{VzCk>d#S2FO0Bp03YE_TqX;IrUo6c$JuxfFU+pw~QMVzbb}phW5VA ztk0BiO=s?Ae$%^JDF1Z$smykf{F|_i-`QRIb?wQ*7SyeOc6KW=1-pUIq0gGrJd2LbtBmbve)?DF{`hwezl6?wCoZdx z#@ZflI!`2SBgYb!$ngUOZ)#7H*UKhV=JKpXhyzyc8=K_jQ&v7MD$Z9ew0yd@Kxw3@ zgS6o`YPQTYoSWHtW;rgJ2v{{cHM{y+_}=381K%49x5wYx+;Uy-FFKa4R$QM`X1lk`0gJq8cws? z&$;keH27Wi9d?nmK*OAE_>J04bL~9s7|>g%(q;E$%;oKc@xjsK?NN^y#e=s&`il;V z7cenMFclTgUZTSUeSZjqVZ#?${htbHo@y(eYC?E9dH(@I2R&7T{Iya<#of)x-A(Ry z-3GMCLq1_J(Z0$R~wZv9k?C1(yT?69`5hPcqXLTf$%a#9l9AU)^?j}S@`6pv1a|_caiT}de)Y{tqH}|J=KnGW#wfS$6pbY<@=7(0>xCwIz0{?ml zo$v4G!Hy0VjxP454q%9|_}|I?+WRCO@JXD%i1+_OI4;s?Qh~h>G&m} zuaAJ%7HT*;+6imBSVLbhAqf1h{l69Y_fCi;^!*dU>}>38EFcgIh*tx|#m~;h&&kWg z2I6OfuH(N@{x2)}L)lP&g8Cou?+*B+@&7Zn|19MHZ+85TMgC)vf0G9O<68eF#s14| z|6`GVS#|%Wf&a0{ze%zGGTZ-Hn-u#m zv;9BEBL9e=LGg@2l%7w~vHvMz_!pq23KTQv4Gmx+16A!z!4P|B@DC_%=Loj_E9&fT z5yrov&i(+8N!dZ{p|DCQl+4t@+|S$s1FAXNL6l5?1K<8!22d3WB>i*pzc-$=0aVccy}W;(^f#kFCslU0chyH`1Ly<( zCY|A50;B&mBKw5hb4C7dfIVd3-w}Jz$m-uAd(c^d3fAVX`hRHl=}^Ma{Rueumnqo) zE2vTe8h5pGw1lo4I@ccvL1dtc6U0Fb{1oif|7|4zP)An};NWg&_s2grDDQt(-QO*s zXzE}|4{>18RA+$7Y3gDt2MtU@-QuYUG5-@D2%X_C!Dd(e-?3;lL+HBx2m}K_zj3#J zmHRJn$v;DSg3?mb(rj#8JZ#V^CpI>2KIonkx`)CtdDx-G_*)?IPdZMCKWMn2w7==0 zbez!f+}wYDpyRnYpdV;Il!T1~%9mXVx@Uv(`aLC79Z#V_dP!(Fkev+#0`Y>l*x7!^ z1?kw>=otPYu9pkM0%~S<4rC~&KQ{n37Z(Q?zyk1R9}gD?)ZkAWz~TQY+=_7iIzwS=we$U&?!S+P!s`D6h(o=UGn;zf@Ux^bR~{QiN51tfzOz8%?u_+(c;KQ zhSD}T@LJ)eXGF>TK0gtN`{OwrawtjqNr zu=Ty=!&Y~Iu{Jf(7qWchg&`<|%RL{t1SLLk>*XB}8ke?Pd;Gj%M2wgYKkM)?7506b z)Cc-o(|s#&`Rh0lg~C4S#JPxi2#hpVu65C6obhw`Ur2K@=FM_GeSBSizlWDuDCw-V kPGfa=d?`gW`dlS1xI->&7F}1o7(8T1_&&7T@9RN#cXxA{fdBvi literal 0 HcmV?d00001 diff --git a/src/paperless_tesseract/tests/samples/simple.png b/src/paperless_tesseract/tests/samples/simple.png new file mode 100644 index 0000000000000000000000000000000000000000..a3a76840103812c371dd78f2e8bc704ea3eec1ba GIT binary patch literal 7913 zcmd^k_al{m{5A=ttdc?qk!-Ru%a%e&PWC2yk7I^1LXzzyA!P5pl3fZN9eZVd zU+43E{)y*#{BSt8`@Y}z{eEAs>vdh%J3>=knVgKCjDUcETvbIui-6z^G5j5Wjs!lL zq@Mf2ud{A)sygT3$M2j)IDDsbSJZRYcCvE!GIg~iuy%BEu;g_!ceS*1bhB}CUn6Rf zflHC93ioxq({U3>zlYW*r0W_S5x?K0-C$?sVP$=B?z~5SZy2pZ9)|N-lDk@;Zi7uh z;=~e{SySnlr|r`4XY}wVUd$&&d8$7Fb~=Cf%v|F_EnHLP_g?k$JJJl|5)x)}-%<#IA|uI;P7c?SW&9IV- z*E}MgNqTj4bz!80+1}p1kdcf!rS;=0N*HjU3F%eoFivY^WaRAP5?oPnX9b7L)Z{d= zvx|E7j%MX|opx&H!~|nW(=Pq5=OSF(+>a56h{L_LnUmu~m`AyQ0Gaaga_4`WmTCbs z1eBg}x_ifalloF5+QNzA5#inSJgGE|+k4{g!kU>iT zql}D473jWD7& zWn`G9r>9HH%I=pKHLYiy9(DZ@O#WE*b&Tro+??4Owy8Gow7E(?zd|jEk3~cTjWMM{ zDEj(J^rIxLaRT&=QLDFYCY$}6Dho6@r$OTKU4ub+qj4?9R^4d0XYa$+OLF^XdTPkLp|gde5uq z@&EGlOk|%@S6Aoc}4FjAAguYKJ@bi*&O-e*CCAuamFC1<#c- z>!z!#`?0aH5la%v&CMMmml=dYEv9|m-}kQAuIfUw-Tj!!@&r{-TYGKNZ;RX$hY17N z{ciNR41?m=V__hx%%trFeV6!tm+VPq%}|xoBvbq?4N>va-rfiHGr2$tEgF+y0XLPM=gTX^5Tw{`9Nz_zNS2h!9%dXDYKCk5Z>lUfBGy zXzg5e7C}L3*^_-XK))t%LzrB!ZqYN-)*!-bvH?=>k*jyJ%6?haJ=M`M0X&wAM@2<# zjI^9Q(JgVC@86$cKD801)dsY(;%{@bP6U%3+}J3UhknjULndrDe34PYJ*cHc7Anqe zVt1m!tIV{W?p^W6h>E-o$;CnrjNe*SIT_|>?$xMXn`R>1Ldw5N-Upr{k} zHFMw*wXLmfcTdlKAD_DftsNc2j*gD&TU+u;e1=nNYq7Pp;`o8OVi@JMwBD*C$->U8 zwVXEhFT+*k1^J(&A|vHm0*01Ba!o65}y_#!sx7xMS7g+a3)-IXg> zY|x*tz^V&4VWm)RrU9qNUMr(zL_mQm8XB*(@sJ*A5htvHg{5W0;!tkXr$=eMb6LkWBgLp%-<{7`QGnzl zYFb(j0|Nt#p^Nmw-6lL^V`HpRQU;&b78W?`J(f90D6Vq?1BVqA6>YySDRDx{c$r65 z^#070w{(G3FbE9|4F^t%1XAe3BB#TyDnPeE%_Y5m&xfgTaw=ukk%qf(S7&PFD;OB0 zYjV5;!ubsAyD^k!d2<8$TDC2KLm`$)`fCs&k-{qqMhlz| za2$Wj(cs@&z_I!C>@54E5>3v~mX?-l;?5ryhH_)=o1er+UF9cZ2^II-FTE!6{N4KQ zZeo*@on3*K5h=xWD)X)w^uH5=@C5Fsy-S~UxHdNX_@f*r>U$0@6O;v>29){kJkL5t zx3%5JcanEWNl`y!4j+bwh0)V*~3HRS^+$%CRc)lgGPpoA+(0C*(WWqzS{LY z=F4Rd$e@vhH8nNCsaK#`uySxHJxmjQ^e|Oeb$bb|mb)c&xKcqfUR+p6LqS2o;b7m_ z+pFO0T&m+Wn61jn&#$4YoUaqvax}|tH8~0-S@8D7!h+2PJZ>Zq&zQAdrA^$*VTx>E z0BX()hY^WOPUgIH`Ep08X}g63KcZa0mtRouu|2jCVSTW<=?XApc{Bra{PykJ&}T)7 zOW8R&J-RP|rmg^!j@T9y_Z@e~v1t`TJJgc=o+g&$AM^FALR@;fLGCO^C>6BEcf2KW zY|4?qAB#n4C=~Db4SF>)Y`~!d!mQHopX*~Cotp*mad9k4u}r+^in225jg1YG@h1gn zGiW5`wnl-+9xhvRk5Kiw>ZnU4EhqKrTAGGCGTPc&niD@{PwfS6=Q$o9?3CB{_xBI( z(b3S{e0hhCx#|LxnM2rr_>jhKq&NXhPEL+qJcU9QJTR2*XDY}h2+|p|L@ZrW|&@$!|pdu<9?HSb4r?leb_NFrEgXKJB%kKS+)YC6T(K?CEqWag zgE;`|wal>o^EE;#S5V~kXuU^4A@1~KuZ3hBo+81Y^l%qfQ0D08*I4Lat{O@uhvClT zza9Sib>S1OQtKZ~@$oNUIS~_Hlm1UFtgM!b8XU$&QRMz0$M$EFj1?Rmi`9>hj{F2Y zSM(6aK|w+E=8w4&K7yRnnxJM=>b^9&r1bh4i<*klWOGrCk;8DG-Ed(X+QZqIDl9C_ zcX@7pzHcdpL9DpbQ?*5T?X@7=#c024moKwP4)cY!X3D?%CNv4_q2$X%O>MgVmClVZ zEAZNIOa;2iuEmk^{Rl zv9e04su6`TRY?u`z_c54X-*Gj6M6MnSyF!uf_1wyM*UOba2GXCSO08;dl%2TqB9oM!Zn4Bb zPVwi-w6;;iUl$jb^5{@@#GU@TZgVCbkyl8Tr8d4^E-%j^E&N>Wvt>0_R2Pk^-2)t` z{kT9zw%*6z(v5ZEIX3T0n7U$^~ z{pG|#=z^zxd}1OyBo5^4w*@$WlXRP1W?r@(d`}gThoZF92A12_zG`{%x=<;(*ia~L;WqNWaRImS7`(g^1_SM47h z^uFyu?yb_Hl#qtGO?zbC&N~V_3g7i)od&Zb>|;CgwR691GVJW^BqT70nRYX9aS%VN zESW&paBrIG{3zx(s10(N>Fhai5uu4@SGSY&TuI*kjYgxVSSas#QNG$WlxApmEh#K~ z$9^X!CgvOkg{c0aa?JI(_mbk`3-;%p4>lq9=*J0zpon`mX_YYBG=j` zCML2EhRy8kxW{vxdVST94{7~}Y;KhV?zc0)uGViVNlHjau}pT*He=s1(K2Dd5OL!` zxC8_QB;T>7=zlM5my_oXcC6_3ty{|UvIIfy567u+hAS9`qt4RbcZ7s2PJc>xuGmxu z7edctKUBt4JLtzNGow$E431+y5I7(0yCCX7D|&=)Z@5l>Ax4Z{_earJcR8-J+R;h- z3Y(dkt)rhBtCn1$p#cap0bv6bl5(i0heuq#MSncXbG7m<*qFHz9w7YvZ%xJpVEp}0j{b>IlM%j#XG^ zfgqCbc%Y_MX4XLl0!?L})b@5_jCPT(`L|?2{3E%#xj|zJC%!;C1zk%sU#GjjU&+df z11vzU7bik(sM$b3K!A#vn7GP!L6WXGb1(L@X;8s0D=A;|C11YSU4dTo_)v`Yp_}IS z%A^UX1-pq6c{#a48foc^9K^am>)fnCgk2JI{N4Iq8o9nyP_lG{ybB`T>U*kX&N=2% zopfkr9ndOd^BaNld5jvb1qTO{Z_cS<8mbG7Q*9{7s{(}`Fgm)T^H{Q{PoE;{s*$0U z=J~*Bp5wz0su{8lR+oRR3pa|ank!^#nygQ?si>&50+$E|9C?o&GKku@;>K%=XeLez z*2D`YZ|1;&R!!fEWNb1b-lwJM^KaSO9Lpan38OVG7INd#V$pMdD+dR)5SIoD5qHLN z3NQ}FKqV|$en)zrOSIeDvOfglJ^Fk`V9etzCDjt}yVs?>e?~rD6!J=+ot-VOSY~K1 zR`T-l8dLRrkfSx9Tw5m+dXgz^^GHeQGj-#YW)8Ot|w)RGEm&Lha;&s@0czVJqsA!bRw>`cA2<%pPCR|;qx{meAi z9&Py#P1E_yopplSl=v}U9)eY~VsxxNXnm59kRXxNj#fB)+YV|3F$NOF6L$pq$pu=q zgu+44ZY{y{s``EhQDMxn6l4Jp;@;#4+%A0s%(T>$R4acY`0({{4BZv))yVxyZ^`MOXGB_*8c8Wd_#+Fy?O)z#wY( zEx6ykdsp=b`gzqrW*9@(n>SQrMrDfq$W#l=--x3}F-f8trQt0b7?pWvLV}}3jlj3O zNgA@pG?V^&hNwSirW~QWck9$~A_|_Kp2-5HFLY?-OrXHYJ=+oElP(EjSZ^YLl(LG- zIWRddfnA4{y`#^-GD#qJc>+%US%-Ll*v*-5YinEI+bdZsb=MP8he91&#TFLw@EA86 zDO2&((Q)YpcFW&?_IhF>X*K5#C>t8PDfwz>uB{KJH6)md<=T;-0#CLwM21W9@>>7)#OIK|3ki{r zM(X?d`~0+R(;o!`jltG6ThYo!*z7jt}R4WbjKO zw!o%c6Sdb1kun|X=}{FGM}ts$laOHPoSG4EaK7MaNn$UwN})xH)Y#7?cRpr5$oT?R z=S@n=ZDdn+_E``VkRu(-PrOq&6wfG|38>0994YQ@j9s5zPDB^uFmJI8;B1!Ar+3hW(A9kjfHoWWs0c2N22*86N=Zri z?%lf^w{DS+6dTGbDUob$ZMm%ezT4!IY+k3VpwPXlE@s(Fb6wK&BBWZJJUj}(ir^Rp zUH^8!dZvr#EO-G376_->9#lf336G5Y0^Ski7(S@5u(qC=Y6}7X9*^r3*+!Am37rM0 z{G||4aU--f{jQty&e9WnuNahdV>wH4T|B5}GC;q+sxBmi2>fj*G)(m9E`w8JFjR)g zY;OW6#J>_=s|GxV^)#T_6`h?0z_yy0n}@u3aprdu@@h&-in#0Fb8_B?fo;CxF_IG}MrZbwM*6&|=zh%yxL5E?d5aY%Bs= zS0wb)k-IN#hd$B0HidB|hJd@ny|6yY;*ydKxE6xYnwpw@$gpyT@YV=yzyh|Zp-o)W z@wZ;CB@`zwH65Mp_Li@&ZzNb9L7Sg9#+Ds0mDKRYe}Fc9$0y6oUqSv9k9S)GAeE-q z*7(58KPQ3YKrEUEB&W_09;h7ut?u)DeM;ppb4mfO+{@Pl>K^DBd84gp0 zzgzhTxVi;QRY9__?OO<R46EGKlI<6Z3YFODRN&=D2&7tw0=L`{%Ws48Nm*HVuh;Abg6;_L1 zeNK*ipu<+so4dKY(_G~@dbgDO>C@6LI@y56p5w*G=4feR5JLFPn;myXNl)SZX1UCJ z5a6`U{KkwJ)YuM@!2AgwtmUi5bHJX|y?gh92E+>}8{<<`BjGWpfxg5&mS`X+3Wc~I z&kJ~w3@1X)gU2@zu94@Zp>`V{7u~U17d9yGGM5=8s7?~FL%<>Ugb`?#ih+R}rBeeT z0&02GPLiis2Jo>km~#Q2BSL+ucr9Hxf4&X!i5;xt$nY>Nv)?)))_aB=pT8n^SK6Ff zj$h)_OC7G-x;oV^E;is1AgiYK=2W*jN!SJX@m^Z5(Ob}S_4oN4tq71tnGj<@F4u*( zdaFEo`bGLRoscD$d94}wf#y_s^r!^y+GHMH1<~=n(B_ zdJ(dLfdROl6!u@DXS!l%K-1OwABsY?oT(rdnbRA zj04&rNO7L;X9T{;g1L3}@=}ISgzEbB*8l~4jxT~uzbzzm2?$=V%#0ZJY(D64P0!7} zgdy-^*LyC-I@_{HE5FovS}vsQ-~N6p)U;=~lKze)iEdF1*lD{$V}HZ>m6g{r{s$i* zq2fT76oWs(Q`p=1_%gc@zC}rXPR=0ULx{&=+bI+t7|&4i=@+06xGxSW&WrweIfq+b zwp({hISAMTn#NDdN5v>-h$`|mJeP;b!5MVIdrjeRylBb`7upAN)F3WD^DZGFGw}3e zkg2XKE}E*5l%1Md8Vfs4ntFORXof+Nv;Su|uSDkmvpvB7-uwTZoy{X=g#M$ne+jJV Ruvbl>s;I6|ENAlKe*j2SP+b53 literal 0 HcmV?d00001 diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py new file mode 100644 index 000000000..6d4323fc2 --- /dev/null +++ b/src/paperless_tesseract/tests/test_parser.py @@ -0,0 +1,221 @@ +import os +import shutil +import tempfile +import uuid +from typing import ContextManager +from unittest import mock + +from django.test import TestCase, override_settings +from pyocr.error import TesseractError + +from documents.parsers import ParseError, run_convert +from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError + +image_to_string_calls = [] + + +class FakeTesseract(object): + + @staticmethod + def can_detect_orientation(): + return True + + @staticmethod + def detect_orientation(file_handle, lang): + raise TesseractError("arbitrary status", "message") + + @staticmethod + def get_available_languages(): + return ['eng', 'deu'] + + @staticmethod + def image_to_string(file_handle, lang): + image_to_string_calls.append((file_handle.name, lang)) + return file_handle.read() + + +class FakePyOcr(object): + + @staticmethod + def get_available_tools(): + return [FakeTesseract] + + +def fake_convert(input_file, output_file, **kwargs): + with open(input_file) as f: + lines = f.readlines() + + for i, line in enumerate(lines): + with open(output_file % i, "w") as f2: + f2.write(line.strip()) + + +def fake_unpaper(pnm): + output = pnm + ".unpaper.pnm" + shutil.copy(pnm, output) + return output + + +class FakeImageFile(ContextManager): + def __init__(self, fname): + self.fname = fname + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + def __enter__(self): + return os.path.basename(self.fname) + + +fake_image = FakeImageFile + + +@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) +@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) +@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) +@mock.patch("paperless_tesseract.parsers.Image.open", open) +class TestRasterisedDocumentParser(TestCase): + + def setUp(self): + self.scratch = tempfile.mkdtemp() + + global image_to_string_calls + + image_to_string_calls = [] + + override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() + + def tearDown(self): + shutil.rmtree(self.scratch) + + def get_input_file(self, pages): + _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) + with open(fname, "w") as f: + f.writelines([f"line {p}\n" for p in range(pages)]) + return fname + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") + def test_parse_text_simple_language_match(self): + parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") + def test_parse_text_2_pages(self): + parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0 line 1") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") + def test_parse_text_3_pages(self): + parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0 line 1 line 2") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) + def test_parse_text_lang_detect_failed(self): + parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0 line 1 line 2") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") + def test_parse_text_lang_not_installed(self): + parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0 line 1 line 2 line 3") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") + def test_parse_text_lang_mismatch(self): + parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) + text = parser.get_text() + self.assertEqual(text, "line 0 line 1 line 2") + + self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) + + @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") + def test_parse_empty_doc(self): + parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) + try: + parser.get_text() + except ParseError as e: + self.assertEqual("Empty document, nothing to do.", str(e)) + else: + self.fail("Should raise exception") + + +class TestAuxilliaryFunctions(TestCase): + + def setUp(self): + self.scratch = tempfile.mkdtemp() + + override_settings(SCRATCH_DIR=self.scratch).enable() + + def tearDown(self): + shutil.rmtree(self.scratch) + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + + def test_get_text_from_pdf(self): + text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) + + self.assertEqual(text.strip(), "This is a test document.") + + def test_get_text_from_pdf_error(self): + text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) + + self.assertEqual(text.strip(), "") + + def test_image_to_string(self): + text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) + + self.assertEqual(text, "This is a test document.") + + def test_image_to_string_language_unavailable(self): + try: + image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) + except OCRError as e: + self.assertTrue("Failed loading language" in str(e)) + else: + self.fail("Should raise exception") + + @override_settings(OCR_ALWAYS=False) + @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") + @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") + def test_is_ocred(self, m2, m): + parser = RasterisedDocumentParser("", uuid.uuid4()) + m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ + "lots of text lots of text lots of text lots of text lots of text lots of text " \ + "lots of text lots of text lots of text lots of text lots of text lots of text " + parser.get_text() + self.assertEqual(m.call_count, 2) + self.assertEqual(m2.call_count, 0) + + def test_thumbnail(self): + parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) + parser.get_thumbnail() + # dont really know how to test it, just call it and assert that it does not raise anything. + + @mock.patch("paperless_tesseract.parsers.run_convert") + def test_thumbnail_fallback(self, m): + + def call_convert(input_file, output_file, **kwargs): + if ".pdf" in input_file: + raise ParseError("Does not compute.") + else: + run_convert(input_file=input_file, output_file=output_file, **kwargs) + + m.side_effect = call_convert + + parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) + parser.get_thumbnail() + # dont really know how to test it, just call it and assert that it does not raise anything.