From 9c0c734b34881e23ffe31cfd7bc6cc1606cca400 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Wed, 14 Sep 2022 08:39:08 -0700 Subject: [PATCH] Enables some basic live testing against a tika server with actual sample documents to catch some more errors mocking won't catch --- .github/workflows/ci.yml | 15 +++- src/paperless_tika/tests/samples/sample.docx | Bin 0 -> 6183 bytes src/paperless_tika/tests/samples/sample.odt | Bin 0 -> 8271 bytes src/paperless_tika/tests/test_live_tika.py | 78 +++++++++++++++++++ 4 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 src/paperless_tika/tests/samples/sample.docx create mode 100644 src/paperless_tika/tests/samples/sample.odt create mode 100644 src/paperless_tika/tests/test_live_tika.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a199ce7ad..6b97837c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,6 +82,17 @@ jobs: matrix: python-version: ['3.8', '3.9', '3.10'] fail-fast: false + services: + tika: + image: ghcr.io/paperless-ngx/tika:latest + ports: + - "9998:9998/tcp" + gotenberg: + image: docker.io/gotenberg/gotenberg:7.4 + ports: + - "3000:3000/tcp" + env: + TIKA_LIVE: 1 steps: - name: Checkout @@ -91,7 +102,7 @@ jobs: - name: Install pipenv run: | - pipx install pipenv==2022.8.5 + pipx install pipenv pipenv --version - name: Set up Python @@ -117,7 +128,7 @@ jobs: name: Tests run: | cd src/ - pipenv run pytest + pipenv run pytest -rfEp - name: Get changed files id: changed-files-specific diff --git a/src/paperless_tika/tests/samples/sample.docx b/src/paperless_tika/tests/samples/sample.docx new file mode 100644 index 0000000000000000000000000000000000000000..be6f333136914afdafc7b7cf3a1cf3262da8af9f GIT binary patch literal 6183 zcma)g1yocE*YB&0hA=@h}C2Bd}-MM7|BkW?5c>245^?q&#KC@Dd@ zBt+l?@BhBn>;K)mzJ1o3b=Fz4&pv1G{p|gmM^o(vkOF{8CLR$g&!kd0KY9nI}t0X2SRo1rP z$xYfUaj*22!{?7XmsY*De9tD zom}r8yQaD&`rXH036{)}5Q7qp4NSLUH5lV}i)(7EU{9*HWB~B5IN}goySlZ!T3xE1(J>pH=B3FRH2YiI|piF5@ z#s-E9kqoBz?&jO|B%;TIp(^l3CaVA{<#8JAXQKn1BYF?#)a*`_xh|BRDdhIk9G{6+ zhEdZ<&0rgT^K(8R)KtU4RUw!2g<${y5?BC$=0Dbn^gnfSwQ_U2s*>x^GR3PoPVd{AP@t1&uy=IIPnc_#LJpKM)54&VsZOlINt) z;TfHGSY6`^p_OT5T&_$en`z&IO){#t@>Ct-0;0iPBe}mL@+&HI(5;k2dkb$?hYc93o-u z(n}y7FNR#LSFrc|RTBjzAK@%)d*PVX!6OzL_@4yi*f8JR6mW2z@#~!AVtB@3j7Z|W zGv2x{8giIzwrSpsl(NR~96LanRT!B*m)m_*ENme+_p+Iih8pmL9@F4Nxd+ru9zLMn zYqa%Tj)_3^2GmabEbm@&`@OC566w(p{R@UIgjQ7w+Q)}@YZ0U_oO&Y@qx66|n!i94 znTo-nmYTpat49s-z?>hz3$2r@^h>Ju6xn=-YB>>#c;+gFpEKUbP{3^Y@`rZ7ZtAD; zvFAF0)!FFM(|a&EMFW8kQ#Bfggq7QE^t~)f$oP0Qns1UDNGA-dxOB27XaEe6|AnFc!CwVh6LFewsyVkZ)-WcI4m{+lekm{#?L#t|_} zGisM>`W)-JsjyMzdQ_MI05Q(LHPwH?Z|&shru!IbZ}qdQ;(i5xCN4>6k}fWg&Z8Lv zyGx}IP5)$5@bIWsP)hR}{Bb|QPhE14!B|0DMkAwV((Y}kGEHQVpkC+PBL?akZWnrW z4E%`h$wwm9J$w@m#u-f*C+|D4O? zb;=umcy@9}#nBMa5-wzkA}Sk>As6F5mx+NIsq)q-m85YtD)rphR|G9J-LxRPedyOY zj;c$^UPZ~4ZG-EZgv&zYx-E1C)7S1KL-v+PL3hj{bkd;n^y^g*=oD>oaqGIB#`o_CcNP*D6@l(v$ zUyEsd(SIiv$X{fsZaWs3Ke;OAfx#1t*0Ai8jo}ShHYN6AleSMXk|s{6zqsj{znBzo zq@98yq5~9dc(if3P|muNCph|c|x45P~OE{XT5L=!sE;7)&c z9{#o64Ohxaimy0tp)5r@C7#5FA^vCy~uJlOYQtJ2V6b ztV#azBM*#^9B8K;=hUy^!7W5>rLY1KO@OM%S-cKD%P}dU%@Z`RDSL{ z{W|*N%jl0$gip(dCb#-TECWOG9VHGN8lJK9?9BDxeLllkI7`KA6#t6vOME19CTM=f zQ+iI`On)m-;=69?xlXI3SAhuG(Fx<`-X$L0r`8Md6F$SQ$(l=%u|`sY-}dOg9nZ@s z@^C0Ar9aoO$&a@l-%I!RcI3-o|FIG9gwtb^P>B?Q+Xv;sDX{8jjQko=6s)8RufPtU z&c8<TJA`8kMIpa0gLJ8T?(i)WdLn|f|(J)e-Moh3!HnO4A3y=+&z z!ZYKo;Eim@Nao=*AAhvz2<_MjqXkbx5;T+0Cy~zF*=sA{o#xQ$sfNb-A~U9S5Dx+2 zQ~Uq}0$F0s9Fn+|2TVhtM9(%7zCjx+|$ z)|JiLDd8_+o$ z<)Oxv{)n&GjBmC`zNPwh1F2D4ScheTDE6j_WFcFGQOzrFKcTc57_!M`nXrv-MGSfu z7*w-q7?EBiPi-3++%MhL?BPRV614HJgbJ!>6C6O=Vpi63;rIWsX`O*WHnUf$7)6`j3PtaW=oh^ z0+(-T9St2_K*tW-yP=Kbm+oyaRxRxd}H#qiB5XQ`4h^o(7XdDX>a5N}!C*{a_6{26~1J_>=IpYcaoiih%6Sf;$$502Jub7oTSNblA#o`S~6Xm^I759as?J& z?rVG7J}ga&B`7*XKmSU#a(l?k>W#JZ(^$@_O@Om}itI=MQh-pY9ylcmGk<0T1V^&S?A2CuS$*I>jn zs2PLcFd>}1@-+dko6g>$v63~3_fi{v{NzH>fP#&^tHf|$!Tq114!ZispIQ5vsfZzQ zqE>za?9b%Rh;DI6pC5@G3JdJ6umaFp%qAU>Q}H@HvNQ*@H^a`3Z9Z(#d$8RL@A2wN z%d=x#%QWb;#a>;gox@Nu5@EI8iEatN6Kk}4AQJY_17#`8gar?2iF2LQNm>rn&+Nr< zv%2#grod}8CW!l#l=f?OagAUjU9*%riVM5UfH3+o-bvI}*NF0cucuXO`%&a(AxvSp z0(WcAa&x__3R*E&c+S0E|4ANrU_(r0SMPEAia`AHn^OL#N!(yo4pw|We+8}w+m?}< z)3or-CgMp?MmHy`DgAYzd+21@bP_A*Ot> zxjL4HzC$X$%kmy?rO)0}@kB2jyu=XIrGgsX_8P9Xo9Zh6#5c`Ks-p%Is(W$7xrUGF zTaq#oVs1z#cK&5OYaqr~VznuF9dR)79gAv55FQ{l26HR&q_|y?j zgayI+EEG`1ej3A0ouF-`>>a?C@8E+p&kg=#d8LBEO!8SkY8384M=IhwE`?m{d*b=Oz;B&1dllPZmd;;qc&yfye zsy7!bJ);crloQY&12oVpQhugdEo=)K$==B0o&_z5@Pnu=Zq>ZpYgF;@p3SOU{KT`4$9t-rUQgl9FH~nKjp$W(i%=v<3{jB@ z_UyQ4;;D(aOMV6^H&c++Gq;AS7=`a}=L$r=iEQxKgK4{XGBt#@Wd$hX1j5G- zLgLtxOrY9Y$6}R4u9G#BEcr}98PC|33Hv`p&MZHhB~c2|vY|zY9Wj5v;S3!SHc({s zs***CwqtauwfP{(+^f-zke9tu{PDnrac^WPDI0ubf$O7K#vVLEJfJaRs2(m? z@cd%y{iRml9Dx0eopqxYU)w;rh>3 z5kuu&;SrS1aqyk$0=Dp(Xrs#n^OSi+hkXobJmLpS;A3!c00=nPsqFj>PjA67n4DzE zk_^+Rl!!&OHn1S5D$dhGD$Crp`ill%ykDnX*x9LzT$6o*eYp{%yG%q}i*1k)@5dB= zcAOOLq@|>+4_&GiLQb-?jL1e+XZ`gLFlCDV;MDIDHvEKrbOf(;MxhkYRGI{#Gh`>- zUiLFK*1@ixDlGQtu<`cAb%I-!MS3;c#@%fy)3hzNc(1fDe^{bz`9X`-D|s``u}|QG zi+Zm5qC%c;TPR6GqV350_;Y&cA!b?ach5H*?|y4Lr7?%G<&0*GxLnBbY@QCsj0Ahx z_4L{s-(Rv&Xm?y=NCVS_-6HV5cd)si7pP&DssUW~s8DXOTcD~Exa`f&{i8pIgnDWY z1x;!x}~#ZJ~C@= zOzpw^ND`-zPX_%8TGKTsm)bB9jcVw@N=K(-kWq-e!^Z+o=a(J;tu;*N>2tzEC3W(- zVt;rlAK}quj@QSaozk7r5PT)e+S9bD>~&h~^O}>-sOMP`Hf*_169!81gnS&h41FXh zH7YQkEUsIJZEU07*}N!#d`|F3@xf+vz3e^i-`c$uEP4C*DKNw{M~fr|rGsYHGEyiEZ$d{X!InWCKq5y#?>P)~BKB z<_W~r3f8;PXR*+3%Q-R^C3O?AFwbsWtxIRx3DGF3%fq z3bWFS>SB84_?pUGpey1`FLACp3D`IzkQ*BSn7;LI>xTSl-I&N-2@F^C*-ZDTvz4pq z^|Gmog*dieON_(f@^kv_!wPzOK&o@2YSZBx1cSiB?+;S^TermVqOmaZ;*%n*&ITHe zpRzny72gTvu`U$v2Kd@8 z697;m)m^=YP5Jxg)i&p0jSjg(=88(uciwNZ9Ga#Eg*n{ITr`;GHn+IrAfsp;&|Z zaNkSWzz8#F!nsxOBYKJqgZq=te`NBVDiI|_d0U;lb2DVmQJy_Nnj*eV_T3WV8t$vT z&I7zoHU=gI;4eYTw?gIdTo9p@& zz}CtZ?C1sqgJ3X-l_|*43TnsUY-h#}1;MT0>`)lk&J1emWDB-)WOoF+I;#8vhHNy{ zM1uqX++1YWU>cTAwkCEUD+rv!@$Vu#%+4ZAMM(zx4#l18zhKMBN~&GoI{<)N01V{o z(x-qEhwIx@6$K3`XJ==3clW@+z~|4OBM^wx)YP1uoZ{l*%F4>x+S-TBi#>i(MT~3`yP2DZ&Kvc<}v)Tx~q1u^XioMa<}5pDBRN?>wmFC(hRsFOP0{& z6oeciuEz`@C#57=CT{HipGk#*f$`T2z4`LDU|ttYp>~cp)6dlwlB;bSvBZOSId23q zSj``0Rw{#S*f|hICA6>$y?^+vSSx`_RbP#fU+L0I#-XHZiqC=_FIDH@X|~&Fsl zp|S2XtJYkd{Hazt#g`dx7P9p6eV{%P=_7+FG-+nK4n6zgBUPXlI_99D9h8qP%k_@0 zs`Qp)$?P1&D9jz?;e{{Bblf5bE(|ZiUw69m%mtWMfzbsMH!JAb-BKDW(K|QA-QkH5?J~2C zGOOE6A4X>%Q#G$C@-cE_<0tmN%eckf>MIJ_>ER-3zzj^f$7;`kW2~-Fboq7VBz{{* zV!-&)_R(UC4&6Y^F#GMew980tb+sA1mk!>ofe^L;Tfv#J(at$Jbj#d`MP{NtT@c?X zY_q%U<)Qv9rKU{*)MzTA z(b+KBEvpk*34_CT=Q#2CQ^Zze+>Qh5nWiX-WkoRr+-P?&aKbg*+NcN7=k9(fgoj9w ztmJk?NQ*t!ow7NsxWE)7cUZzlYqH8;)vHk8ZKcZCp+O4?BG^VMU_fO04^KnVP7SFO z;x6Z=>Xm%PQ$@YRd#M4~k{;$-bt+x?eWE0vs1GN}LG?;iELn27Vbj=1IxrWVD}VP17Os8&RTEz> z7%hn6p<2BJmOlDB+2$}fK8OjjV3{wd^5r6;PsrnJzBq0{k)(5X5ohH~_a_7Fv+|$~ zx6pyJ_BIaU*@EP|F%UL`E3zUGtmU4s^g?^RjL(>p4a8_|pra098=^3ukLo>Nx-9QA z`W=)WmAwAY9$$0~QYC0daEupoegeiMGmQgrdCC@;x4ofr1K$a!7)dds5n)aA@GgR#e)J`72&eb4)pQY}YP4pG9#UN$9giwDm_ zb^y!g$=#@ei``NAWQ8=&r9kr+hA`d(CO%9n$u}3-M?2*{2Q(kjg)i5TpPLda+Df+4 zwI}V?@-eL)MtIZjW{>T_#OWuZ~dJmZ|HNA<=x z6_lTsHhN1bQfX474MvSiR0eq0`y>a~O@fNRCw$ij~}6Z1m>ZupmYZG=Fqy z&(+0XR<$Y}Ev4VQs35)x)SgJns%oIh*M2)8ZqEuvs(=j*N2Op%Uw$Xm6Ydx?WjT7! zPoTIb{U{rM@52tKqN^x`i-!mfcOxA>UU??z{D~+`Vlersf(Wwe#leaCbJT>WSMvKj z3dE0NUuab>SDmQ?SyaIxBVzG22U8G!wsf}brF&;F_}n5eAHQ_4GyB=wmq&2<2LH;gI7l@gR{>(!#UgMAb&lI2%Nv z@!6$_fxF(~DMeKQts0QFn{^g8<1|B<-I%NW7z+cKXS6{gc~WS=Cap~DP$P%FP=-uN zrx_mU3e|)m%-}JPAZ~GbUxHLJ+)@n(9ErkohjcT5LNc^;{<&rIiFi%I&PHQi?3dm- znZ9YoHZWI-Qaz+UdYpy}z-;#Tglc(|N>q%Dn?fsXVNBrE4oG_zF} zySQVIrspTYnVeP997BjN-sL7eq+NF5=(?_qLihH~*)nENjYOzOcItlP+vA=NvEkzw z*d@s@;(N1{`RDuWuv4Bov-Vy|D>m8S`#v-k3K8%~Tj_4zt=3pOjyR5NnLtww1_8z= z^fq!wSS#oR?!nq3XZ$sOM3icLg;b=G!k~|mgm>z`<3_Bw2PMo)cA&uG7z!U89Vp)I zm}khFp~qSoHF!YURlt$%MCvVyFLZj3t+U0)?VGua*e+glpPY!(8j`aK4FT!q!QG%} zo)h$*E?(wWwaYjO60?J9)kDpguf4z3P#I1ZOwZmffHN#EBz9Xy64=qQ$zVO~<=rfk zjr}AOl;J?&OB42pO;VSQdJOl=yHHUN;@<5oc1;3>~Fh$xCkUU zFdgUmw1hYFgQbL`zvmFIB^uSc=TdkJcbpXG9-{eSXo;lmBM)dGjtGgQj3JB>4x52F zIRrioh6i+*QmQx)kdZBKySFaBLVw#_K!d-#si5?@PJg_CX;gmg`NQWGWA;$~Y3vH0 zr^dEVKLZa*y2zC9yFJ~OAIqk=yxjMv(PM?~20x~+q67Kc79c{-sK5m01p0ViZaui& zQyaLYfB!qVP!&^1LIR~QDD}*L`!!$JXr*H%oN$IP;rIeciIA4Wt$w(P)zF_0eXq%4 z01acM3|-+nI_>c5)8W{S$fs9L58m8u)V^z3a7VsDKVIQ|b^k6gg?GxJ_ssCBW0>AU z6FfxIa_#i!qz$o!bxYJ8+7~&MbyKb4!#z)AG1!rPFcEKErme{}d7=v;07eHQqKkExi}-RkSHf!jmT0`ENbEgCWfDx*cRct z9~SbRau>C}9kY5%|2`G<`^TqZ_j_M4@tm*?xWA_P{(j+$x9|k&C%kfDv1$9TOXSxN zVkDw=FVPIYmg~->#|VTzv!4?M1{;!#Dn@&%12h>=-}sgcJpLf3%3`zLTX~r=9c~JL z)fjL427S2k`#44tgLt+8oudnFuGVazoE(=#`>=1$9# z9<}1*(fmfaSW8-j>uv9Zq*>j2iC>cgH&+blG4Gf+T@e{XA20W^;wiWXq7373A({*C zFEM!u5%7T#%4MU#7SGonPU{TWz=f!f$)*GbNRfGRg58>wyk2=;@Okb=pBm}u@yL_P zsTXN1`Rs5`In{h+pAgfY3Z<$IbD{`l)JBGUo8a`55-Lnq5g1`HUuIXLsJSp!(n(!# zpC-ZK6{rox#-mlhnp|J3f>3pmjUoO5tG3*w9$d;1IBh~^9pQrg?&JfhP4ReLdvf6_ z2kHK#*R}`S`Ls;QWXq0bCpo5fFX))<)01=m{x>2MUg*dA#pyVg0^JA3Ci99PIQ)jK z$)t;2u(LC#Cyp?)jWMK^Oe)d5Z%0nFKGOSS{7o42t!G^f{h4*Wv^K6`P(!(-PEd;o zh4$xLEBk04or=7B7cT3wdW3IPsULLQ9+E3j_ALkYHWCT$U9^ZJlRk~amO5}9>X8;v|+4q5Yk5@agWA&Sokw|q-_ z^7lJ(Z9bAjQ0sGdaLKLW%Uo`2G_c@zvBwI4k*(9G>9HR-lIK`OxA)&u3}A(#Sy7ld zJ&_%~w~->2j8^;Cr;wZC)GXEru+RKThQuls@&v45jPuZ*B+e@n`c4zYh=$P=806`! z%Q{p|&@*@|O;xQcrS<)=<8plNw{cLNvRk}ZLuukt5-uXAVTrL#ckKQ>drZsVM3O>A znBu4he)f5EmZyG2RT;3ZFI%RtKYBsOlA>EYuK^`q-^&TXo$qPI0qDK0RW974tvH8C z^WJL^LsU&0hl_c9d}F(Rq62GmqLckpHLXdEmM?%@Tn5p>xdu-|nv)TOxjnim#4IY0Y(fFh+PE^lTqUqC>N6t6UuScnCQR#tM^(F4kvz6HEUAD==g0UJm2d^?! z1IsuNDSSDo@NqCACKqB+fo_@kr5MO<8kltYn$CYl^?3_=6MG`>X zU`z3ES8i$U6gvbxH)d(6bpcx?u}8_yiC(Mn+*M&q=cijYJHs^}EZ)W2002Vl-*$#r zH#5|!6-OP{Zn3q-;HhPZ9EzH z1AQ8w@C+dW6HMMJ`-*Ul^wmFcVHgk=URhtjkBl!E3|QioX}+~eMp!s{uwXYUc#h|} zd2Sy(S{Rx6%Glsb##|=q*HwHE(Q=!aGH5z_?yxP9)^rsK|HrC`86984h6(RW$GXQ_ zMs8|qBB|&}aH+cS&+$bFrymi+b88aQ%84y&kmgGv1zv1xi}u+ zbrDyT zjXjOD=u{4Ua!}e~frk@B+Nr(d!;{rzwrS&^pt!p%NZ@coI6l2yLf2t zj-+jH$5vmt^*Mg!ghiys;#z`t>EP^eJ(r~|RX38tpfR-{%jCh<{DN@seO?W^U@T3f zj({Y2t9Hp|V*zj??l3Tl)x%_G05?LsKcpoTA?HlrCxNTTK1&Ae^+f837^YBb;wI-I zDD%7OFnOHOY{gwTc9wp5-$7x7UL49G5l}0WTey#xy_sf0up(Y~QU2MZWoeM0UlukI_p{CIqP|8I z%DZEcRwHE2al%ZD&u`eQo$8uNGg``WM-YWj{YvcQXt^!m6+5$()`GfWjT z$kczc1!VTRPp*?7SVG^5`trR`9B&Jlv;_%sU!_|EPWd$x#wa&9#LFC;6XIp*Jwr;U zk{BU9mY56}l|8}&c1CThb87m?tAt&E=j!S~D?>d>(DaSFa*2a=hzZG%Tu2 zv2psgf#sD7R@vLZ_BrA(Skt~emRsg(T|SDma0Ja$CKput)Mv;LjVA13b)@dx3TyP!rxQ)DSKTA!@IOss)2W6jQvjS^)MsQ%oEw2kzUDy<0z zP=MgYuI^{al6|6D=o6Zh&plDTru!JCB`2%ReO*0=QND{rj#0|W^A{(d%b9F1O-RkS zw$FN2G&!%V5@(yl#|!bDvEhU2=BIRFv&s6nG?=nQhS@+BOz}W=hZa{AbRqU2&{~Dk zdP8jJpu@NDI?vUiNJtw3qqaaD-jT9wO5s(mFPoa%FiyFPJf~^tG9@`@&NyJvMM@l? zPD7&tD$n8?*zCLVkd64xd4KI453 z#%{LnsmWp~3kznf2qSEoc*4%z?LvO5$d{s6ayl3LG9OX3J9Rh{^#yS;>GjjoFe3E| zghSgRs#X&^@VXM!UWm}MDt@S2z>IIvm7-stuM_WF09iz=f1pQslI*n~2rg=TYrx`|nr2@!rKIL{e#fJ2_i>!8CLv3=>~6IQ+2kTB7`mTGUdliO<- z$R2dM#%>(-e#o|f}43}kNL@H`18||B&E?sL%)P)BRsU#vqJKg)M4;a5&dI-h9&1Z zGP$vkBobTJ>JgoXsfN|(I=(CwT0Bg)^M(k)VE2JwR)PA^S=f~f8qY2(JcUA=Q=L`=eW$x>T3(K zvoZ(69oZes%%8mwv%4M`siV{uhl%ef?;k#h#$LbWj%lkK1$Eq|I&Y5n}DiQ%DB0lU57`=dQYaB@y>;bi*5Ae;q09FGq= zQ7mJ&GcC1#d7Md)_A|KVu)F1O(P2-|gDWttE9hNKH&IWkQnQbhCFts>)n zO03F@?J!YhC4vR;9!_QzHsmD}hS7$VSv6EvPC~U+#KjwBP+sB?`DP|E&@b=RU7Lh* zZTNqT{yXP!{qn!$?sZ85YG-a`;pFgFwgdi{17vCnhJddhp$;5>)};82rXv&zfr0G6 zkpH0`+!5sH1UCWwTYbN4Da-+C;Q)sJe>87WG5@sjrltR0-|y^$L19j?>*SBqzu5P? z+RPzPkR$lttoxnTO+`Q2_`7@n@wH`&QW|1xa!S%1|2227Nu@My&>EQ*uVcq8yxm6E z7S0^2h~p-g!3DR>HStx$F7e~h^sEr{>5{D4(5jlqSw!sW4kQ`^oe*;3f%Wv0gh0Bg&1l+(IG({5eH<-RD2a z-wDHiYW*`8_(MhUy7;He;NK{J5BARy<(uTs56QabvY%2x|5W_>z;P3B{vj492JBx8|Q%>_>gy zuAS+h{{Abc{W`!uGy#Acum2(J`ypw+!TFT~|BQ6Q zFhAtuZ;*a);-67|?H`=$$A9+p&m8$@oL^fb{TrMgocZ5H@%# None: + self.parser = TikaDocumentParser(logging_group=None) + + def tearDown(self) -> None: + self.parser.cleanup() + + def test_basic_parse_odt(self): + """ + GIVEN: + - An input ODT format document + WHEN: + - The document is parsed + THEN: + - Document content is correct + - Document date is correct + """ + test_file = self.SAMPLE_DIR / Path("sample.odt") + + self.parser.parse(test_file, "application/vnd.oasis.opendocument.text") + + self.assertEqual( + self.parser.text, + "This is an ODT test document, created September 14, 2022", + ) + self.assertIsNotNone(self.parser.archive_path) + with open(self.parser.archive_path, "rb") as f: + # PDFs begin with the bytes PDF-x.y + self.assertTrue(b"PDF-" in f.read()[:10]) + + # TODO: Unsure what can set the Creation-Date field in a document, enable when possible + # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + + def test_basic_parse_docx(self): + """ + GIVEN: + - An input DOCX format document + WHEN: + - The document is parsed + THEN: + - Document content is correct + - Document date is correct + """ + test_file = self.SAMPLE_DIR / Path("sample.docx") + + self.parser.parse( + test_file, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + + self.assertEqual( + self.parser.text, + "This is an DOCX test document, also made September 14, 2022", + ) + self.assertIsNotNone(self.parser.archive_path) + with open(self.parser.archive_path, "rb") as f: + self.assertTrue(b"PDF-" in f.read()[:10]) + + # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))