From 4782b4da0701e0e01f4e9d5a08d77279b236798b Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sun, 18 Jun 2023 07:04:53 -0700 Subject: [PATCH] Adds better error handling/checking around getting content of a document via Tika Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com> --- Pipfile.lock | 6 ++--- src/paperless_mail/parsers.py | 9 +++++-- src/paperless_tika/parsers.py | 12 ++++++++-- src/paperless_tika/tests/samples/sample.doc | Bin 0 -> 23552 bytes src/paperless_tika/tests/test_live_tika.py | 25 ++++++++++++++++++++ 5 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 src/paperless_tika/tests/samples/sample.doc diff --git a/Pipfile.lock b/Pipfile.lock index d948729ef..6bf949a7f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1746,11 +1746,11 @@ }, "tika-client": { "hashes": [ - "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156", - "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610" + "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0", + "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d" ], "index": "pypi", - "version": "==0.0.3" + "version": "==0.1.1" }, "tornado": { "hashes": [ diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 3ec3e64a0..f7daa758e 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -13,6 +13,7 @@ from humanfriendly import format_size from imap_tools import MailAttachment from imap_tools import MailMessage from tika_client import TikaClient +from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser): with TikaClient(tika_url=self.tika_server) as client: parsed = client.tika.as_text.from_buffer(html, "text/html") - if "X-TIKA:content" in parsed.data: - return parsed.data["X-TIKA:content"].strip() + if hasattr(parsed, "content") and parsed.content is not None: + return parsed.content.strip() + elif TikaKey.Content in parsed.data: + # May not be a completely handled type, but + # the Tika response may still include content + return parsed.data[TikaKey.Content].strip() return "" except Exception as err: raise ParseError( diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 10447ff53..8b476bfd8 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -4,6 +4,7 @@ from pathlib import Path import httpx from django.conf import settings from tika_client import TikaClient +from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser): f"{settings.TIKA_ENDPOINT}: {err}", ) from err - self.text = parsed.content.strip() - self.date = parsed.metadata.created + self.text = None + if hasattr(parsed, "content") and parsed.content is not None: + self.text = parsed.content.strip() + elif TikaKey.Content in parsed.data: + # May not be a completely handled type, but + # the Tika response may still include content + self.text = parsed.data[TikaKey.Content].strip() + + self.date = parsed.created self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): diff --git a/src/paperless_tika/tests/samples/sample.doc b/src/paperless_tika/tests/samples/sample.doc new file mode 100644 index 0000000000000000000000000000000000000000..72178a7beccd5c551dc4f14b778a787e08952efd GIT binary patch literal 23552 zcmeHP3tWs@|35QR(?$0?Qd5#B-ESmCqD?oI=na`_y67^g5jIwY-LTZUg%*)@TWu@3 z6|t=)*Fup{N)as)o7OV_-|v|oW+Uvr@BjV0?`uwd=bZC9=X}pOzjL1RoZC#rMb-M; zAN$-U-OLEWBCmvUge@j#fO`<_S0Tg{+$eh`6beyn0syDP{1<7Ub^8gTPvoQsX;Ul3 zMkEAU7K|(*F#shJHa%>5{-FHV4NrwqA97M!#6gLWb8anBh?qA)?{o=;%CF@;x#*7M$fdwW^D(veU`kOPqLW10!W<`H?QY(CO&#haks(8vy#S1RMJw8akef-uwNc(0~4u&WC2A+Y8Hq z{c?Ago@tvdAMK}YI(<(z9k07h_ea|9DLpMe(^VeY-;>=vAKFjrnfCW&)4X&$cDFwi z-aY=m%1^gfPv!1OPM7zu+CA~%*t{MF&=?pLhC#2FCTj30CnW`M4~h9t*Fa!&Ofm=l zxf}s6S-^?lhtG`XB?!zo$=noP1SckeBZ%g4_;C@uB#t?Raw7Rj@mv8~oQeFTh}S8S zIVNE<1)T6C9#w{elV$PC>o1K>>$h23X*-8}o->(5fwklDBU7XnrWg-K^f#^>PhXs-i*z^I|!v@iZ zkOHgO51}WH5sTMFPN*-qAP-IIOjBjR*V8x2p82cWzBQ}icxwXZewBV5q^ z!Xl&^&+L1P>|MWUvKbLp0(^$h5~oCfEwEH(1%khww=fi6>kV2XQc>T7(9J zL3D|mrxwM<1snKC7au`B4|vxw2$Oic=E@{;;Nt;L1c@ODB#Ll|6|kv8JZU2a+lV1! zMAY*#85()BF&{Ti6T$^}0ShvD5e3*OMe{BuYmlo$EHS2N5mPT>C!6Nx$pJDBA%W0I zkuC7OY&H}dOPtLW*!*w@57(I6WGJuCr2(fC8( zXcrcOO|w|C6`-}auty#TT%2zGQI}}r>%+DhDkjg06FH&fMM>UA}x-#C_(2DBzVj|QaW zmU1Z`g6P61tPXaF8$Uq++dH9wa}s&UQ_N?_$9vzqW73gcluNmjXxaj2)!1}wa%xOEsxMPimYuW1^M@y+OpPA& z9Xn-vor7=9-k^*HeRPI%w<~8qJ#wtX#No@d&oVanZ%=JbjPly&ydpD6KQr&r?~cr~ zb}7bbFVmFH&m4b4=Rro6V`I77)h5<~f_{^nA2xMn8#QF^KjLEV^S!5=Y>N|BHOl_b3&*w?U~Cx^~>kxhs+0!2!$;2XMJt2-f{Oh=*gZzs2&oU z#EVO|FvrhDD=$tjvsC!<(c+`&mGj15D$^N|VzE8b&0^a;a@%Bnv|-=cz7J}BPqF2T zWf$c=>e%J{Dt}q$?bUZVS{rt=%NpEV&PLx(aW0*|uzYdejsdbEvlEwWORX&mE%B?| zHvGI*-|Zd;EZ3+he^ujK;-5b%qOx4&lBavEZJ5#g9a%F%uj*%S>RoT|b@`c=|5b-+ z`73vp3k%CE&u#KOG5-g%CU>a>ht*y&cfxXaS~Lb6&D_b`eMPUL=ul_%OQ_1*6^)&< zRx2GUx)h)bfK9%5o`u+_H#dPB#Y@5-qHxYzzKo;ddV5~(tjrpR@M079uzhFqi+>|- zIekW++Hz@g?bW8nbcYLd8dV#|DF&P6$S5x5oj3pLVB1`?9esCR_s~6Q7I*uUW^>DI zo$~!(tJi&YI(huO{#BWa_ZHpcR2P|689Xw(IrXs7bmwJbC*Hf-}x)@zFUW42uz za(_mN({DaD#f2Ms4}7dxG}g#y^hn*EKYz_)4lYt#_auht?w9w4^sRHP`Lh~ktXfSp zZNIE`dXdV%_2Q^Yd(;n;ojrAiaYp%XvQPg#wTNWd-cj`)d%ixUG&RP~)&I8bPs?}Y z_4fKoAwQwy#Rbih{IG9Sq@L`WeCj+WXmCjAq)Y>a@J_vp9`cWtOg5TyVw=v8(Mmg+ZigNd3*PIxWTT)hUHNjtbk=N7x(;GZ=;`WDLl^vQD zZ(PBMJWrZ4o^vUR}w-(6TaW$!`xZ#6St4&75cB_!W&@?3Y} zrRC>i_Xfqz8FKLCU^~u>LaX%9y?br#wfmJYg8z7;aIErr+`);u=sBZsEOk!&_S+4L zCXu!`H;;UfmQsB9H#O@!akafdrJqk9@}=+iOhsNw%b+F0C$uLGIni*oG`YTL?+N503tc-)rkGBxON>0IICi!Sq;Wkz*+ zouBjSWMk;&5nsG&8dy5-MFp!)W47Poj+jwKM?O=k)bl-VRkk#CXz#DuKYJSPwMT7Z zi${aTgW%!6I}J{$a2S})RnB*x%zh*rtv{l^=&|#u`l)8W3|@J#vF4fMj5Md5LM`p7 zTKNwS54P<%)pFY9u+GF8sXF@4n-dzZWbmVMM%LwxSo3SPW8_2YoWa8l2aWeAZG5q4 z_VXj-c9lI)Qj}gdcW1?vrT@tOd{E2f1*^ORa^p9+PBtFFGPtGW;55r`na$>_A$N|u zkD4&8+3J{Ywp*j!@F2&>yILC;O^_>HFrn3EpMxq_^J{rmOgwTxOF^dVpho9(teFb?c7w`ulHQ*Xd01cGNyQPXT8&ny(%vi zW*l$1kyw}evs_2Z&3gNOLyf)FN0;{QccyH{!dG?vx8XPH(RRYyhE!Fso$$T z+^Bu^Q0{~g!uwgL9y%29_)NKD8)`;W?!0^3=fGWiZPj`0#qC;pJBp>`!%{Lnug$5m zzgzE8mvgtO{p7(rG1^Pmdk=c0Y>Pi|v;4-Cj!f(70QE5m9)*wB7W_7Piq4g!=AS=1 z6EI1u%y&w;<%0CSa&Zxv_4nO&H%wh_b8Y`s*4_xCS>WrS< zx@QJIrzmAqDXna6`$o_FLZ9t@!mCc}MlZ`)yMJM!&jG>EiSoyeUWk|(e`S5tp1zm8 z0tAt!Q>ryKx()T4dW4-mU}M3qkp-H}wG->l{jg(6nNpqy;gf2OQYfY-U_#~d%3JqSZMsrUZqg=hUQ|=Yu`H;X?bV)I8SYln$R!7yjf#; zz%T1Fr}sL0?U#y@Gj1pSomUmjpK&;M)vsZ?ZMLflgMZvFKj_I1!HnD#w{c6{)OJrV zFzvs==|%ReQ6qvzKW(_DqIhjmD_7PfareuNor`Ma11f%Jp2Z28%OmruPwEoLqU9FndvN{t><6ddsaTW4oFWk4}DMq;#jUDUZ zH_0c^Otxn6frx#NGWK!=+~d=`6_ zweZH>l7;O4N5}4}-gvqE%=XLG$6b!j`0=EI&$rjc-TPz6a|>^?i5)HXqnf^+{o|OR zew7yA73Yl^uX&;2;DNQi&o&t6@EG~u7aa&cKfC16WF49DInQ+1XJ2};C%SqFSr7Cz z4GQ&5pDw(9H?<-}b#7|G9|ouIk2|twB)f$jGhcC9UARoVbBMJl8u~ZV zUHK>02(xQ^1FX{r2FTRpTh`dsR~iY+^QI)mSGU+KAG1%eQbG0W6-(R4mMqP>XV_Syx+DwKjV6o&AWfEYITzrV@_d)%w`AaCEF%md>T4Q z5cl22)iEa5Hj+yheJl@Edd(!A=m`Z9D>Alu(gRtM^dC*!2>l9FTi3C@OA=9Y#WUP3rOA|@fq*>FPOm?4gaoMZtvA%YvnPvAKl z&fz5+x{OeiAI=>X9+%|JP2>PULh>-KvthJAkT}f3A~_spt=wdDej+ac!bB(2;7y9M zh)Ckj0^;~M3u{YDdy9B(OoAaNEXvB+Fiv1)$Pvs2S41qh!lJBEW{om%MZ|)O8y*g8 zB#6=Fpd}D4lDv*%LrZMvIJUIJmX_Gj5<6OAPfP3#Ini-339(Qkv<%@?JM%)j}dd!qs`8Hu77_6#xcprjCo_O4e233h1XpmRvO44Y4N(j<0c1|Z&^ryFCh-1{FJ3JGnOd7RHL_R>qg}j1LeegA)QD|QqrI$Hl|~f zdZ7`){)H_~He+*u!E{(VQE3So`M1(>h&S*Es7_`GwXu>-(;;oRxs3-na6FNDKn8(s zQO0<^^&gZE{ZfR5YQ<+#Oe%^ASL#@VdVpv@M3R_~)&PgWVeBXS$$|xHI4X%591;-! zjG;P{h+tH@Dz=j>`2+VN-fd)E{V+hT4emF~8iD%`j6*O=&+iG;4Gf5bo6~O_102P= z85P*1p}I!c$VM8e_e<-^k$0WRELO@UD)33_=b zv4wH@eWBf`B;Nl84ZMOEk-TX1{8M$Xv)3=){G`)2T4hg`EHNEeSc6ZRECj%1Pinx! z#|%fmM3e*40Qkbn24Kg_1HiPA{0zV}6#(35$L@(UnR@^n9NPdYfad_*rABgm1o1J# zIRbL~;I%_|@1P&!hR5+|MuhVdU_vPfg##3nLv-ro2;DjW@0jo;elkB&!10ZYj0xva z7ZyS83ZRWEI?VKro%xnfIb5@gum5pvoCv@jsJQ^#q*??} z1uO^Pm*`gm@C+yifNS?W0Iu`D2jF(`egN(ZlmL*s93Z~_$8|j#@ijTUHjnO-BQtf* z1pc0EJ;)r>A&3HaFcaJ?z(NPc2;z9JvjBm_NHiePfJ6fl4M;Q~(SSq)5)DW+AklzC z0}>5LH1NNp0bC2?G8os^xX#9PJT8avTpxee5zq7S3?J9jxHiXiI-cF*x*yl_xX#D3 zeq7IM0PvUUa7~ZveoT))X4VG)KWayP(Fnh)iA!#M0EQcYZ3yTOFait!7y||ZF#IK8 zaF-kX2#NXEX#g(|^5Oaqhq%LKAOTzjngiPYc%wz68FXo|53%L2HxY82`h>4=ikq%q z#xnz1T9@2P>^q#|Q9stLM3_|m*l)2MACnI6lhD7OHW6x{1ec*kgB=dnqVRGh{hiZ~ z;pzkV<1<9-;A7HIaq!^*BTZe6jE8*j`>%7zXOMd&RVI|+w?grfR?o%+@<)y1v-`32 z$Gb7q!wZ)5TQ$e4njc>t$1eT