mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Updates the tika client library and handles the changes to it
This commit is contained in:
parent
0d6e79cb93
commit
e05b3441de
1
Pipfile
1
Pipfile
@ -66,6 +66,7 @@ scipy = "==1.8.1"
|
||||
reportlab = "==3.6.12"
|
||||
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
|
||||
cryptography = "==40.0.1"
|
||||
httpx = "*"
|
||||
|
||||
[dev-packages]
|
||||
# Linting
|
||||
|
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
|
||||
"sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {},
|
||||
@ -712,7 +712,7 @@
|
||||
"sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
|
||||
"sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"index": "pypi",
|
||||
"version": "==0.24.1"
|
||||
},
|
||||
"humanfriendly": {
|
||||
@ -1746,11 +1746,11 @@
|
||||
},
|
||||
"tika-client": {
|
||||
"hashes": [
|
||||
"sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
|
||||
"sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
|
||||
"sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
|
||||
"sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.1.1"
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"tornado": {
|
||||
"hashes": [
|
||||
|
@ -13,7 +13,6 @@ from humanfriendly import format_size
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
from tika_client.data_models import TikaKey
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
|
||||
with TikaClient(tika_url=self.tika_server) as client:
|
||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||
|
||||
if hasattr(parsed, "content") and parsed.content is not None:
|
||||
if parsed.content is not None:
|
||||
return parsed.content.strip()
|
||||
elif TikaKey.Content in parsed.data:
|
||||
# May not be a completely handled type, but
|
||||
# the Tika response may still include content
|
||||
return parsed.data[TikaKey.Content].strip()
|
||||
return ""
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
|
@ -4,7 +4,6 @@ from pathlib import Path
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from tika_client import TikaClient
|
||||
from tika_client.data_models import TikaKey
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self.text = None
|
||||
if hasattr(parsed, "content") and parsed.content is not None:
|
||||
self.text = parsed.content.strip()
|
||||
elif TikaKey.Content in parsed.data:
|
||||
# May not be a completely handled type, but
|
||||
# the Tika response may still include content
|
||||
self.text = parsed.data[TikaKey.Content].strip()
|
||||
self.text = parsed.content
|
||||
if self.text is not None:
|
||||
self.text = self.text.strip()
|
||||
|
||||
self.date = parsed.created
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
Loading…
x
Reference in New Issue
Block a user