mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Updates the tika client library and handles the changes to it
This commit is contained in:
parent
0d6e79cb93
commit
e05b3441de
1
Pipfile
1
Pipfile
@ -66,6 +66,7 @@ scipy = "==1.8.1"
|
|||||||
reportlab = "==3.6.12"
|
reportlab = "==3.6.12"
|
||||||
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
|
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
|
||||||
cryptography = "==40.0.1"
|
cryptography = "==40.0.1"
|
||||||
|
httpx = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
# Linting
|
# Linting
|
||||||
|
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
|
"sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {},
|
"requires": {},
|
||||||
@ -712,7 +712,7 @@
|
|||||||
"sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
|
"sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
|
||||||
"sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
|
"sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
|
||||||
],
|
],
|
||||||
"markers": "python_version >= '3.7'",
|
"index": "pypi",
|
||||||
"version": "==0.24.1"
|
"version": "==0.24.1"
|
||||||
},
|
},
|
||||||
"humanfriendly": {
|
"humanfriendly": {
|
||||||
@ -1746,11 +1746,11 @@
|
|||||||
},
|
},
|
||||||
"tika-client": {
|
"tika-client": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
|
"sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
|
||||||
"sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
|
"sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
|
||||||
],
|
],
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==0.1.1"
|
"version": "==0.2.0"
|
||||||
},
|
},
|
||||||
"tornado": {
|
"tornado": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
|
@ -13,7 +13,6 @@ from humanfriendly import format_size
|
|||||||
from imap_tools import MailAttachment
|
from imap_tools import MailAttachment
|
||||||
from imap_tools import MailMessage
|
from imap_tools import MailMessage
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
from tika_client.data_models import TikaKey
|
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
|
|||||||
with TikaClient(tika_url=self.tika_server) as client:
|
with TikaClient(tika_url=self.tika_server) as client:
|
||||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||||
|
|
||||||
if hasattr(parsed, "content") and parsed.content is not None:
|
if parsed.content is not None:
|
||||||
return parsed.content.strip()
|
return parsed.content.strip()
|
||||||
elif TikaKey.Content in parsed.data:
|
|
||||||
# May not be a completely handled type, but
|
|
||||||
# the Tika response may still include content
|
|
||||||
return parsed.data[TikaKey.Content].strip()
|
|
||||||
return ""
|
return ""
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
|
@ -4,7 +4,6 @@ from pathlib import Path
|
|||||||
import httpx
|
import httpx
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
from tika_client.data_models import TikaKey
|
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
self.text = None
|
self.text = parsed.content
|
||||||
if hasattr(parsed, "content") and parsed.content is not None:
|
if self.text is not None:
|
||||||
self.text = parsed.content.strip()
|
self.text = self.text.strip()
|
||||||
elif TikaKey.Content in parsed.data:
|
|
||||||
# May not be a completely handled type, but
|
|
||||||
# the Tika response may still include content
|
|
||||||
self.text = parsed.data[TikaKey.Content].strip()
|
|
||||||
|
|
||||||
self.date = parsed.created
|
self.date = parsed.created
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user