mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Adds better error handling/checking around getting content of a document via Tika
Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
This commit is contained in:
parent
4693632c7d
commit
4782b4da07
6
Pipfile.lock
generated
6
Pipfile.lock
generated
@@ -1746,11 +1746,11 @@
|
|||||||
},
|
},
|
||||||
"tika-client": {
|
"tika-client": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
|
"sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
|
||||||
"sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
|
"sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
|
||||||
],
|
],
|
||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==0.0.3"
|
"version": "==0.1.1"
|
||||||
},
|
},
|
||||||
"tornado": {
|
"tornado": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
|
@@ -13,6 +13,7 @@ from humanfriendly import format_size
|
|||||||
from imap_tools import MailAttachment
|
from imap_tools import MailAttachment
|
||||||
from imap_tools import MailMessage
|
from imap_tools import MailMessage
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
|
from tika_client.data_models import TikaKey
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
@@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser):
|
|||||||
with TikaClient(tika_url=self.tika_server) as client:
|
with TikaClient(tika_url=self.tika_server) as client:
|
||||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||||
|
|
||||||
if "X-TIKA:content" in parsed.data:
|
if hasattr(parsed, "content") and parsed.content is not None:
|
||||||
return parsed.data["X-TIKA:content"].strip()
|
return parsed.content.strip()
|
||||||
|
elif TikaKey.Content in parsed.data:
|
||||||
|
# May not be a completely handled type, but
|
||||||
|
# the Tika response may still include content
|
||||||
|
return parsed.data[TikaKey.Content].strip()
|
||||||
return ""
|
return ""
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
|
@@ -4,6 +4,7 @@ from pathlib import Path
|
|||||||
import httpx
|
import httpx
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
|
from tika_client.data_models import TikaKey
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
@@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
self.text = parsed.content.strip()
|
self.text = None
|
||||||
self.date = parsed.metadata.created
|
if hasattr(parsed, "content") and parsed.content is not None:
|
||||||
|
self.text = parsed.content.strip()
|
||||||
|
elif TikaKey.Content in parsed.data:
|
||||||
|
# May not be a completely handled type, but
|
||||||
|
# the Tika response may still include content
|
||||||
|
self.text = parsed.data[TikaKey.Content].strip()
|
||||||
|
|
||||||
|
self.date = parsed.created
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||||
|
|
||||||
def convert_to_pdf(self, document_path, file_name):
|
def convert_to_pdf(self, document_path, file_name):
|
||||||
|
BIN
src/paperless_tika/tests/samples/sample.doc
Normal file
BIN
src/paperless_tika/tests/samples/sample.doc
Normal file
Binary file not shown.
@@ -118,3 +118,28 @@ class TestTikaParserAgainstServer(TestCase):
|
|||||||
self.assertTrue(b"PDF-" in f.read()[:10])
|
self.assertTrue(b"PDF-" in f.read()[:10])
|
||||||
|
|
||||||
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
|
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
|
||||||
|
|
||||||
|
def test_basic_parse_doc(self):
    """
    GIVEN:
        - An input DOC format document
    WHEN:
        - The document is parsed
    THEN:
        - Document content is correct
        - A PDF archive version is created
    """
    # NOTE: the document date is not asserted here, so the docstring no
    # longer claims it is checked.
    test_file = self.SAMPLE_DIR / "sample.doc"

    self.try_parse_with_wait(
        test_file,
        "application/msword",
    )

    # NOTE(review): the expected text starts at "his", presumably to
    # sidestep the first character of the extracted text - confirm.
    self.assertIn(
        "his is a test document, saved in the older .doc format",
        self.parser.text,
    )
    self.assertIsNotNone(self.parser.archive_path)

    # Only the first bytes are needed to recognise the PDF marker, so
    # read just those instead of loading the whole archive file.
    with open(self.parser.archive_path, "rb") as f:
        header = f.read(10)
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn(b"PDF-", header)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user