mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Adds better error handling/checking around getting content of a document via Tika
Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
This commit is contained in:

committed by
Trenton H

parent
4693632c7d
commit
4782b4da07
@@ -4,6 +4,7 @@ from pathlib import Path
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from tika_client import TikaClient
|
||||
from tika_client.data_models import TikaKey
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
@@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser):
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
self.text = parsed.content.strip()
|
||||
self.date = parsed.metadata.created
|
||||
self.text = None
|
||||
if hasattr(parsed, "content") and parsed.content is not None:
|
||||
self.text = parsed.content.strip()
|
||||
elif TikaKey.Content in parsed.data:
|
||||
# May not be a completely handled type, but
|
||||
# the Tika response may still include content
|
||||
self.text = parsed.data[TikaKey.Content].strip()
|
||||
|
||||
self.date = parsed.created
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
def convert_to_pdf(self, document_path, file_name):
|
||||
|
BIN
src/paperless_tika/tests/samples/sample.doc
Normal file
BIN
src/paperless_tika/tests/samples/sample.doc
Normal file
Binary file not shown.
@@ -118,3 +118,28 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
self.assertTrue(b"PDF-" in f.read()[:10])
|
||||
|
||||
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
|
||||
|
||||
def test_basic_parse_doc(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- An input DOC format document
|
||||
WHEN:
|
||||
- The document is parsed
|
||||
THEN:
|
||||
- Document content is correct
|
||||
- A PDF archive file is produced (the document date is not asserted by this test)
|
||||
"""
|
||||
test_file = self.SAMPLE_DIR / "sample.doc"
|
||||
|
||||
self.try_parse_with_wait(
|
||||
test_file,
|
||||
"application/msword",
|
||||
)
|
||||
|
||||
self.assertIn(
|
||||
"his is a test document, saved in the older .doc format",
|
||||
self.parser.text,
|
||||
)
|
||||
self.assertIsNotNone(self.parser.archive_path)
|
||||
with open(self.parser.archive_path, "rb") as f:
|
||||
self.assertTrue(b"PDF-" in f.read()[:10])
|
||||
|
Reference in New Issue
Block a user