diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 402a37215..c410594bb 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -52,7 +52,18 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - parsed = client.tika.as_text.from_file(document_path, mime_type) + try: + parsed = client.tika.as_text.from_file(document_path, mime_type) + except httpx.HTTPStatusError as err: + # Workaround for https://issues.apache.org/jira/browse/TIKA-4110 + # Tika fails with some files as multi-part form data + if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR: + parsed = client.tika.as_text.from_buffer( + document_path.read_bytes(), + mime_type, + ) + else: # pragma: nocover + raise except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " diff --git a/src/paperless_tika/tests/samples/multi-part-broken.odt b/src/paperless_tika/tests/samples/multi-part-broken.odt new file mode 100644 index 000000000..82f593f80 Binary files /dev/null and b/src/paperless_tika/tests/samples/multi-part-broken.odt differ diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index f34278467..1c6225bdc 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -111,3 +111,27 @@ class TestTikaParserAgainstServer(TestCase): self.assertIsNotNone(self.parser.archive_path) with open(self.parser.archive_path, "rb") as f: self.assertTrue(b"PDF-" in f.read()[:10]) + + def test_tika_fails_multi_part(self): + """ + GIVEN: + - An input ODT format document + - The document is known to crash Tika when uploaded via multi-part form data + WHEN: + - The document is parsed + THEN: + - An archive file is produced + - The archive file starts with a PDF header + See also: + - https://issues.apache.org/jira/browse/TIKA-4110 + """ + test_file = self.SAMPLE_DIR / "multi-part-broken.odt" + 
util_call_with_backoff( + self.parser.parse, + [test_file, "application/vnd.oasis.opendocument.text"], + ) + + self.assertIsNotNone(self.parser.archive_path) + with open(self.parser.archive_path, "rb") as f: + self.assertTrue(b"PDF-" in f.read()[:10])