mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-19 10:19:27 -05:00
Retry Tika parsing with PUT instead of form data in the event of a 500 error response (#4334)
This commit is contained in:
parent
f7f5d0efa6
commit
ada67bd54e
@ -52,7 +52,18 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||||
parsed = client.tika.as_text.from_file(document_path, mime_type)
|
try:
|
||||||
|
parsed = client.tika.as_text.from_file(document_path, mime_type)
|
||||||
|
except httpx.HTTPStatusError as err:
|
||||||
|
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
|
||||||
|
# Tika fails with some files as multi-part form data
|
||||||
|
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
|
||||||
|
parsed = client.tika.as_text.from_buffer(
|
||||||
|
document_path.read_bytes(),
|
||||||
|
mime_type,
|
||||||
|
)
|
||||||
|
else: # pragma: nocover
|
||||||
|
raise
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Could not parse {document_path} with tika server at "
|
f"Could not parse {document_path} with tika server at "
|
||||||
|
BIN
src/paperless_tika/tests/samples/multi-part-broken.odt
Normal file
BIN
src/paperless_tika/tests/samples/multi-part-broken.odt
Normal file
Binary file not shown.
@ -111,3 +111,27 @@ class TestTikaParserAgainstServer(TestCase):
|
|||||||
self.assertIsNotNone(self.parser.archive_path)
|
self.assertIsNotNone(self.parser.archive_path)
|
||||||
with open(self.parser.archive_path, "rb") as f:
|
with open(self.parser.archive_path, "rb") as f:
|
||||||
self.assertTrue(b"PDF-" in f.read()[:10])
|
self.assertTrue(b"PDF-" in f.read()[:10])
|
||||||
|
|
||||||
|
def test_tika_fails_multi_part(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- An input ODT format document
|
||||||
|
- The document is known to crash Tika when uploaded via multi-part form data
|
||||||
|
WHEN:
|
||||||
|
- The document is parsed
|
||||||
|
THEN:
|
||||||
|
- An archive file is produced for the document
|
||||||
|
- The archive file starts with a PDF header
|
||||||
|
See also:
|
||||||
|
- https://issues.apache.org/jira/browse/TIKA-4110
|
||||||
|
"""
|
||||||
|
test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
|
||||||
|
|
||||||
|
util_call_with_backoff(
|
||||||
|
self.parser.parse,
|
||||||
|
[test_file, "application/vnd.oasis.opendocument.text"],
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertIsNotNone(self.parser.archive_path)
|
||||||
|
with open(self.parser.archive_path, "rb") as f:
|
||||||
|
self.assertTrue(b"PDF-" in f.read()[:10])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user