Retry Tika parsing with PUT instead of form data in the event of a 500 error response (#4334)

This commit is contained in:
Trenton H 2023-10-07 18:36:27 -07:00 committed by GitHub
parent f7f5d0efa6
commit ada67bd54e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 1 deletions

View File

@ -52,7 +52,18 @@ class TikaDocumentParser(DocumentParser):
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_file(document_path, mime_type)
try:
parsed = client.tika.as_text.from_file(document_path, mime_type)
except httpx.HTTPStatusError as err:
# Workaround https://issues.apache.org/jira/browse/TIKA-4110
# Tika fails with some files as multi-part form data
if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
parsed = client.tika.as_text.from_buffer(
document_path.read_bytes(),
mime_type,
)
else: # pragma: nocover
raise
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "

Binary file not shown.

View File

@ -111,3 +111,27 @@ class TestTikaParserAgainstServer(TestCase):
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
def test_tika_fails_multi_part(self):
    """
    GIVEN:
        - An input ODT format document
        - The document is known to crash Tika when uploaded via multi-part form data
    WHEN:
        - The document is parsed
    THEN:
        - An archive file is produced
        - The archive file begins with a PDF header
    See also:
        - https://issues.apache.org/jira/browse/TIKA-4110
    """
    test_file = self.SAMPLE_DIR / "multi-part-broken.odt"

    # Invoke the parser through the backoff helper (presumably it retries on
    # transient server errors -- confirm against its definition).
    util_call_with_backoff(
        self.parser.parse,
        [test_file, "application/vnd.oasis.opendocument.text"],
    )

    self.assertIsNotNone(self.parser.archive_path)
    with open(self.parser.archive_path, "rb") as f:
        # Only the leading bytes are needed to recognise the PDF marker, so
        # avoid reading the whole archive; assertIn reports the actual bytes
        # on failure instead of a bare "False is not true".
        self.assertIn(b"PDF-", f.read(10))