Working around current TIKA Library Bugs

2025-12-20 01:45:58 -06:00 · 2023-07-06 23:26:01 +02:00
parent db48d4c576
commit d875be60d4
1 changed files with 11 additions and 2 deletions
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,8 +1,13 @@
 import os
 from pathlib import Path
 import array
 import dateutil.parser
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from tika_client import TikaClient
 from documents.parsers import DocumentParser
@@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser):
        try:
            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
-                parsed = client.tika.as_text.from_file(document_path, mime_type)
+                with open(document_path, 'rb') as f:
+                    content = f.read()
+                    parsed = client.tika.as_text.from_buffer(content, mime_type)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
@@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser):
        if self.text is not None:
            self.text = self.text.strip()
-        self.date = parsed.created
+        tz = timezone.get_current_timezone()
+        self.date = timezone.make_aware(parsed.created,tz)
        self.archive_path = self.convert_to_pdf(document_path, file_name)
    def convert_to_pdf(self, document_path, file_name):