From d875be60d4ebd42047bd0f2a7c45372f8ef5b767 Mon Sep 17 00:00:00 2001 From: Simon Siebert Date: Thu, 6 Jul 2023 23:26:01 +0200 Subject: [PATCH] Working around current TIKA Library Bugs --- src/paperless_tika/parsers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 0558727f5..d69c5947a 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,8 +1,13 @@ import os from pathlib import Path +import array + +import dateutil.parser import httpx from django.conf import settings +from django.utils import timezone + from tika_client import TikaClient from documents.parsers import DocumentParser @@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - parsed = client.tika.as_text.from_file(document_path, mime_type) + with open(document_path, 'rb') as f: + content = f.read() + parsed = client.tika.as_text.from_buffer(content, mime_type) except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " @@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser): if self.text is not None: self.text = self.text.strip() - self.date = parsed.created + tz = timezone.get_current_timezone() + + self.date = timezone.make_aware(parsed.created,tz) self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name):