From d875be60d4ebd42047bd0f2a7c45372f8ef5b767 Mon Sep 17 00:00:00 2001 From: Simon Siebert Date: Thu, 6 Jul 2023 23:26:01 +0200 Subject: [PATCH 1/3] Working around current TIKA Library Bugs --- src/paperless_tika/parsers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 0558727f5..d69c5947a 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,8 +1,13 @@ import os from pathlib import Path +import array + +import dateutil.parser import httpx from django.conf import settings +from django.utils import timezone + from tika_client import TikaClient from documents.parsers import DocumentParser @@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - parsed = client.tika.as_text.from_file(document_path, mime_type) + with open(document_path, 'rb') as f: + content = f.read() + parsed = client.tika.as_text.from_buffer(content, mime_type) except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " @@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser): if self.text is not None: self.text = self.text.strip() - self.date = parsed.created + tz = timezone.get_current_timezone() + + self.date = timezone.make_aware(parsed.created,tz) self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): From 56fcb3fee1bb3764c53f2d9cb21027e045891795 Mon Sep 17 00:00:00 2001 From: Simon Siebert Date: Thu, 6 Jul 2023 23:31:38 +0200 Subject: [PATCH 2/3] Working around current TIKA Library Bugs - lint --- src/paperless_tika/parsers.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index d69c5947a..0ba59d3f6 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,9 +1,6 @@ import os 
from pathlib import Path -import array - -import dateutil.parser import httpx from django.conf import settings from django.utils import timezone @@ -56,7 +53,7 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - with open(document_path, 'rb') as f: + with open(document_path, "rb") as f: content = f.read() parsed = client.tika.as_text.from_buffer(content, mime_type) except Exception as err: @@ -71,7 +68,7 @@ class TikaDocumentParser(DocumentParser): tz = timezone.get_current_timezone() - self.date = timezone.make_aware(parsed.created,tz) + self.date = timezone.make_aware(parsed.created, tz) self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): From 6bcc26b48784e83cd23f80e7214b84a8732dc848 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 3 Aug 2023 09:52:39 -0700 Subject: [PATCH 3/3] Sets the timezone of creation, if the date is known and naive --- src/paperless_tika/parsers.py | 10 ++++------ src/paperless_tika/tests/test_tika_parser.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 0ba59d3f6..b6a9dd621 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -4,7 +4,6 @@ from pathlib import Path import httpx from django.conf import settings from django.utils import timezone - from tika_client import TikaClient from documents.parsers import DocumentParser @@ -53,9 +52,7 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - with open(document_path, "rb") as f: - content = f.read() - parsed = client.tika.as_text.from_buffer(content, mime_type) + parsed = client.tika.as_text.from_file(document_path, mime_type) except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " @@ 
-66,9 +63,10 @@ class TikaDocumentParser(DocumentParser): if self.text is not None: self.text = self.text.strip() - tz = timezone.get_current_timezone() + self.date = parsed.created + if self.date is not None and timezone.is_naive(self.date): + self.date = timezone.make_aware(self.date) - self.date = timezone.make_aware(parsed.created, tz) self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index 8ba8e0e79..4f64afc04 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -3,6 +3,11 @@ import os from pathlib import Path from unittest import mock +try: + import zoneinfo +except ImportError: + from backports import zoneinfo + from django.test import TestCase from django.test import override_settings from httpx import Request @@ -21,6 +26,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): def tearDown(self) -> None: self.parser.cleanup() + @override_settings(TIME_ZONE="America/Chicago") def test_parse(self): # Pretend parse response self.httpx_mock.add_response( @@ -44,7 +50,15 @@ class TestTikaParser(HttpxMockMixin, TestCase): with open(self.parser.archive_path, "rb") as f: self.assertEqual(f.read(), b"PDF document") - self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21)) + self.assertEqual( + self.parser.date, + datetime.datetime( + 2020, + 11, + 21, + tzinfo=zoneinfo.ZoneInfo("America/Chicago"), + ), + ) def test_metadata(self): self.httpx_mock.add_response(