From 6bcc26b48784e83cd23f80e7214b84a8732dc848 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 3 Aug 2023 09:52:39 -0700 Subject: [PATCH] Sets the timezone of creation, if the date is known and naive --- src/paperless_tika/parsers.py | 10 ++++------ src/paperless_tika/tests/test_tika_parser.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 0ba59d3f6..b6a9dd621 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -4,7 +4,6 @@ from pathlib import Path import httpx from django.conf import settings from django.utils import timezone - from tika_client import TikaClient from documents.parsers import DocumentParser @@ -53,9 +52,7 @@ class TikaDocumentParser(DocumentParser): try: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: - with open(document_path, "rb") as f: - content = f.read() - parsed = client.tika.as_text.from_buffer(content, mime_type) + parsed = client.tika.as_text.from_file(document_path, mime_type) except Exception as err: raise ParseError( f"Could not parse {document_path} with tika server at " @@ -66,9 +63,10 @@ class TikaDocumentParser(DocumentParser): if self.text is not None: self.text = self.text.strip() - tz = timezone.get_current_timezone() + self.date = parsed.created + if self.date is not None and timezone.is_naive(self.date): + self.date = timezone.make_aware(self.date) - self.date = timezone.make_aware(parsed.created, tz) self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index 8ba8e0e79..4f64afc04 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -3,6 +3,11 @@ import os from pathlib import Path from unittest import mock +try: + import zoneinfo +except ImportError: + from backports import zoneinfo + from django.test import TestCase from django.test import override_settings from httpx import Request @@ -21,6 +26,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): def tearDown(self) -> None: self.parser.cleanup() + @override_settings(TIME_ZONE="America/Chicago") def test_parse(self): # Pretend parse response self.httpx_mock.add_response( @@ -44,7 +50,15 @@ class TestTikaParser(HttpxMockMixin, TestCase): with open(self.parser.archive_path, "rb") as f: self.assertEqual(f.read(), b"PDF document") - self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21)) + self.assertEqual( + self.parser.date, + datetime.datetime( + 2020, + 11, + 21, + tzinfo=zoneinfo.ZoneInfo("America/Chicago"), + ), + ) def test_metadata(self): self.httpx_mock.add_response(