Adds additional testing for both date parsing and consumed document created date

2025-11-25 23:59:09 -06:00 · 2022-04-12 19:52:56 -07:00
parent ce32089cc4
commit 8a6aaf4e2d
9 changed files with 345 additions and 42 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -3,6 +3,8 @@ import hashlib
 import os
 import uuid
 from subprocess import Popen
+from typing import Optional
+from typing import Type

 import magic
 from asgiref.sync import async_to_sync
@@ -23,6 +25,7 @@ from .models import Document
 from .models import DocumentType
 from .models import FileInfo
 from .models import Tag
+from .parsers import DocumentParser
 from .parsers import get_parser_class_for_mime_type
 from .parsers import parse_date
 from .parsers import ParseError
@@ -186,7 +189,7 @@ class Consumer(LoggingMixin):
        override_document_type_id=None,
        override_tag_ids=None,
        task_id=None,
-    ):
+    ) -> Document:
        """
        Return the document object if it was successfully created.
        """
@@ -220,7 +223,10 @@ class Consumer(LoggingMixin):

        self.log("debug", f"Detected mime type: {mime_type}")

-        parser_class = get_parser_class_for_mime_type(mime_type)
+        # Based on the mime type, get the parser for that type
+        parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
+            mime_type,
+        )
        if not parser_class:
            self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")

@@ -241,7 +247,10 @@ class Consumer(LoggingMixin):

        # This doesn't parse the document yet, but gives us a parser.

-        document_parser = parser_class(self.logging_group, progress_callback)
+        document_parser: DocumentParser = parser_class(
+            self.logging_group,
+            progress_callback,
+        )

        self.log("debug", f"Parser: {type(document_parser).__name__}")

@@ -270,7 +279,7 @@ class Consumer(LoggingMixin):

            text = document_parser.get_text()
            date = document_parser.get_date()
-            if not date:
+            if date is None:
                self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()
@@ -342,7 +351,7 @@ class Consumer(LoggingMixin):
                            ).hexdigest()

                # Don't save with the lock active. Saving will cause the file
-                # renaming logic to aquire the lock as well.
+                # renaming logic to acquire the lock as well.
                document.save()

                # Delete the file only if it was successfully consumed
@@ -362,7 +371,8 @@ class Consumer(LoggingMixin):
        except Exception as e:
            self._fail(
                str(e),
-                f"The following error occured while consuming " f"{self.filename}: {e}",
+                f"The following error occurred while consuming "
+                f"{self.filename}: {e}",
                exc_info=True,
            )
        finally:
@@ -376,21 +386,26 @@ class Consumer(LoggingMixin):

        return document

-    def _store(self, text, date, mime_type):
+    def _store(self, text, date, mime_type) -> Document:

        # If someone gave us the original filename, use it instead of doc.

        file_info = FileInfo.from_filename(self.filename)

-        stats = os.stat(self.path)
-
        self.log("debug", "Saving record to database")

-        created = (
-            file_info.created
-            or date
-            or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime))
-        )
+        if file_info.created is not None:
+            create_date = file_info.created
+            self.log("debug", f"Creation date from FileInfo: {create_date}")
+        elif date is not None:
+            create_date = date
+            self.log("debug", f"Creation date from parse_date: {create_date}")
+        else:
+            stats = os.stat(self.path)
+            create_date = timezone.make_aware(
+                datetime.datetime.fromtimestamp(stats.st_mtime),
+            )
+            self.log("debug", f"Creation date from st_mtime: {create_date}")

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED

@@ -400,8 +415,8 @@ class Consumer(LoggingMixin):
                content=text,
                mime_type=mime_type,
                checksum=hashlib.md5(f.read()).hexdigest(),
-                created=created,
-                modified=created,
+                created=create_date,
+                modified=create_date,
                storage_type=storage_type,
            )