Add support for a heuristic that extracts the document date from its text

2026-02-20 00:39:32 -06:00 · 2018-01-28 19:09:52 +01:00
parent c16c9a1325
commit 21fc51c09a
5 changed files with 40 additions and 3 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -118,12 +118,14 @@ class Consumer(object):

            parsed_document = parser_class(doc)
            thumbnail = parsed_document.get_thumbnail()
+            date = parsed_document.get_date()

            try:
                document = self._store(
                    parsed_document.get_text(),
                    doc,
-                    thumbnail
+                    thumbnail,
+                    date
                )
            except ParseError as e:

@@ -174,7 +176,7 @@ class Consumer(object):
        return sorted(
            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]

-    def _store(self, text, doc, thumbnail):
+    def _store(self, text, doc, thumbnail, date):

        file_info = FileInfo.from_path(doc)

@@ -182,7 +184,7 @@ class Consumer(object):

        self.log("debug", "Saving record to database")

-        created = file_info.created or timezone.make_aware(
+        created = file_info.created or date or timezone.make_aware(
                    datetime.datetime.fromtimestamp(stats.st_mtime))

        with open(doc, "rb") as f:
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -35,6 +35,12 @@ class DocumentParser(object):
        """
        raise NotImplementedError()

+    def get_date(self):
+        """
+        Returns the date of the document, as extracted from its text.
+        """
+        raise NotImplementedError()
+
    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group