Consolidate get_date onto the DocumentParser parent class

2025-07-24 18:04:39 -05:00 · 2018-10-07 14:48:49 +01:00 · 2018-10-07 14:48:49 +01:00 · 2a3f766b93
commit 2a3f766b93
parent 14bb52b6a4
5 changed files with 83 additions and 90 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -9,6 +9,8 @@ Changelog
  for reporting this. `#414`_.
 * A bug in the Dockerfile meant that Tesseract language files weren't being
  installed correctly.  `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
 * The ``get_date()`` functionality of the parsers has been consolidated onto
  the ``DocumentParser`` class since much of that code was redundant anyway.
 2.4.0
 =====
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@ -1,9 +1,12 @@
 import logging
 import os
 import re
 import shutil
 import tempfile
 import re
 import dateparser
 from django.conf import settings
 from django.utils import timezone
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@ -32,6 +35,7 @@ class DocumentParser:
    """
    SCRATCH = settings.SCRATCH_DIR
    DATE_ORDER = settings.DATE_ORDER
    def __init__(self, path):
        self.document_path = path
@ -55,7 +59,52 @@ class DocumentParser:
        """
        Returns the date of the document.
        """
-        raise NotImplementedError()
+
        date = None
        date_string = None
        try:
            text = self.get_text()
        except ParseError:
            return None
        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            date_string = m.group(0)
            try:
                date = dateparser.parse(
                    date_string,
                    settings={
                        "DATE_ORDER": self.DATE_ORDER,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True
                    }
                )
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue
            if date is not None and next_year > date.year > 1900:
                break
            else:
                date = None
        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    date_string
                )
            )
        else:
            self.log("info", "Unable to detect date for document")
        return date
    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -4,7 +4,6 @@ import re
 import subprocess
 from multiprocessing.pool import Pool
 import dateparser
 import langdetect
 import pyocr
 from django.conf import settings
@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
 from pyocr.tesseract import TesseractError
 import pdftotext
-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError
 from .languages import ISO639
@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
@ -202,51 +200,6 @@ class RasterisedDocumentParser(DocumentParser):
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text
    def get_date(self):
        """
        Returns the first plausible date found in the document text, or
        ``None`` when the text cannot be read or holds no usable date.

        Every ``DATE_REGEX`` match is handed to ``dateparser`` in document
        order; the first one that parses to a year later than 1900 wins.
        """

        try:
            text = self.get_text()
        except ParseError:
            return None

        found = None
        matched_string = None

        for match in re.finditer(DATE_REGEX, text):
            matched_string = match.group(0)

            try:
                candidate = dateparser.parse(
                    matched_string,
                    settings={
                        "DATE_ORDER": self.DATE_ORDER,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True
                    }
                )
            except TypeError:
                # This match does not parse to a proper date; try the next
                continue

            # Only dates after 1900 are accepted; anything else is dropped
            # and the search carries on with the next match.
            if candidate is not None and candidate.year > 1900:
                found = candidate
                break

        if found is None:
            self.log("info", "Unable to detect date for document")
            return None

        self.log(
            "info",
            "Detected document date {} based on string {}".format(
                found.isoformat(),
                matched_string
            )
        )
        return found
 def run_convert(*args):
--- a/src/paperless_tesseract/tests/test_date.py
+++ b/src/paperless_tesseract/tests/test_date.py
@ -393,7 +393,33 @@ class TestDate(TestCase):
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
-    def test_crazy_date(self, *args):
+    def test_crazy_date_past(self, *args):
        document = RasterisedDocumentParser("/dev/null")
        document.get_text()
        self.assertIsNone(document.get_date())
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-2350 00:00:00"
    )
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
    def test_crazy_date_future(self, *args):
        """
        Text whose only date lies in the year 2350 must yield no date.
        """
        parser = RasterisedDocumentParser("/dev/null")
        parser.get_text()
        detected = parser.get_date()
        self.assertIsNone(detected)
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-0590 00:00:00"
    )
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
    def test_crazy_date_distant_past(self, *args):
        """
        Text whose only date lies in the year 590 must yield no date.

        This test carries its own name because ``test_crazy_date_past`` is
        already defined earlier in this class; reusing that name would
        rebind the attribute and leave only one of the two tests to run.
        """
        document = RasterisedDocumentParser("/dev/null")
        document.get_text()
        self.assertIsNone(document.get_date())
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@ -1,11 +1,9 @@
 import os
 import re
 import subprocess
 import dateparser
 from django.conf import settings
-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError
 class TextDocumentParser(DocumentParser):
@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
    CONVERT = settings.CONVERT_BINARY
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
    def get_thumbnail(self):
        """
-        The thumbnail of a txt is just a 500px wide image of the text
+        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951
@ -84,40 +81,6 @@ class TextDocumentParser(DocumentParser):
        return self._text
    def get_date(self):
        """
        Returns the date of the document, or ``None`` if the text cannot be
        read or no plausible date is found in it.

        Each ``DATE_REGEX`` match is handed to ``dateparser`` in document
        order.  A match is only accepted when it parses to a year later than
        1900, the same sanity check the rasterised parser applies, so that
        garbage such as ``01-07-0590`` is not taken for the document date.
        """

        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):

            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={
                        "DATE_ORDER": self.DATE_ORDER,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True
                    }
                )
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and date.year > 1900:
                break

            # Implausible or unparsable: forget it so that a rejected value
            # can never be returned once the loop runs out of matches.
            date = None

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    datestring
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date
 def run_command(*args):
    environment = os.environ.copy()