diff --git a/docs/changelog.rst b/docs/changelog.rst index 7daaa9d38..aefe65c25 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,8 @@ Changelog for reporting this. `#414`_. * A bug in the Dockerfile meant that Tesseract language files weren't being installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_. +* The ``get_date()`` functionality of the parsers has been consolidated onto + the ``DocumentParser`` class since much of that code was redundant anyway. 2.4.0 ===== diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 884f91ae4..29128eaad 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,9 +1,12 @@ import logging +import os +import re import shutil import tempfile -import re +import dateparser from django.conf import settings +from django.utils import timezone # This regular expression will try to find dates in the document at # hand and will match the following formats: @@ -32,6 +35,7 @@ class DocumentParser: """ SCRATCH = settings.SCRATCH_DIR + DATE_ORDER = settings.DATE_ORDER def __init__(self, path): self.document_path = path @@ -55,7 +59,52 @@ class DocumentParser: """ Returns the date of the document. """ - raise NotImplementedError() + + date = None + date_string = None + + try: + text = self.get_text() + except ParseError: + return None + + next_year = timezone.now().year + 5 # Arbitrary 5 year future limit + + # Iterate through all regex matches and try to parse the date + for m in re.finditer(DATE_REGEX, text): + + date_string = m.group(0) + + try: + date = dateparser.parse( + date_string, + settings={ + "DATE_ORDER": self.DATE_ORDER, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": True + } + ) + except TypeError: + # Skip all matches that do not parse to a proper date + continue + + if date is not None and next_year > date.year > 1900: + break + else: + date = None + + if date is not None: + self.log( + "info", + "Detected document date {} based on string {}".format( + date.isoformat(), + date_string + ) + ) + else: + self.log("info", "Unable to detect date for document") + + return date def log(self, level, message): getattr(self.logger, level)(message, extra={ diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 5305ff053..8ba162b9f 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -4,7 +4,6 @@ import re import subprocess from multiprocessing.pool import Pool -import dateparser import langdetect import pyocr from django.conf import settings @@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError, DATE_REGEX +from documents.parsers import DocumentParser, ParseError from .languages import ISO639 @@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser): DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY - DATE_ORDER = settings.DATE_ORDER DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE OCR_ALWAYS = settings.OCR_ALWAYS @@ -202,51 +200,6 @@ class RasterisedDocumentParser(DocumentParser): text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) return text - def get_date(self): - - date = None - datestring = None - - try: - text = self.get_text() - except ParseError as e: - return None - - # Iterate through all regex matches and try to parse the date - for m in re.finditer(DATE_REGEX, text): - datestring = m.group(0) - - try: - date = dateparser.parse( - datestring, - settings={ - "DATE_ORDER": self.DATE_ORDER, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": True - } - ) - except TypeError: - # Skip all matches that do not parse to a proper date - continue - - if date is not None and date.year > 1900: - break - else: - date = None - - if date is not None: - self.log( - "info", - "Detected document date {} based on string {}".format( - date.isoformat(), - datestring - ) - ) - else: - self.log("info", "Unable to detect date for document") - - return date - def run_convert(*args): diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index e75042ce1..15fed1a37 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -393,7 +393,33 @@ class TestDate(TestCase): "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", SCRATCH ) - def test_crazy_date(self, *args): + def test_crazy_date_past(self, *args): + document = RasterisedDocumentParser("/dev/null") + document.get_text() + self.assertIsNone(document.get_date()) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", + return_value="01-07-2350 00:00:00" + ) + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_crazy_date_future(self, *args): + document = RasterisedDocumentParser("/dev/null") + document.get_text() + self.assertIsNone(document.get_date()) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", + return_value="01-07-0590 00:00:00" + ) + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_crazy_date_past(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text() self.assertIsNone(document.get_date()) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index f02ba3ef8..afcfb013c 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -1,11 +1,9 @@ import os -import re import subprocess -import dateparser from django.conf import settings -from documents.parsers import DocumentParser, ParseError, DATE_REGEX +from documents.parsers import DocumentParser, ParseError class TextDocumentParser(DocumentParser): @@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser): CONVERT = settings.CONVERT_BINARY THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY - DATE_ORDER = settings.DATE_ORDER DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE OCR_ALWAYS = settings.OCR_ALWAYS @@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser): def get_thumbnail(self): """ - The thumbnail of a txt is just a 500px wide image of the text + The thumbnail of a text file is just a 500px wide image of the text rendered onto a letter-sized page. """ # The below is heavily cribbed from https://askubuntu.com/a/590951 @@ -84,40 +81,6 @@ class TextDocumentParser(DocumentParser): return self._text - def get_date(self): - date = None - datestring = None - - try: - text = self.get_text() - except ParseError as e: - return None - - # Iterate through all regex matches and try to parse the date - for m in re.finditer(DATE_REGEX, text): - datestring = m.group(0) - - try: - date = dateparser.parse( - datestring, - settings={'DATE_ORDER': self.DATE_ORDER, - 'PREFER_DAY_OF_MONTH': 'first', - 'RETURN_AS_TIMEZONE_AWARE': True}) - except TypeError: - # Skip all matches that do not parse to a proper date - continue - - if date is not None: - break - - if date is not None: - self.log("info", "Detected document date " + date.isoformat() + - " based on string " + datestring) - else: - self.log("info", "Unable to detect date for document") - - return date - def run_command(*args): environment = os.environ.copy()