From df801d17e19c0ad6690d4080c3bd373213f70518 Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Wed, 25 Nov 2020 19:36:18 +0100
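Subject: [PATCH] reworked the interface of the parsers.

Parsers are now constructed with only the logging group. The document
path and MIME type are passed to parse(), get_thumbnail() and
get_optimised_thumbnail() instead of being stored on the parser.

parse() stores its results on the parser; get_text(), get_date() and
get_archive_path() return the stored text, date and archive_path
attributes, which all default to None.

Date detection moved out of DocumentParser.get_date() into the
module-level parse_date(filename, text). The consumer falls back to it
when the parser does not supply a date.

optimise_thumbnail() is folded into get_optimised_thumbnail(), and the
_guess_language() helper and its langdetect import are removed from the
tesseract parser.
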
---
 src/documents/consumer.py          |  14 ++-
 src/documents/parsers.py           | 169 +++++++++++++----------------
 src/paperless_tesseract/parsers.py |  44 +++-----
 src/paperless_text/parsers.py      |  20 +---
 4 files changed, 101 insertions(+), 146 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index b6a0a5912..fa61e9376 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class_for_mime_type
+from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
 from .signals import (
     document_consumption_finished,
     document_consumption_started
@@ -121,7 +121,7 @@ class Consumer(LoggingMixin):
 
         # This doesn't parse the document yet, but gives us a parser.
 
-        document_parser = parser_class(self.path, self.logging_group)
+        document_parser = parser_class(self.logging_group)
 
         # However, this already created working directories which we have to
         # clean up.
@@ -129,12 +129,18 @@ class Consumer(LoggingMixin):
         # Parse the document. This may take some time.
 
         try:
-            self.log("debug", f"Generating thumbnail for {self.filename}...")
-            thumbnail = document_parser.get_optimised_thumbnail()
             self.log("debug", "Parsing {}...".format(self.filename))
+            document_parser.parse(self.path, mime_type)
+
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
+            thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type)
+
             text = document_parser.get_text()
             date = document_parser.get_date()
+            if not date:
+                date = parse_date(self.filename, text)
             archive_path = document_parser.get_archive_path()
+
         except ParseError as e:
             document_parser.cleanup()
             raise ConsumerError(e)
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 542a5dae9..4ae1d1a92 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -107,6 +107,61 @@ def run_convert(input_file,
         raise ParseError("Convert failed at {}".format(args))
 
 
+def parse_date(filename, text):
+    """
+    Returns the first plausible date in the filename (if enabled) or text.
+    """
+
+    def __parser(ds, date_order):
+        """
+        Call dateparser.parse with a particular date ordering
+        """
+        return dateparser.parse(
+            ds,
+            settings={
+                "DATE_ORDER": date_order,
+                "PREFER_DAY_OF_MONTH": "first",
+                "RETURN_AS_TIMEZONE_AWARE":
+                True
+            }
+        )
+
+    date = None
+
+    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
+
+    # if filename date parsing is enabled, search there first:
+    if settings.FILENAME_DATE_ORDER:
+        for m in re.finditer(DATE_REGEX, filename):
+            date_string = m.group(0)
+
+            try:
+                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
+            except (TypeError, ValueError):
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                return date
+
+    # Iterate through all regex matches in text and try to parse the date
+    for m in re.finditer(DATE_REGEX, text):
+        date_string = m.group(0)
+
+        try:
+            date = __parser(date_string, settings.DATE_ORDER)
+        except (TypeError, ValueError):
+            # Skip all matches that do not parse to a proper date
+            continue
+
+        if date is not None and next_year > date.year > 1900:
+            break
+        else:
+            date = None
+
+    return date
+
+
 class ParseError(Exception):
     pass
 
@@ -117,29 +172,35 @@ class DocumentParser(LoggingMixin):
     `paperless_tesseract.parsers` for inspiration.
     """
 
-    def __init__(self, path, logging_group):
+    def __init__(self, logging_group):
         super().__init__()
         self.logging_group = logging_group
-        self.document_path = path
         self.tempdir = tempfile.mkdtemp(
             prefix="paperless-", dir=settings.SCRATCH_DIR)
 
-    def get_archive_path(self):
-        return None
+        self.archive_path = None
+        self.text = None
+        self.date = None
 
-    def get_thumbnail(self):
+    def parse(self, document_path, mime_type):
+        raise NotImplementedError()
+
+    def get_archive_path(self):
+        return self.archive_path
+
+    def get_thumbnail(self, document_path, mime_type):
         """
         Returns the path to a file we can use as a thumbnail for this document.
         """
         raise NotImplementedError()
 
-    def optimise_thumbnail(self, in_path):
-
+    def get_optimised_thumbnail(self, document_path, mime_type):
+        thumbnail = self.get_thumbnail(document_path, mime_type)
         if settings.OPTIMIZE_THUMBNAILS:
-            out_path = os.path.join(self.tempdir, "optipng.png")
+            out_path = os.path.join(self.tempdir, "thumb_optipng.png")
 
             args = (settings.OPTIPNG_BINARY,
-                    "-silent", "-o5", in_path, "-out", out_path)
+                    "-silent", "-o5", thumbnail, "-out", out_path)
 
             self.log('debug', f"Execute: {' '.join(args)}")
 
@@ -148,97 +209,13 @@ class DocumentParser(LoggingMixin):
 
             return out_path
         else:
-            return in_path
-
-    def get_optimised_thumbnail(self):
-        return self.optimise_thumbnail(self.get_thumbnail())
+            return thumbnail
 
     def get_text(self):
-        """
-        Returns the text from the document and only the text.
-        """
-        raise NotImplementedError()
+        return self.text
 
     def get_date(self):
-        """
-        Returns the date of the document.
-        """
-
-        def __parser(ds, date_order):
-            """
-            Call dateparser.parse with a particular date ordering
-            """
-            return dateparser.parse(
-                ds,
-                settings={
-                    "DATE_ORDER": date_order,
-                    "PREFER_DAY_OF_MONTH": "first",
-                    "RETURN_AS_TIMEZONE_AWARE":
-                    True
-                }
-            )
-
-        date = None
-        date_string = None
-
-        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
-        title = os.path.basename(self.document_path)
-
-        # if filename date parsing is enabled, search there first:
-        if settings.FILENAME_DATE_ORDER:
-            self.log("info", "Checking document title for date")
-            for m in re.finditer(DATE_REGEX, title):
-                date_string = m.group(0)
-
-                try:
-                    date = __parser(date_string, settings.FILENAME_DATE_ORDER)
-                except (TypeError, ValueError):
-                    # Skip all matches that do not parse to a proper date
-                    continue
-
-                if date is not None and next_year > date.year > 1900:
-                    self.log(
-                        "info",
-                        "Detected document date {} based on string {} "
-                        "from document title"
-                        "".format(date.isoformat(), date_string)
-                    )
-                    return date
-
-        try:
-            # getting text after checking filename will save time if only
-            # looking at the filename instead of the whole text
-            text = self.get_text()
-        except ParseError:
-            return None
-
-        # Iterate through all regex matches in text and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            date_string = m.group(0)
-
-            try:
-                date = __parser(date_string, settings.DATE_ORDER)
-            except (TypeError, ValueError):
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None and next_year > date.year > 1900:
-                break
-            else:
-                date = None
-
-        if date is not None:
-            self.log(
-                "info",
-                "Detected document date {} based on string {}".format(
-                    date.isoformat(),
-                    date_string
-                )
-            )
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
+        return self.date
 
     def cleanup(self):
         self.log("debug", "Deleting directory {}".format(self.tempdir))
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 8f694ef56..b72f95e2d 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -2,7 +2,6 @@ import os
 import re
 import subprocess
 
-import langdetect
 import ocrmypdf
 import pdftotext
 from django.conf import settings
@@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser):
     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
     """
 
-    def __init__(self, path, logging_group):
-        super().__init__(path, logging_group)
-        self._text = None
-        self._archive_path = None
-
-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
         """
         The thumbnail of a PDF is just a 500px wide image of the first page.
         """
@@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser):
                         alpha="remove",
                         strip=True,
                         trim=True,
-                        input_file="{}[0]".format(self.document_path),
+                        input_file="{}[0]".format(document_path),
                         output_file=out_path,
                         logging_group=self.logging_group)
         except ParseError:
@@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser):
                    "-q",
                    "-sDEVICE=pngalpha",
                    "-o", gs_out_path,
-                   self.document_path]
+                   document_path]
             if not subprocess.Popen(cmd).wait() == 0:
                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
             # then run convert on the output from gs
@@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser):
         if self._text:
             return self._text
 
+    def parse(self, document_path, mime_type):
         archive_path = os.path.join(self.tempdir, "archive.pdf")
 
         ocr_args = {
-            'input_file': self.document_path,
+            'input_file': document_path,
             'output_file': archive_path,
             'use_threads': True,
             'jobs': settings.THREADS_PER_WORKER,
@@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser):
 
         try:
             ocrmypdf.ocr(**ocr_args)
-            # success! announce that we have an archive document
-            self._archive_path = archive_path
-            self._text = get_text_from_pdf(self._archive_path)
+            # success! record the archive file and its extracted text
+            self.archive_path = archive_path
+            self.text = get_text_from_pdf(archive_path)
 
         except InputFileError as e:
             # This happens with some PDFs when used with the redo_ocr option.
             # This is not the end of the world, we'll just use what we already
             # have in the document.
-            self._text = get_text_from_pdf(self.document_path)
+            self.text = get_text_from_pdf(document_path)
             # Also, no archived file.
-            if not self._text:
+            if not self.text:
                 # However, if we don't have anything, fail:
                 raise ParseError(e)
 
@@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser):
             # Anything else is probably serious.
             raise ParseError(e)
 
-        if not self._text:
+        if not self.text:
             # This may happen for files that don't have any text.
             self.log(
                 'warning',
-                f"Document {self.document_path} does not have any text."
+                f"Document {document_path} does not have any text."
                 f"This is probably an error or you tried to add an image "
                 f"without text.")
-            return ""
-
-        return self._text
-
-    def get_archive_path(self):
-        return self._archive_path
-
-    def _guess_language(self, text):
-        try:
-            guess = langdetect.detect(text)
-            return guess
-        except Exception as e:
-            self.log('warning', f"Language detection failed with: {e}")
-            return None
+            self.text = ""
 
 
 def strip_excess_whitespace(text):
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index 015016fb3..f8f369ab0 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
     This parser directly parses a text document (.txt, .md, or .csv)
     """
 
-    def __init__(self, path, logging_group):
-        super().__init__(path, logging_group)
-        self._text = None
-
-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
         """
         The thumbnail of a text file is just a 500px wide image of the text
         rendered onto a letter-sized page.
@@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
             )
 
         def read_text():
-            with open(self.document_path, 'r') as src:
+            with open(document_path, 'r') as src:
                 lines = [line.strip() for line in src.readlines()]
                 text = "\n".join([line for line in lines[:n_lines]])
                 return text.replace('"', "'")
@@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
 
         return out_path
 
-    def get_text(self):
-
-        if self._text is not None:
-            return self._text
-
-        with open(self.document_path, 'r') as f:
-            self._text = f.read()
-
-        return self._text
+    def parse(self, document_path, mime_type):
+        with open(document_path, 'r') as f:
+            self.text = f.read()
 
 
 def run_command(*args):