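Reworked the interface of the parsers: the document path and MIME type are now passed to parse() and the thumbnail methods instead of the constructor.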

2025-07-02 16:14:39 -05:00 · 2020-11-25 19:36:18 +01:00 · 2020-11-25 19:36:18 +01:00 · df801d17e1
commit df801d17e1
parent d3c13f6c93
4 changed files with 101 additions and 146 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class_for_mime_type
+from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
 from .signals import (
    document_consumption_finished,
    document_consumption_started
@ -121,7 +121,7 @@ class Consumer(LoggingMixin):
        # This doesn't parse the document yet, but gives us a parser.
-        document_parser = parser_class(self.path, self.logging_group)
+        document_parser = parser_class(self.logging_group)
        # However, this already created working directories which we have to
        # clean up.
@ -129,12 +129,18 @@ class Consumer(LoggingMixin):
        # Parse the document. This may take some time.
        try:
            self.log("debug", f"Generating thumbnail for {self.filename}...")
            thumbnail = document_parser.get_optimised_thumbnail()
            self.log("debug", "Parsing {}...".format(self.filename))
            document_parser.parse(self.path, mime_type)
            self.log("debug", f"Generating thumbnail for {self.filename}...")
            thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type)
            text = document_parser.get_text()
            date = document_parser.get_date()
            if not date:
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()
        except ParseError as e:
            document_parser.cleanup()
            raise ConsumerError(e)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@ -107,59 +107,7 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))
-class ParseError(Exception):
+def parse_date(filename, text):
    pass
 class DocumentParser(LoggingMixin):
    """
    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """
    def __init__(self, path, logging_group):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)
    def get_archive_path(self):
        return None
    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()
    def optimise_thumbnail(self, in_path):
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")
            args = (settings.OPTIPNG_BINARY,
                    "-silent", "-o5", in_path, "-out", out_path)
            self.log('debug', f"Execute: {' '.join(args)}")
            if not subprocess.Popen(args).wait() == 0:
                raise ParseError("Optipng failed at {}".format(args))
            return out_path
        else:
            return in_path
    def get_optimised_thumbnail(self):
        return self.optimise_thumbnail(self.get_thumbnail())
    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()
    def get_date(self):
    """
    Returns the date of the document.
    """
@ -179,15 +127,12 @@ class DocumentParser(LoggingMixin):
        )
    date = None
        date_string = None
    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
        title = os.path.basename(self.document_path)
    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
-            self.log("info", "Checking document title for date")
+        for m in re.finditer(DATE_REGEX, filename):
            for m in re.finditer(DATE_REGEX, title):
            date_string = m.group(0)
            try:
@ -197,21 +142,8 @@ class DocumentParser(LoggingMixin):
                continue
            if date is not None and next_year > date.year > 1900:
                    self.log(
                        "info",
                        "Detected document date {} based on string {} "
                        "from document title"
                        "".format(date.isoformat(), date_string)
                    )
                return date
        try:
            # getting text after checking filename will save time if only
            # looking at the filename instead of the whole text
            text = self.get_text()
        except ParseError:
            return None
    # Iterate through all regex matches in text and try to parse the date
    for m in re.finditer(DATE_REGEX, text):
        date_string = m.group(0)
@ -227,19 +159,64 @@ class DocumentParser(LoggingMixin):
        else:
            date = None
        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    date_string
                )
            )
        else:
            self.log("info", "Unable to detect date for document")
    return date
 class ParseError(Exception):
    """
    Raised when a step of document parsing fails, e.g. when an external
    tool such as convert or optipng exits with a non-zero status.
    """
    pass
 class DocumentParser(LoggingMixin):
    """
    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.

    The document path and mime type are passed to `parse()` and to the
    thumbnail methods rather than to the constructor.  Results are read back
    through `get_text()`, `get_date()` and `get_archive_path()`, which return
    the `text`, `date` and `archive_path` attributes.
    """
    def __init__(self, logging_group):
        """
        Set up the parser and create its scratch directory.

        :param logging_group: stored on the instance as `logging_group`.
        """
        super().__init__()
        self.logging_group = logging_group
        # Per-instance scratch directory below settings.SCRATCH_DIR; it is
        # removed again by cleanup().
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)
        # Values handed out by the getters below.  The base class never
        # assigns them after this point; presumably a subclass's parse()
        # fills them in -- confirm against the concrete parsers.
        self.archive_path = None
        self.text = None
        self.date = None
    def parse(self, document_path, mime_type):
        """
        Parse the document at `document_path`.

        Not implemented in the base class: always raises NotImplementedError
        here, so subclasses have to override it.
        """
        raise NotImplementedError()
    def get_archive_path(self):
        """
        Returns the `archive_path` attribute (None unless something set it).
        """
        return self.archive_path
    def get_thumbnail(self, document_path, mime_type):
        """
        Returns the path to a file we can use as a thumbnail for this document.

        Not implemented in the base class: always raises NotImplementedError
        here, so subclasses have to override it.
        """
        raise NotImplementedError()
    def get_optimised_thumbnail(self, document_path, mime_type):
        """
        Returns the path to the thumbnail produced by `get_thumbnail()`,
        run through optipng first when settings.OPTIMIZE_THUMBNAILS is truthy.

        With optimisation enabled the result is written to
        "thumb_optipng.png" inside this parser's scratch directory and that
        path is returned; otherwise the path from `get_thumbnail()` is
        returned unchanged.

        Raises ParseError if the optipng process exits with a non-zero status.
        """
        thumbnail = self.get_thumbnail(document_path, mime_type)
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "thumb_optipng.png")
            args = (settings.OPTIPNG_BINARY,
                    "-silent", "-o5", thumbnail, "-out", out_path)
            self.log('debug', f"Execute: {' '.join(args)}")
            # `not x == 0` is evaluated as `not (x == 0)`: any non-zero exit
            # status of the child process is treated as a failure.
            if not subprocess.Popen(args).wait() == 0:
                raise ParseError("Optipng failed at {}".format(args))
            return out_path
        else:
            return thumbnail
    def get_text(self):
        """
        Returns the `text` attribute (None unless something set it).
        """
        return self.text
    def get_date(self):
        """
        Returns the `date` attribute (None unless something set it).
        """
        return self.date
    def cleanup(self):
        """
        Deletes the scratch directory created in `__init__`, including
        everything inside it.
        """
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -2,7 +2,6 @@ import os
 import re
 import subprocess
 import langdetect
 import ocrmypdf
 import pdftotext
 from django.conf import settings
@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """
-    def __init__(self, path, logging_group):
+    def get_thumbnail(self, document_path, mime_type):
        super().__init__(path, logging_group)
        self._text = None
        self._archive_path = None
    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """
@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser):
                        alpha="remove",
                        strip=True,
                        trim=True,
-                        input_file="{}[0]".format(self.document_path),
+                        input_file="{}[0]".format(document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser):
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
-                   self.document_path]
+                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser):
        if self._text:
            return self._text
    def parse(self, document_path, mime_type):
        archive_path = os.path.join(self.tempdir, "archive.pdf")
        ocr_args = {
-            'input_file': self.document_path,
+            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser):
        try:
            ocrmypdf.ocr(**ocr_args)
-            # success! announce that we have an archive document
+            # success! announce results
-            self._archive_path = archive_path
+            self.archive_path = archive_path
-            self._text = get_text_from_pdf(self._archive_path)
+            self.text = get_text_from_pdf(archive_path)
        except InputFileError as e:
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
-            self._text = get_text_from_pdf(self.document_path)
+            self.text = get_text_from_pdf(document_path)
            # Also, no archived file.
-            if not self._text:
+            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e)
@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser):
            # Anything else is probably serious.
            raise ParseError(e)
-        if not self._text:
+        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
-                f"Document {self.document_path} does not have any text."
+                f"Document {document_path} does not have any text."
                f"This is probably an error or you tried to add an image "
                f"without text.")
-            return ""
+            self.text = ""
        return self._text
    def get_archive_path(self):
        return self._archive_path
    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            return guess
        except Exception as e:
            self.log('warning', f"Language detection failed with: {e}")
            return None
 def strip_excess_whitespace(text):
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
    This parser directly parses a text document (.txt, .md, or .csv)
    """
-    def __init__(self, path, logging_group):
+    def get_thumbnail(self, document_path, mime_type):
        super().__init__(path, logging_group)
        self._text = None
    def get_thumbnail(self):
        """
        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.
@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
            )
        def read_text():
-            with open(self.document_path, 'r') as src:
+            with open(document_path, 'r') as src:
                lines = [line.strip() for line in src.readlines()]
                text = "\n".join([line for line in lines[:n_lines]])
                return text.replace('"', "'")
@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
        return out_path
-    def get_text(self):
+    def parse(self, document_path, mime_type):
-
+        with open(document_path, 'r') as f:
-        if self._text is not None:
+            self.text = f.read()
            return self._text
        with open(self.document_path, 'r') as f:
            self._text = f.read()
        return self._text
 def run_command(*args):