Merge branch 'dev' into feature-websockets-status

2026-02-03 23:22:42 -06:00 · 2020-12-06 22:53:54 +01:00
parent 6cf0b851b7 eede5595e9
commit 522ada88ea
179 changed files with 5678 additions and 2460 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -1,4 +1,5 @@
 import logging
+import mimetypes
 import os
 import re
 import shutil
@@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type):
    return get_parser_class_for_mime_type(mime_type) is not None


+def get_default_file_extension(mime_type):
+    for response in document_consumer_declaration.send(None):
+        parser_declaration = response[1]
+        supported_mime_types = parser_declaration["mime_types"]
+
+        if mime_type in supported_mime_types:
+            return supported_mime_types[mime_type]
+
+    ext = mimetypes.guess_extension(mime_type)
+    if ext:
+        return ext
+    else:
+        return ""
+
+
+def is_file_ext_supported(ext):
+    if ext:
+        return ext.lower() in get_supported_file_extensions()
+    else:
+        return False
+
+
+def get_supported_file_extensions():
+    extensions = set()
+    for response in document_consumer_declaration.send(None):
+        parser_declaration = response[1]
+        supported_mime_types = parser_declaration["mime_types"]
+
+        for mime_type in supported_mime_types:
+            extensions.update(mimetypes.guess_all_extensions(mime_type))
+
+    return extensions
+
+
 def get_parser_class_for_mime_type(mime_type):

    options = []
@@ -107,21 +142,59 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))


-def run_unpaper(pnm, logging_group=None):
-    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
+def parse_date(filename, text):
+    """
+    Returns the date of the document.
+    """

-    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
-                    pnm_out)
+    def __parser(ds, date_order):
+        """
+        Call dateparser.parse with a particular date ordering
+        """
+        return dateparser.parse(
+            ds,
+            settings={
+                "DATE_ORDER": date_order,
+                "PREFER_DAY_OF_MONTH": "first",
+                "RETURN_AS_TIMEZONE_AWARE":
+                True
+            }
+        )

-    logger.debug(f"Execute: {' '.join(command_args)}",
-                 extra={'group': logging_group})
+    date = None

-    if not subprocess.Popen(command_args,
-                            stdout=subprocess.DEVNULL,
-                            stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError(f"Unpaper failed at {command_args}")
+    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit

-    return pnm_out
+    # if filename date parsing is enabled, search there first:
+    if settings.FILENAME_DATE_ORDER:
+        for m in re.finditer(DATE_REGEX, filename):
+            date_string = m.group(0)
+
+            try:
+                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
+            except (TypeError, ValueError):
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                return date
+
+    # Iterate through all regex matches in text and try to parse the date
+    for m in re.finditer(DATE_REGEX, text):
+        date_string = m.group(0)
+
+        try:
+            date = __parser(date_string, settings.DATE_ORDER)
+        except (TypeError, ValueError):
+            # Skip all matches that do not parse to a proper date
+            continue
+
+        if date is not None and next_year > date.year > 1900:
+            break
+        else:
+            date = None
+
+    return date


 class ParseError(Exception):
@@ -134,27 +207,36 @@ class DocumentParser(LoggingMixin):
    `paperless_tesseract.parsers` for inspiration.
    """

-    def __init__(self, path, logging_group, progress_callback):
+    def __init__(self, logging_group, progress_callback):
        super().__init__()
        self.logging_group = logging_group
-        self.document_path = path
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)
+
+        self.archive_path = None
+        self.text = None
+        self.date = None
        self.progress_callback = progress_callback

-    def get_thumbnail(self):
+    def parse(self, document_path, mime_type):
+        raise NotImplementedError()
+
+    def get_archive_path(self):
+        return self.archive_path
+
+    def get_thumbnail(self, document_path, mime_type):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

-    def optimise_thumbnail(self, in_path):
-
+    def get_optimised_thumbnail(self, document_path, mime_type):
+        thumbnail = self.get_thumbnail(document_path, mime_type)
        if settings.OPTIMIZE_THUMBNAILS:
-            out_path = os.path.join(self.tempdir, "optipng.png")
+            out_path = os.path.join(self.tempdir, "thumb_optipng.png")

            args = (settings.OPTIPNG_BINARY,
-                    "-silent", "-o5", in_path, "-out", out_path)
+                    "-silent", "-o5", thumbnail, "-out", out_path)

            self.log('debug', f"Execute: {' '.join(args)}")

@@ -163,97 +245,13 @@ class DocumentParser(LoggingMixin):

            return out_path
        else:
-            return in_path
-
-    def get_optimised_thumbnail(self):
-        return self.optimise_thumbnail(self.get_thumbnail())
+            return thumbnail

    def get_text(self):
-        """
-        Returns the text from the document and only the text.
-        """
-        raise NotImplementedError()
+        return self.text

    def get_date(self):
-        """
-        Returns the date of the document.
-        """
-
-        def __parser(ds, date_order):
-            """
-            Call dateparser.parse with a particular date ordering
-            """
-            return dateparser.parse(
-                ds,
-                settings={
-                    "DATE_ORDER": date_order,
-                    "PREFER_DAY_OF_MONTH": "first",
-                    "RETURN_AS_TIMEZONE_AWARE":
-                    True
-                }
-            )
-
-        date = None
-        date_string = None
-
-        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
-        title = os.path.basename(self.document_path)
-
-        # if filename date parsing is enabled, search there first:
-        if settings.FILENAME_DATE_ORDER:
-            self.log("info", "Checking document title for date")
-            for m in re.finditer(DATE_REGEX, title):
-                date_string = m.group(0)
-
-                try:
-                    date = __parser(date_string, settings.FILENAME_DATE_ORDER)
-                except (TypeError, ValueError):
-                    # Skip all matches that do not parse to a proper date
-                    continue
-
-                if date is not None and next_year > date.year > 1900:
-                    self.log(
-                        "info",
-                        "Detected document date {} based on string {} "
-                        "from document title"
-                        "".format(date.isoformat(), date_string)
-                    )
-                    return date
-
-        try:
-            # getting text after checking filename will save time if only
-            # looking at the filename instead of the whole text
-            text = self.get_text()
-        except ParseError:
-            return None
-
-        # Iterate through all regex matches in text and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            date_string = m.group(0)
-
-            try:
-                date = __parser(date_string, settings.DATE_ORDER)
-            except (TypeError, ValueError):
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None and next_year > date.year > 1900:
-                break
-            else:
-                date = None
-
-        if date is not None:
-            self.log(
-                "info",
-                "Detected document date {} based on string {}".format(
-                    date.isoformat(),
-                    date_string
-                )
-            )
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
+        return self.date

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))