From 9f55fb668d36ac68fe8e3bb3dd2dd20780d99001 Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Sun, 1 Nov 2020 23:23:42 +0100
Subject: [PATCH] silenced unpaper, optipng for cleaner output

Moved the parser configuration from parser class attributes into settings.
Removed the PAPERLESS_FORGIVING_OCR option (forgiving OCR is now the default behaviour), since Tesseract is plenty accurate even without the correct language being defined.
---
 paperless.conf.example             |  5 --
 src/documents/parsers.py           |  2 +-
 src/paperless/settings.py          | 16 +++----
 src/paperless_tesseract/parsers.py | 74 ++++++++++--------------------
 4 files changed, 33 insertions(+), 64 deletions(-)

diff --git a/paperless.conf.example b/paperless.conf.example
index 2bb24cee4..5c50acf8e 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -212,11 +212,6 @@
 #PAPERLESS_CONSUMER_LOOP_TIME=10
 
 
-# By default Paperless stops consuming a document if no language can be
-# detected. Set to true to consume documents even if the language detection
-# fails.
-#PAPERLESS_FORGIVING_OCR="false"
-
 
 # By default Paperless does not OCR a document if the text can be retrieved from
 # the document directly. Set to true to always OCR documents.
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index c0a80a55d..c5594e56f 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -60,7 +60,7 @@ class DocumentParser:
 
         out_path = os.path.join(self.tempdir, "optipng.png")
 
-        args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
+        args = (self.OPTIPNG, "-silent", "-o5", in_path, "-out", out_path)
         if not subprocess.Popen(args).wait() == 0:
             raise ParseError("Optipng failed at {}".format(args))
 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 9c5f2bd0f..9aa2b98af 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -103,9 +103,6 @@ REST_FRAMEWORK = {
         'rest_framework.authentication.BasicAuthentication',
         'rest_framework.authentication.TokenAuthentication',
         'paperless.auth.QueryTokenAuthentication'
-    ],
-    'DEFAULT_PERMISSION_CLASSES': [
-        'rest_framework.permissions.IsAuthenticated',
     ]
 }
 
@@ -260,15 +257,11 @@ LOGGING = {
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 
 # The amount of threads to use for OCR
-OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
+OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
 
 # OCR all documents?
 OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
 
-# If this is true, any failed attempts to OCR a PDF will result in the PDF
-# being indexed anyway, with whatever we could get.  If it's False, the file
-# will simply be left in the CONSUMPTION_DIR.
-FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
 
 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -277,7 +270,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
 CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
-CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
+CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
 
 # Ghostscript
 GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
@@ -327,3 +320,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 FILENAME_PARSE_TRANSFORMS = []
 for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
     FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
+
+
+CELERY_TASK_TRACK_STARTED = True
+CELERY_RESULT_BACKEND = 'db+sqlite:///results.sqlite'
+CELERY_WORKER_PREFETCH_MULTIPLIER = 1
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index eeca540b1..f05bcb5f5 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -28,14 +28,6 @@ class RasterisedDocumentParser(DocumentParser):
     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
     """
 
-    CONVERT = settings.CONVERT_BINARY
-    GHOSTSCRIPT = settings.GS_BINARY
-    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
-    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
-    UNPAPER = settings.UNPAPER_BINARY
-    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
-    OCR_ALWAYS = settings.OCR_ALWAYS
-
     def __init__(self, path):
         super().__init__(path)
         self._text = None
@@ -50,7 +42,7 @@ class RasterisedDocumentParser(DocumentParser):
         # Run convert to get a decent thumbnail
         try:
             run_convert(
-                self.CONVERT,
+                settings.CONVERT_BINARY,
                 "-density", "300",
                 "-scale", "500x5000>",
                 "-alpha", "remove",
@@ -67,7 +59,7 @@ class RasterisedDocumentParser(DocumentParser):
                 "falling back to Ghostscript."
             )
             gs_out_path = os.path.join(self.tempdir, "gs_out.png")
-            cmd = [self.GHOSTSCRIPT,
+            cmd = [settings.GS_BINARY,
                    "-q",
                    "-sDEVICE=pngalpha",
                    "-o", gs_out_path,
@@ -76,7 +68,7 @@ class RasterisedDocumentParser(DocumentParser):
                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
             # then run convert on the output from gs
             run_convert(
-                self.CONVERT,
+                settings.CONVERT_BINARY,
                 "-density", "300",
                 "-scale", "500x5000>",
                 "-alpha", "remove",
@@ -101,7 +93,7 @@ class RasterisedDocumentParser(DocumentParser):
         if self._text is not None:
             return self._text
 
-        if not self.OCR_ALWAYS and self._is_ocred():
+        if not settings.OCR_ALWAYS and self._is_ocred():
             self.log("info", "Skipping OCR, using Text from PDF")
             self._text = get_text_from_pdf(self.document_path)
             return self._text
@@ -122,8 +114,8 @@ class RasterisedDocumentParser(DocumentParser):
         # Convert PDF to multiple PNMs
         pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
         run_convert(
-            self.CONVERT,
-            "-density", str(self.DENSITY),
+            settings.CONVERT_BINARY,
+            "-density", str(settings.CONVERT_DENSITY),
             "-depth", "8",
             "-type", "grayscale",
             self.document_path, pnm,
@@ -136,8 +128,8 @@ class RasterisedDocumentParser(DocumentParser):
                 pnms.append(os.path.join(self.tempdir, f))
 
         # Run unpaper in parallel on converted images
-        with Pool(processes=self.THREADS) as pool:
-            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
+        with Pool(processes=settings.OCR_THREADS) as pool:
+            pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms))
 
         # Return list of converted images, processed with unpaper
         pnms = []
@@ -162,53 +154,38 @@ class RasterisedDocumentParser(DocumentParser):
         """
 
         if not imgs:
-            raise OCRError("No images found")
+            raise OCRError("Empty document, nothing to do.")
 
         self.log("info", "OCRing the document")
 
         # Since the division gets rounded down by int, this calculation works
         # for every edge-case, i.e. 1
         middle = int(len(imgs) / 2)
-        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
-
+        raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
         guessed_language = self._guess_language(raw_text)
 
         if not guessed_language or guessed_language not in ISO639:
             self.log("warning", "Language detection failed!")
-            if settings.FORGIVING_OCR:
-                self.log(
-                    "warning",
-                    "As FORGIVING_OCR is enabled, we're going to make the "
-                    "best with what we have."
-                )
-                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
-                return raw_text
-            error_msg = ("Language detection failed. Set "
-                         "PAPERLESS_FORGIVING_OCR in config file to continue "
-                         "anyway.")
-            raise OCRError(error_msg)
 
-        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
+            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
+            return raw_text
+
+        if ISO639[guessed_language] == settings.OCR_LANGUAGE:
             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
             return raw_text
 
         try:
             return self._ocr(imgs, ISO639[guessed_language])
         except pyocr.pyocr.tesseract.TesseractError:
-            if settings.FORGIVING_OCR:
-                self.log(
-                    "warning",
-                    "OCR for {} failed, but we're going to stick with what "
-                    "we've got since FORGIVING_OCR is enabled.".format(
-                        guessed_language
-                    )
+            self.log(
+                "warning",
+                "OCR for {} failed, but we're going to stick with what "
+                "we've got since FORGIVING_OCR is enabled.".format(
+                    guessed_language
                 )
-                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
-                return raw_text
-            raise OCRError(
-                "The guessed language ({}) is not available in this instance "
-                "of Tesseract.".format(guessed_language)
             )
+            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
+            return raw_text
 
     def _ocr(self, imgs, lang):
         """
@@ -220,7 +197,7 @@ class RasterisedDocumentParser(DocumentParser):
 
         self.log("info", "Parsing for {}".format(lang))
 
-        with Pool(processes=self.THREADS) as pool:
+        with Pool(processes=settings.OCR_THREADS) as pool:
             r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             r = " ".join(r)
 
@@ -232,13 +209,12 @@ class RasterisedDocumentParser(DocumentParser):
         Given a `middle` value and the text that middle page represents, we OCR
         the remainder of the document and return the whole thing.
         """
-        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
-        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
+        text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
+        text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
         return text
 
 
 def run_convert(*args):
-
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -251,7 +227,7 @@ def run_convert(*args):
 
 def run_unpaper(args):
     unpaper, pnm = args
-    command_args = (unpaper, "--overwrite", pnm,
+    command_args = (unpaper, "--overwrite", "--quiet", pnm,
                     pnm.replace(".pnm", ".unpaper.pnm"))
     if not subprocess.Popen(command_args).wait() == 0:
         raise ParseError("Unpaper failed at {}".format(command_args))