reorganised settings documentation and added OCR_USER_ARGS

2025-12-20 01:45:58 -06:00 · 2020-11-29 12:37:55 +01:00
parent 0e2a2eb53a
commit 388f6cfbe6
4 changed files with 135 additions and 80 deletions
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -350,6 +350,8 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")

 OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")

+OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
+
 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")

--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import subprocess
@@ -118,10 +119,22 @@ class RasterisedDocumentParser(DocumentParser):
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

+        if settings.OCR_USER_ARGS:
+            try:
+                user_args = json.loads(settings.OCR_USER_ARGS)
+                ocr_args = {**ocr_args, **user_args}
+            except Exception as e:
+                self.log(
+                    "warning",
+                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+                    f"they will not be used: {e}")
+
        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
+            self.log("debug",
+                     f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path