new setting: PAPERLESS_OCR_PAGES

2025-12-22 01:55:49 -06:00 · 2020-11-22 12:54:08 +01:00
parent 0e301e4563
commit ae198f0767
6 changed files with 54 additions and 5 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -26,6 +26,9 @@ next

 * Much better admin for mail rule editing.

+* New setting ``PAPERLESS_OCR_PAGES`` limits the tesseract parser
+  to the first n pages of scanned documents.
+

 paperless-ng 0.9.1
 ##################
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -184,6 +184,16 @@ PAPERLESS_TIME_ZONE=<timezone>



+PAPERLESS_OCR_PAGES=<num>
+    Tells paperless to use only the specified number of pages for OCR. Documents
+    with fewer than the specified number of pages get OCR'ed completely.
+
+    Specifying 1 here will only use the first page.
+
+    Defaults to 0, which disables this feature and always uses all pages.
+
+
+
 PAPERLESS_OCR_LANGUAGE=<lang>
    Customize the default language that tesseract will attempt to use when
    parsing documents. The default language is used whenever
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -358,4 +358,30 @@ management commands as below.
 7.  Start paperless.


+Considerations for less powerful devices
+########################################
+
+Paperless runs on the Raspberry Pi. However, some things are rather slow on the Pi, and
+configuring some options in paperless can help improve performance immensely:
+
+*   Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
+    the first page of your documents.
+*   ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
+    to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
+    paperless will use 2 workers and 2 threads per worker. This may result in
+    sluggish response times during consumption, so you might want to lower these
+    settings (example: 2 workers and 1 thread to always have some computing power
+    left for other tasks).
+*   Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
+    your documents before feeding them into paperless. Some scanners are able to
+    do this!
+*   Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This
+    will still result in rather accurate OCR, but will decrease consumption time
+    by quite a bit.
+*   Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
+    times. Thumbnails will be about 20% larger.
+
+For details, refer to :ref:`configuration`.
+
+
 .. _redis: https://redis.io/
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -35,6 +35,7 @@
 #PAPERLESS_TASK_WORKERS=1
 #PAPERLESS_THREADS_PER_WORKER=1
 #PAPERLESS_TIME_ZONE=UTC
+#PAPERLESS_OCR_PAGES=1
 #PAPERLESS_OCR_LANGUAGE=eng
 #PAPERLESS_OCR_ALWAYS=false
 #PAPERLESS_CONSUMER_POLLING=10
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -322,6 +322,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES

 OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")

+OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
+
 # The default language that tesseract will attempt to use when parsing
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser):
        Greyscale images are easier for Tesseract to OCR
        """

+        # Convert PDF to multiple PNMs
+        input_file = self.document_path
+
+        if settings.OCR_PAGES == 1:
+            input_file += "[0]"
+        elif settings.OCR_PAGES > 1:
+            input_file += f"[0-{settings.OCR_PAGES - 1}]"
+
        self.log(
            "debug",
-            f"Converting document {self.document_path} into greyscale images")
+            f"Converting document {input_file} into greyscale images")

-        # Convert PDF to multiple PNMs
-        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
+        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")

        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
-                    input_file=self.document_path,
-                    output_file=pnm,
+                    input_file=input_file,
+                    output_file=output_files,
                    logging_group=self.logging_group)

        # Get a list of converted images