new setting: PAPERLESS_OCR_PAGES

2026-02-18 00:29:35 -06:00 · 2020-11-22 12:54:08 +01:00
parent ea089de3b3
commit fec9e54049
6 changed files with 54 additions and 5 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser):
        Greyscale images are easier for Tesseract to OCR
        """

+        # Convert PDF to multiple PNMs
+        input_file = self.document_path
+
+        if settings.OCR_PAGES == 1:
+            input_file += "[0]"
+        elif settings.OCR_PAGES > 1:
+            input_file += f"[0-{settings.OCR_PAGES - 1}]"
+
        self.log(
            "debug",
-            f"Converting document {self.document_path} into greyscale images")
+            f"Converting document {input_file} into greyscale images")

-        # Convert PDF to multiple PNMs
-        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
+        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")

        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
-                    input_file=self.document_path,
-                    output_file=pnm,
+                    input_file=input_file,
+                    output_file=output_files,
                    logging_group=self.logging_group)

        # Get a list of converted images