Add support for using pre-existing text from PDFs

2026-01-30 23:08:59 -06:00 · 2018-01-30 20:13:35 +00:00
parent 7ad7323cc7
commit 269c32ce6a
7 changed files with 60 additions and 13 deletions
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")

+# Always run OCR, even for PDFs that already contain extractable text?
+OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in {"yes", "y", "1", "t", "true"}
+
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -3,6 +3,7 @@ import os
 import re
 import subprocess
 from multiprocessing.pool import Pool
+import pdftotext

 import langdetect
 import pyocr
@@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
+    OCR_ALWAYS = settings.OCR_ALWAYS

    def get_thumbnail(self):
        """
@@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser):

        return os.path.join(self.tempdir, "convert-0000.png")

+    def _is_ocred(self):
+        """Return True if the PDF already carries a usable text layer."""
+        # Extract text from PDF using pdftotext
+        text = get_text_from_pdf(self.document_path)
+
+        # Heuristic: more than 50 extracted characters means the PDF already
+        # contains text, so no OCR is required.  A failed extraction yields a
+        # falsy value, which is treated as "no text" instead of being passed
+        # to len().
+        return bool(text) and len(text) > 50
+
    def get_text(self):
+        if not self.OCR_ALWAYS and self._is_ocred():
+            self.log("info", "Skipping OCR, using Text from PDF")
+            return get_text_from_pdf(self.document_path)

        images = self._get_greyscale()

@@ -212,3 +228,13 @@ def image_to_string(args):
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
+
+
+def get_text_from_pdf(pdf_file):
+    """Return the text embedded in the PDF, or an empty string on failure."""
+    with open(pdf_file, "rb") as f:
+        try:
+            pdf = pdftotext.PDF(f)
+        except pdftotext.Error:
+            return ""  # str, not False: callers take len() of the result
+    return "\n".join(pdf)