Add support for using pre-existing text from PDFs

2026-01-30 23:08:59 -06:00 · 2018-02-02 22:37:58 +01:00
parent bd0b593c4a
commit 87e466c47c
7 changed files with 60 additions and 13 deletions
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")

+# OCR all documents?
+OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true"))
+
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -4,6 +4,7 @@ import re
 import subprocess
 from multiprocessing.pool import Pool
 import dateparser
+import pdftotext

 import langdetect
 import pyocr
@@ -33,6 +34,7 @@ class RasterisedDocumentParser(DocumentParser):
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
+    OCR_ALWAYS = settings.OCR_ALWAYS

    def get_thumbnail(self):
        """
@@ -48,7 +50,21 @@ class RasterisedDocumentParser(DocumentParser):

        return os.path.join(self.tempdir, "convert-0000.png")

+    def _is_ocred(self):
+        # Extract text from PDF using pdftotext
+        text = get_text_from_pdf(self.document_path)
+
+        # We assume, that a PDF with at least 50 characters contains text
+        # (so no OCR required)
+        if len(text) > 50:
+            return True
+
+        return False
+
    def get_text(self):
+        if not self.OCR_ALWAYS and self._is_ocred():
+            self.log("info", "Skipping OCR, using Text from PDF")
+            return get_text_from_pdf(self.document_path)

        images = self._get_greyscale()

@@ -237,3 +253,13 @@ def image_to_string(args):
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
+
+
+def get_text_from_pdf(pdf_file):
+    with open(pdf_file, "rb") as f:
+        try:
+            pdf = pdftotext.PDF(f)
+        except pdftotext.Error:
+            return False
+
+    return "\n".join(pdf)