Add support for using pre-existing text from PDFs

This commit is contained in:
Wolf-Bastian Pöttner
2018-02-02 22:37:58 +01:00
parent bd0b593c4a
commit 87e466c47c
7 changed files with 60 additions and 13 deletions

View File

@@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# OCR all documents?
OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true"))
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.

View File

@@ -4,6 +4,7 @@ import re
import subprocess
from multiprocessing.pool import Pool
import dateparser
import pdftotext
import langdetect
import pyocr
@@ -33,6 +34,7 @@ class RasterisedDocumentParser(DocumentParser):
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def get_thumbnail(self):
"""
@@ -48,7 +50,21 @@ class RasterisedDocumentParser(DocumentParser):
return os.path.join(self.tempdir, "convert-0000.png")
def _is_ocred(self):
# Extract text from PDF using pdftotext
text = get_text_from_pdf(self.document_path)
# We assume, that a PDF with at least 50 characters contains text
# (so no OCR required)
if len(text) > 50:
return True
return False
def get_text(self):
if not self.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
return get_text_from_pdf(self.document_path)
images = self._get_greyscale()
@@ -237,3 +253,13 @@ def image_to_string(args):
except (TesseractError, OtherTesseractError):
pass
return ocr.image_to_string(f, lang=lang)
def get_text_from_pdf(pdf_file):
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)
except pdftotext.Error:
return False
return "\n".join(pdf)