Silenced unpaper and optipng for cleaner output

Moved the parser configuration into the settings module (parsers now read settings.* directly instead of class attributes).
Removed the forgiving OCR option (its behaviour is now the default), since Tesseract is plenty accurate even when the correct language has not been defined.
This commit is contained in:
Jonas Winkler 2020-11-01 23:23:42 +01:00
parent 1b5344ddee
commit 9f55fb668d
4 changed files with 33 additions and 64 deletions

View File

@ -212,11 +212,6 @@
#PAPERLESS_CONSUMER_LOOP_TIME=10 #PAPERLESS_CONSUMER_LOOP_TIME=10
# By default Paperless stops consuming a document if no language can be
# detected. Set to true to consume documents even if the language detection
# fails.
#PAPERLESS_FORGIVING_OCR="false"
# By default Paperless does not OCR a document if the text can be retrieved from # By default Paperless does not OCR a document if the text can be retrieved from
# the document directly. Set to true to always OCR documents. # the document directly. Set to true to always OCR documents.

View File

@ -60,7 +60,7 @@ class DocumentParser:
out_path = os.path.join(self.tempdir, "optipng.png") out_path = os.path.join(self.tempdir, "optipng.png")
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path) args = (self.OPTIPNG, "-silent", "-o5", in_path, "-out", out_path)
if not subprocess.Popen(args).wait() == 0: if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args)) raise ParseError("Optipng failed at {}".format(args))

View File

@ -103,9 +103,6 @@ REST_FRAMEWORK = {
'rest_framework.authentication.BasicAuthentication', 'rest_framework.authentication.BasicAuthentication',
'rest_framework.authentication.TokenAuthentication', 'rest_framework.authentication.TokenAuthentication',
'paperless.auth.QueryTokenAuthentication' 'paperless.auth.QueryTokenAuthentication'
],
'DEFAULT_PERMISSION_CLASSES': [
'rest_framework.permissions.IsAuthenticated',
] ]
} }
@ -260,15 +257,11 @@ LOGGING = {
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR # The amount of threads to use for OCR
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
# OCR all documents? # OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS") OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
# GNUPG needs a home directory for some reason # GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp") GNUPG_HOME = os.getenv("HOME", "/tmp")
@ -277,7 +270,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
# Ghostscript # Ghostscript
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
@ -327,3 +320,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
FILENAME_PARSE_TRANSFORMS = [] FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
CELERY_TASK_TRACK_STARTED = True
CELERY_RESULT_BACKEND = 'db+sqlite:///results.sqlite'
CELERY_WORKER_PREFETCH_MULTIPLIER = 1

View File

@ -28,14 +28,6 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
""" """
CONVERT = settings.CONVERT_BINARY
GHOSTSCRIPT = settings.GS_BINARY
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path): def __init__(self, path):
super().__init__(path) super().__init__(path)
self._text = None self._text = None
@ -50,7 +42,7 @@ class RasterisedDocumentParser(DocumentParser):
# Run convert to get a decent thumbnail # Run convert to get a decent thumbnail
try: try:
run_convert( run_convert(
self.CONVERT, settings.CONVERT_BINARY,
"-density", "300", "-density", "300",
"-scale", "500x5000>", "-scale", "500x5000>",
"-alpha", "remove", "-alpha", "remove",
@ -67,7 +59,7 @@ class RasterisedDocumentParser(DocumentParser):
"falling back to Ghostscript." "falling back to Ghostscript."
) )
gs_out_path = os.path.join(self.tempdir, "gs_out.png") gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [self.GHOSTSCRIPT, cmd = [settings.GS_BINARY,
"-q", "-q",
"-sDEVICE=pngalpha", "-sDEVICE=pngalpha",
"-o", gs_out_path, "-o", gs_out_path,
@ -76,7 +68,7 @@ class RasterisedDocumentParser(DocumentParser):
raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs # then run convert on the output from gs
run_convert( run_convert(
self.CONVERT, settings.CONVERT_BINARY,
"-density", "300", "-density", "300",
"-scale", "500x5000>", "-scale", "500x5000>",
"-alpha", "remove", "-alpha", "remove",
@ -101,7 +93,7 @@ class RasterisedDocumentParser(DocumentParser):
if self._text is not None: if self._text is not None:
return self._text return self._text
if not self.OCR_ALWAYS and self._is_ocred(): if not settings.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF") self.log("info", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path) self._text = get_text_from_pdf(self.document_path)
return self._text return self._text
@ -122,8 +114,8 @@ class RasterisedDocumentParser(DocumentParser):
# Convert PDF to multiple PNMs # Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm") pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert( run_convert(
self.CONVERT, settings.CONVERT_BINARY,
"-density", str(self.DENSITY), "-density", str(settings.CONVERT_DENSITY),
"-depth", "8", "-depth", "8",
"-type", "grayscale", "-type", "grayscale",
self.document_path, pnm, self.document_path, pnm,
@ -136,8 +128,8 @@ class RasterisedDocumentParser(DocumentParser):
pnms.append(os.path.join(self.tempdir, f)) pnms.append(os.path.join(self.tempdir, f))
# Run unpaper in parallel on converted images # Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool: with Pool(processes=settings.OCR_THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms))
# Return list of converted images, processed with unpaper # Return list of converted images, processed with unpaper
pnms = [] pnms = []
@ -162,40 +154,29 @@ class RasterisedDocumentParser(DocumentParser):
""" """
if not imgs: if not imgs:
raise OCRError("No images found") raise OCRError("Empty document, nothing to do.")
self.log("info", "OCRing the document") self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works # Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1 # for every edge-case, i.e. 1
middle = int(len(imgs) / 2) middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text) guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639: if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed!") self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have."
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
error_msg = ("Language detection failed. Set "
"PAPERLESS_FORGIVING_OCR in config file to continue "
"anyway.")
raise OCRError(error_msg)
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: if ISO639[guessed_language] == settings.OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
try: try:
return self._ocr(imgs, ISO639[guessed_language]) return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError: except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self.log( self.log(
"warning", "warning",
"OCR for {} failed, but we're going to stick with what " "OCR for {} failed, but we're going to stick with what "
@ -205,10 +186,6 @@ class RasterisedDocumentParser(DocumentParser):
) )
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text return raw_text
raise OCRError(
"The guessed language ({}) is not available in this instance "
"of Tesseract.".format(guessed_language)
)
def _ocr(self, imgs, lang): def _ocr(self, imgs, lang):
""" """
@ -220,7 +197,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("info", "Parsing for {}".format(lang)) self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool: with Pool(processes=settings.OCR_THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang])) r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r) r = " ".join(r)
@ -232,13 +209,12 @@ class RasterisedDocumentParser(DocumentParser):
Given a `middle` value and the text that middle page represents, we OCR Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing. the remainder of the document and return the whole thing.
""" """
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
return text return text
def run_convert(*args): def run_convert(*args):
environment = os.environ.copy() environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT: if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@ -251,7 +227,7 @@ def run_convert(*args):
def run_unpaper(args): def run_unpaper(args):
unpaper, pnm = args unpaper, pnm = args
command_args = (unpaper, "--overwrite", pnm, command_args = (unpaper, "--overwrite", "--quiet", pnm,
pnm.replace(".pnm", ".unpaper.pnm")) pnm.replace(".pnm", ".unpaper.pnm"))
if not subprocess.Popen(command_args).wait() == 0: if not subprocess.Popen(command_args).wait() == 0:
raise ParseError("Unpaper failed at {}".format(command_args)) raise ParseError("Unpaper failed at {}".format(command_args))