silenced unpaper, optipng for cleaner output

moved parser settings to settings
removed forgiving ocr (now default) since tesseract is plenty accurate even without defining the correct language.
This commit is contained in:
Jonas Winkler 2020-11-01 23:23:42 +01:00
parent 1b5344ddee
commit 9f55fb668d
4 changed files with 33 additions and 64 deletions

View File

@ -212,11 +212,6 @@
#PAPERLESS_CONSUMER_LOOP_TIME=10
# By default Paperless stops consuming a document if no language can be
# detected. Set to true to consume documents even if the language detection
# fails.
#PAPERLESS_FORGIVING_OCR="false"
# By default Paperless does not OCR a document if the text can be retrieved from
# the document directly. Set to true to always OCR documents.

View File

@ -60,7 +60,7 @@ class DocumentParser:
out_path = os.path.join(self.tempdir, "optipng.png")
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
args = (self.OPTIPNG, "-silent", "-o5", in_path, "-out", out_path)
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))

View File

@ -103,9 +103,6 @@ REST_FRAMEWORK = {
'rest_framework.authentication.BasicAuthentication',
'rest_framework.authentication.TokenAuthentication',
'paperless.auth.QueryTokenAuthentication'
],
'DEFAULT_PERMISSION_CLASSES': [
'rest_framework.permissions.IsAuthenticated',
]
}
@ -260,15 +257,11 @@ LOGGING = {
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@ -277,7 +270,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
# Ghostscript
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
@ -327,3 +320,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
CELERY_TASK_TRACK_STARTED = True
CELERY_RESULT_BACKEND = 'db+sqlite:///results.sqlite'
CELERY_WORKER_PREFETCH_MULTIPLIER = 1

View File

@ -28,14 +28,6 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
CONVERT = settings.CONVERT_BINARY
GHOSTSCRIPT = settings.GS_BINARY
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path):
super().__init__(path)
self._text = None
@ -50,7 +42,7 @@ class RasterisedDocumentParser(DocumentParser):
# Run convert to get a decent thumbnail
try:
run_convert(
self.CONVERT,
settings.CONVERT_BINARY,
"-density", "300",
"-scale", "500x5000>",
"-alpha", "remove",
@ -67,7 +59,7 @@ class RasterisedDocumentParser(DocumentParser):
"falling back to Ghostscript."
)
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [self.GHOSTSCRIPT,
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
@ -76,7 +68,7 @@ class RasterisedDocumentParser(DocumentParser):
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(
self.CONVERT,
settings.CONVERT_BINARY,
"-density", "300",
"-scale", "500x5000>",
"-alpha", "remove",
@ -101,7 +93,7 @@ class RasterisedDocumentParser(DocumentParser):
if self._text is not None:
return self._text
if not self.OCR_ALWAYS and self._is_ocred():
if not settings.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path)
return self._text
@ -122,8 +114,8 @@ class RasterisedDocumentParser(DocumentParser):
# Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert(
self.CONVERT,
"-density", str(self.DENSITY),
settings.CONVERT_BINARY,
"-density", str(settings.CONVERT_DENSITY),
"-depth", "8",
"-type", "grayscale",
self.document_path, pnm,
@ -136,8 +128,8 @@ class RasterisedDocumentParser(DocumentParser):
pnms.append(os.path.join(self.tempdir, f))
# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
with Pool(processes=settings.OCR_THREADS) as pool:
pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms))
# Return list of converted images, processed with unpaper
pnms = []
@ -162,53 +154,38 @@ class RasterisedDocumentParser(DocumentParser):
"""
if not imgs:
raise OCRError("No images found")
raise OCRError("Empty document, nothing to do.")
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have."
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
error_msg = ("Language detection failed. Set "
"PAPERLESS_FORGIVING_OCR in config file to continue "
"anyway.")
raise OCRError(error_msg)
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
if ISO639[guessed_language] == settings.OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
try:
return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
)
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError(
"The guessed language ({}) is not available in this instance "
"of Tesseract.".format(guessed_language)
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
def _ocr(self, imgs, lang):
"""
@ -220,7 +197,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool:
with Pool(processes=settings.OCR_THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
@ -232,13 +209,12 @@ class RasterisedDocumentParser(DocumentParser):
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
"""
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
return text
def run_convert(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@ -251,7 +227,7 @@ def run_convert(*args):
def run_unpaper(args):
unpaper, pnm = args
command_args = (unpaper, "--overwrite", pnm,
command_args = (unpaper, "--overwrite", "--quiet", pnm,
pnm.replace(".pnm", ".unpaper.pnm"))
if not subprocess.Popen(command_args).wait() == 0:
raise ParseError("Unpaper failed at {}".format(command_args))