mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-15 10:13:15 -05:00
Silenced unpaper and optipng for cleaner output
Moved parser settings to settings. Removed the forgiving OCR option (that behavior is now the default), since Tesseract is plenty accurate even without the correct language being defined.
This commit is contained in:
parent
1b5344ddee
commit
9f55fb668d
@ -212,11 +212,6 @@
|
|||||||
#PAPERLESS_CONSUMER_LOOP_TIME=10
|
#PAPERLESS_CONSUMER_LOOP_TIME=10
|
||||||
|
|
||||||
|
|
||||||
# By default Paperless stops consuming a document if no language can be
|
|
||||||
# detected. Set to true to consume documents even if the language detection
|
|
||||||
# fails.
|
|
||||||
#PAPERLESS_FORGIVING_OCR="false"
|
|
||||||
|
|
||||||
|
|
||||||
# By default Paperless does not OCR a document if the text can be retrieved from
|
# By default Paperless does not OCR a document if the text can be retrieved from
|
||||||
# the document directly. Set to true to always OCR documents.
|
# the document directly. Set to true to always OCR documents.
|
||||||
|
@ -60,7 +60,7 @@ class DocumentParser:
|
|||||||
|
|
||||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||||
|
|
||||||
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
|
args = (self.OPTIPNG, "-silent", "-o5", in_path, "-out", out_path)
|
||||||
if not subprocess.Popen(args).wait() == 0:
|
if not subprocess.Popen(args).wait() == 0:
|
||||||
raise ParseError("Optipng failed at {}".format(args))
|
raise ParseError("Optipng failed at {}".format(args))
|
||||||
|
|
||||||
|
@ -103,9 +103,6 @@ REST_FRAMEWORK = {
|
|||||||
'rest_framework.authentication.BasicAuthentication',
|
'rest_framework.authentication.BasicAuthentication',
|
||||||
'rest_framework.authentication.TokenAuthentication',
|
'rest_framework.authentication.TokenAuthentication',
|
||||||
'paperless.auth.QueryTokenAuthentication'
|
'paperless.auth.QueryTokenAuthentication'
|
||||||
],
|
|
||||||
'DEFAULT_PERMISSION_CLASSES': [
|
|
||||||
'rest_framework.permissions.IsAuthenticated',
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -260,15 +257,11 @@ LOGGING = {
|
|||||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||||
|
|
||||||
# The amount of threads to use for OCR
|
# The amount of threads to use for OCR
|
||||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
|
||||||
|
|
||||||
# OCR all documents?
|
# OCR all documents?
|
||||||
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
|
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
|
||||||
|
|
||||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
|
||||||
# being indexed anyway, with whatever we could get. If it's False, the file
|
|
||||||
# will simply be left in the CONSUMPTION_DIR.
|
|
||||||
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
|
|
||||||
|
|
||||||
# GNUPG needs a home directory for some reason
|
# GNUPG needs a home directory for some reason
|
||||||
GNUPG_HOME = os.getenv("HOME", "/tmp")
|
GNUPG_HOME = os.getenv("HOME", "/tmp")
|
||||||
@ -277,7 +270,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
|
|||||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
|
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
|
||||||
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
||||||
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
||||||
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
|
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
|
||||||
|
|
||||||
# Ghostscript
|
# Ghostscript
|
||||||
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
|
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
|
||||||
@ -327,3 +320,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
|||||||
FILENAME_PARSE_TRANSFORMS = []
|
FILENAME_PARSE_TRANSFORMS = []
|
||||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||||
|
|
||||||
|
|
||||||
|
CELERY_TASK_TRACK_STARTED = True
|
||||||
|
CELERY_RESULT_BACKEND = 'db+sqlite:///results.sqlite'
|
||||||
|
CELERY_WORKER_PREFETCH_MULTIPLIER = 1
|
||||||
|
@ -28,14 +28,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
CONVERT = settings.CONVERT_BINARY
|
|
||||||
GHOSTSCRIPT = settings.GS_BINARY
|
|
||||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
|
||||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
super().__init__(path)
|
super().__init__(path)
|
||||||
self._text = None
|
self._text = None
|
||||||
@ -50,7 +42,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
# Run convert to get a decent thumbnail
|
# Run convert to get a decent thumbnail
|
||||||
try:
|
try:
|
||||||
run_convert(
|
run_convert(
|
||||||
self.CONVERT,
|
settings.CONVERT_BINARY,
|
||||||
"-density", "300",
|
"-density", "300",
|
||||||
"-scale", "500x5000>",
|
"-scale", "500x5000>",
|
||||||
"-alpha", "remove",
|
"-alpha", "remove",
|
||||||
@ -67,7 +59,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"falling back to Ghostscript."
|
"falling back to Ghostscript."
|
||||||
)
|
)
|
||||||
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
|
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
|
||||||
cmd = [self.GHOSTSCRIPT,
|
cmd = [settings.GS_BINARY,
|
||||||
"-q",
|
"-q",
|
||||||
"-sDEVICE=pngalpha",
|
"-sDEVICE=pngalpha",
|
||||||
"-o", gs_out_path,
|
"-o", gs_out_path,
|
||||||
@ -76,7 +68,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
||||||
# then run convert on the output from gs
|
# then run convert on the output from gs
|
||||||
run_convert(
|
run_convert(
|
||||||
self.CONVERT,
|
settings.CONVERT_BINARY,
|
||||||
"-density", "300",
|
"-density", "300",
|
||||||
"-scale", "500x5000>",
|
"-scale", "500x5000>",
|
||||||
"-alpha", "remove",
|
"-alpha", "remove",
|
||||||
@ -101,7 +93,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if self._text is not None:
|
if self._text is not None:
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
if not self.OCR_ALWAYS and self._is_ocred():
|
if not settings.OCR_ALWAYS and self._is_ocred():
|
||||||
self.log("info", "Skipping OCR, using Text from PDF")
|
self.log("info", "Skipping OCR, using Text from PDF")
|
||||||
self._text = get_text_from_pdf(self.document_path)
|
self._text = get_text_from_pdf(self.document_path)
|
||||||
return self._text
|
return self._text
|
||||||
@ -122,8 +114,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
# Convert PDF to multiple PNMs
|
# Convert PDF to multiple PNMs
|
||||||
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
||||||
run_convert(
|
run_convert(
|
||||||
self.CONVERT,
|
settings.CONVERT_BINARY,
|
||||||
"-density", str(self.DENSITY),
|
"-density", str(settings.CONVERT_DENSITY),
|
||||||
"-depth", "8",
|
"-depth", "8",
|
||||||
"-type", "grayscale",
|
"-type", "grayscale",
|
||||||
self.document_path, pnm,
|
self.document_path, pnm,
|
||||||
@ -136,8 +128,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
pnms.append(os.path.join(self.tempdir, f))
|
pnms.append(os.path.join(self.tempdir, f))
|
||||||
|
|
||||||
# Run unpaper in parallel on converted images
|
# Run unpaper in parallel on converted images
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=settings.OCR_THREADS) as pool:
|
||||||
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms))
|
||||||
|
|
||||||
# Return list of converted images, processed with unpaper
|
# Return list of converted images, processed with unpaper
|
||||||
pnms = []
|
pnms = []
|
||||||
@ -162,40 +154,29 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if not imgs:
|
if not imgs:
|
||||||
raise OCRError("No images found")
|
raise OCRError("Empty document, nothing to do.")
|
||||||
|
|
||||||
self.log("info", "OCRing the document")
|
self.log("info", "OCRing the document")
|
||||||
|
|
||||||
# Since the division gets rounded down by int, this calculation works
|
# Since the division gets rounded down by int, this calculation works
|
||||||
# for every edge-case, i.e. 1
|
# for every edge-case, i.e. 1
|
||||||
middle = int(len(imgs) / 2)
|
middle = int(len(imgs) / 2)
|
||||||
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
|
||||||
|
|
||||||
guessed_language = self._guess_language(raw_text)
|
guessed_language = self._guess_language(raw_text)
|
||||||
|
|
||||||
if not guessed_language or guessed_language not in ISO639:
|
if not guessed_language or guessed_language not in ISO639:
|
||||||
self.log("warning", "Language detection failed!")
|
self.log("warning", "Language detection failed!")
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self.log(
|
|
||||||
"warning",
|
|
||||||
"As FORGIVING_OCR is enabled, we're going to make the "
|
|
||||||
"best with what we have."
|
|
||||||
)
|
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
error_msg = ("Language detection failed. Set "
|
|
||||||
"PAPERLESS_FORGIVING_OCR in config file to continue "
|
|
||||||
"anyway.")
|
|
||||||
raise OCRError(error_msg)
|
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
if ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self._ocr(imgs, ISO639[guessed_language])
|
return self._ocr(imgs, ISO639[guessed_language])
|
||||||
except pyocr.pyocr.tesseract.TesseractError:
|
except pyocr.pyocr.tesseract.TesseractError:
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self.log(
|
self.log(
|
||||||
"warning",
|
"warning",
|
||||||
"OCR for {} failed, but we're going to stick with what "
|
"OCR for {} failed, but we're going to stick with what "
|
||||||
@ -205,10 +186,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
)
|
)
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||||
return raw_text
|
return raw_text
|
||||||
raise OCRError(
|
|
||||||
"The guessed language ({}) is not available in this instance "
|
|
||||||
"of Tesseract.".format(guessed_language)
|
|
||||||
)
|
|
||||||
|
|
||||||
def _ocr(self, imgs, lang):
|
def _ocr(self, imgs, lang):
|
||||||
"""
|
"""
|
||||||
@ -220,7 +197,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
self.log("info", "Parsing for {}".format(lang))
|
self.log("info", "Parsing for {}".format(lang))
|
||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=settings.OCR_THREADS) as pool:
|
||||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||||
r = " ".join(r)
|
r = " ".join(r)
|
||||||
|
|
||||||
@ -232,13 +209,12 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
Given a `middle` value and the text that middle page represents, we OCR
|
Given a `middle` value and the text that middle page represents, we OCR
|
||||||
the remainder of the document and return the whole thing.
|
the remainder of the document and return the whole thing.
|
||||||
"""
|
"""
|
||||||
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
|
||||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def run_convert(*args):
|
def run_convert(*args):
|
||||||
|
|
||||||
environment = os.environ.copy()
|
environment = os.environ.copy()
|
||||||
if settings.CONVERT_MEMORY_LIMIT:
|
if settings.CONVERT_MEMORY_LIMIT:
|
||||||
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
||||||
@ -251,7 +227,7 @@ def run_convert(*args):
|
|||||||
|
|
||||||
def run_unpaper(args):
|
def run_unpaper(args):
|
||||||
unpaper, pnm = args
|
unpaper, pnm = args
|
||||||
command_args = (unpaper, "--overwrite", pnm,
|
command_args = (unpaper, "--overwrite", "--quiet", pnm,
|
||||||
pnm.replace(".pnm", ".unpaper.pnm"))
|
pnm.replace(".pnm", ".unpaper.pnm"))
|
||||||
if not subprocess.Popen(command_args).wait() == 0:
|
if not subprocess.Popen(command_args).wait() == 0:
|
||||||
raise ParseError("Unpaper failed at {}".format(command_args))
|
raise ParseError("Unpaper failed at {}".format(command_args))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user