Reworked most of the Tesseract parser; improved logging

This commit is contained in:
Jonas Winkler 2020-11-02 15:40:44 +01:00
parent a3c71ddad4
commit d15405ef56
8 changed files with 90 additions and 123 deletions

View File

@ -89,11 +89,13 @@ class Consumer:
if self._is_duplicate(doc): if self._is_duplicate(doc):
self.log( self.log(
"info", "warning",
"Skipping {} as it appears to be a duplicate".format(doc) "Skipping {} as it appears to be a duplicate".format(doc)
) )
return False return False
self.log("info", "Consuming {}".format(doc))
parser_class = self._get_parser_class(doc) parser_class = self._get_parser_class(doc)
if not parser_class: if not parser_class:
self.log( self.log(
@ -102,7 +104,6 @@ class Consumer:
self.logging_group = uuid.uuid4() self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
document_consumption_started.send( document_consumption_started.send(
sender=self.__class__, sender=self.__class__,
@ -110,23 +111,23 @@ class Consumer:
logging_group=self.logging_group logging_group=self.logging_group
) )
parsed_document = parser_class(doc) document_parser = parser_class(doc, self.logging_group)
try: try:
thumbnail = parsed_document.get_optimised_thumbnail() thumbnail = document_parser.get_optimised_thumbnail()
date = parsed_document.get_date() date = document_parser.get_date()
document = self._store( document = self._store(
parsed_document.get_text(), document_parser.get_text(),
doc, doc,
thumbnail, thumbnail,
date date
) )
except ParseError as e: except ParseError as e:
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup() document_parser.cleanup()
return False return False
else: else:
parsed_document.cleanup() document_parser.cleanup()
self._cleanup_doc(doc) self._cleanup_doc(doc)
self.log( self.log(
@ -140,9 +141,10 @@ class Consumer:
self.classifier.reload() self.classifier.reload()
classifier = self.classifier classifier = self.classifier
except FileNotFoundError: except FileNotFoundError:
logging.getLogger(__name__).warning("Cannot classify documents, " self.log("warning", "Cannot classify documents, classifier "
"classifier model file was not " "model file was not found. Consider "
"found.") "running python manage.py "
"document_create_classifier.")
document_consumption_finished.send( document_consumption_finished.send(
sender=self.__class__, sender=self.__class__,
@ -211,7 +213,7 @@ class Consumer:
document.save() document.save()
self.log("info", "Completed") self.log("debug", "Completed")
return document return document

View File

@ -2,15 +2,7 @@ import logging
class PaperlessLogger(logging.StreamHandler): class PaperlessLogger(logging.StreamHandler):
"""
A logger smart enough to know to log some kinds of messages to the database
for later retrieval in a pretty interface.
"""
def emit(self, record): def emit(self, record):
logging.StreamHandler.emit(self, record)
# We have to do the import here or Django will barf when it tries to # We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point # load this because the apps aren't loaded at that point
from .models import Log from .models import Log

View File

@ -3,7 +3,6 @@
import logging import logging
import os import os
import re import re
import uuid
from collections import OrderedDict from collections import OrderedDict
import dateutil.parser import dateutil.parser

View File

@ -39,11 +39,11 @@ class DocumentParser:
`paperless_tesseract.parsers` for inspiration. `paperless_tesseract.parsers` for inspiration.
""" """
def __init__(self, path): def __init__(self, path, logging_group):
self.document_path = path self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self.logging_group = None self.logging_group = logging_group
def get_thumbnail(self): def get_thumbnail(self):
""" """

View File

@ -56,6 +56,7 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
'Assigning correspondent "{}" to "{}" '.format(selected, document), 'Assigning correspondent "{}" to "{}" '.format(selected, document),
logging_group logging_group
) )
# TODO: during consumption, this saves even though no updates have been made
document.correspondent = selected document.correspondent = selected
document.save(update_fields=("correspondent",)) document.save(update_fields=("correspondent",))

View File

@ -239,14 +239,14 @@ LOGGING = {
"version": 1, "version": 1,
"disable_existing_loggers": False, "disable_existing_loggers": False,
"handlers": { "handlers": {
"consumer": { "dblogger": {
"class": "documents.loggers.PaperlessLogger", "class": "documents.loggers.PaperlessLogger",
} }
}, },
"loggers": { "loggers": {
"documents": { "documents": {
"handlers": ["consumer"], "handlers": ["dblogger"],
"level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), "level": "DEBUG"
}, },
}, },
} }
@ -260,7 +260,7 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4)) OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
# OCR all documents? # OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS") OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", False)
# GNUPG needs a home directory for some reason # GNUPG needs a home directory for some reason

View File

@ -8,9 +8,7 @@ import langdetect
import pyocr import pyocr
from django.conf import settings from django.conf import settings
from PIL import Image from PIL import Image
from pyocr.libtesseract.tesseract_raw import \ from pyocr import PyocrException
TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError
import pdftotext import pdftotext
from documents.parsers import DocumentParser, ParseError from documents.parsers import DocumentParser, ParseError
@ -28,8 +26,8 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
""" """
def __init__(self, path): def __init__(self, path, logging_group):
super().__init__(path) super().__init__(path, logging_group)
self._text = None self._text = None
def get_thumbnail(self): def get_thumbnail(self):
@ -53,11 +51,7 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError: except ParseError:
# if convert fails, fall back to extracting # if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript # the first PDF page as a PNG using Ghostscript
self.log( self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
"warning",
"Thumbnail generation with ImageMagick failed, "
"falling back to Ghostscript."
)
gs_out_path = os.path.join(self.tempdir, "gs_out.png") gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [settings.GS_BINARY, cmd = [settings.GS_BINARY,
"-q", "-q",
@ -100,9 +94,33 @@ class RasterisedDocumentParser(DocumentParser):
images = self._get_greyscale() images = self._get_greyscale()
if not images:
raise ParseError("Empty document, nothing to do.")
try: try:
self._text = self._get_ocr(images)
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed.")
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log("info", "Detected language: {} (default language)".format(guessed_language))
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
self.log("warning","Detected language {} is not available on this system.".format(guessed_language))
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
else:
self.log("info","Detected language: {}".format(guessed_language))
ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("info", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text return self._text
except OCRError as e: except OCRError as e:
raise ParseError(e) raise ParseError(e)
@ -111,6 +129,8 @@ class RasterisedDocumentParser(DocumentParser):
Greyscale images are easier for Tesseract to OCR Greyscale images are easier for Tesseract to OCR
""" """
self.log("info", "Converting document {} into greyscale images...".format(self.document_path))
# Convert PDF to multiple PNMs # Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm") pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert( run_convert(
@ -127,91 +147,43 @@ class RasterisedDocumentParser(DocumentParser):
if f.endswith(".pnm"): if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f)) pnms.append(os.path.join(self.tempdir, f))
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images # Run unpaper in parallel on converted images
with Pool(processes=settings.OCR_THREADS) as pool: with Pool(processes=settings.OCR_THREADS) as pool:
pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms)) pnms = pool.map(run_unpaper, pnms)
# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(self.tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(self.tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pnms)) return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _guess_language(self, text): def _guess_language(self, text):
try: try:
guess = langdetect.detect(text) guess = langdetect.detect(text)
self.log("debug", "Language detected: {}".format(guess))
return guess return guess
except Exception as e: except Exception as e:
self.log("warning", "Language detection error: {}".format(e)) return None
def _get_ocr(self, imgs):
"""
Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error.
"""
if not imgs:
raise OCRError("Empty document, nothing to do.")
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed!")
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
if ISO639[guessed_language] == settings.OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
try:
return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
)
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
def _ocr(self, imgs, lang): def _ocr(self, imgs, lang):
""" self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
Performs a single OCR attempt.
"""
if not imgs:
return ""
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=settings.OCR_THREADS) as pool: with Pool(processes=settings.OCR_THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang])) r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r) return r
# Strip out excess white space to allow matching to go smoother def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
return strip_excess_whitespace(r)
def _assemble_ocr_sections(self, imgs, middle, text):
""" """
Given a `middle` value and the text that middle page represents, we OCR Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing. the remainder of the document and return the whole thing.
""" """
text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
return text images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
self.log('info', 'Continuing ocr with default language.')
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages
else:
return [sample_page]
def run_convert(*args): def run_convert(*args):
@ -225,13 +197,16 @@ def run_convert(*args):
raise ParseError("Convert failed at {}".format(args)) raise ParseError("Convert failed at {}".format(args))
def run_unpaper(args): def run_unpaper(pnm):
unpaper, pnm = args pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
command_args = (unpaper, "--overwrite", "--quiet", pnm,
pnm.replace(".pnm", ".unpaper.pnm")) command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
pnm_out)
if not subprocess.Popen(command_args).wait() == 0: if not subprocess.Popen(command_args).wait() == 0:
raise ParseError("Unpaper failed at {}".format(command_args)) raise ParseError("Unpaper failed at {}".format(command_args))
return pnm_out
def strip_excess_whitespace(text): def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
@ -245,14 +220,18 @@ def strip_excess_whitespace(text):
def image_to_string(args): def image_to_string(args):
img, lang = args img, lang = args
ocr = pyocr.get_available_tools()[0] ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(settings.SCRATCH_DIR, img)) as f: with Image.open(img) as f:
if ocr.can_detect_orientation(): if ocr.can_detect_orientation():
try: try:
orientation = ocr.detect_orientation(f, lang=lang) orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1) f = f.rotate(orientation["angle"], expand=1)
except (TesseractError, OtherTesseractError, AttributeError): except Exception:
# Rotation not possible, ignore
pass pass
try:
return ocr.image_to_string(f, lang=lang) return ocr.image_to_string(f, lang=lang)
except PyocrException as e:
raise OCRError(e)
def get_text_from_pdf(pdf_file): def get_text_from_pdf(pdf_file):

View File

@ -11,14 +11,8 @@ class TextDocumentParser(DocumentParser):
This parser directly parses a text document (.txt, .md, or .csv) This parser directly parses a text document (.txt, .md, or .csv)
""" """
CONVERT = settings.CONVERT_BINARY def __init__(self, path, logging_group):
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None super().__init__(path, logging_group)
UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path):
super().__init__(path)
self._text = None self._text = None
def get_thumbnail(self): def get_thumbnail(self):
@ -44,7 +38,7 @@ class TextDocumentParser(DocumentParser):
r = str(round(psize[0] / 10)) r = str(round(psize[0] / 10))
rounded = ",".join([r, r]) rounded = ",".join([r, r])
run_command( run_command(
self.CONVERT, settings.CONVERT_BINARY,
"-size ", picsize, "-size ", picsize,
' xc:none -draw ', ' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501 '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501
@ -59,7 +53,7 @@ class TextDocumentParser(DocumentParser):
def create_txlayer(): def create_txlayer():
run_command( run_command(
self.CONVERT, settings.CONVERT_BINARY,
"-background none", "-background none",
"-fill", "-fill",
text_color, text_color,
@ -73,7 +67,7 @@ class TextDocumentParser(DocumentParser):
create_txlayer() create_txlayer()
create_bg() create_bg()
run_command( run_command(
self.CONVERT, settings.CONVERT_BINARY,
temp_bg, temp_bg,
temp_txlayer, temp_txlayer,
"-background None -layers merge ", "-background None -layers merge ",