reworked the interface of the parsers.

This commit is contained in:
Jonas Winkler 2020-11-25 19:36:18 +01:00
parent d3c13f6c93
commit df801d17e1
4 changed files with 101 additions and 146 deletions

View File

@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
from .signals import ( from .signals import (
document_consumption_finished, document_consumption_finished,
document_consumption_started document_consumption_started
@ -121,7 +121,7 @@ class Consumer(LoggingMixin):
# This doesn't parse the document yet, but gives us a parser. # This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.path, self.logging_group) document_parser = parser_class(self.logging_group)
# However, this already created working directories which we have to # However, this already created working directories which we have to
# clean up. # clean up.
@ -129,12 +129,18 @@ class Consumer(LoggingMixin):
# Parse the document. This may take some time. # Parse the document. This may take some time.
try: try:
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail()
self.log("debug", "Parsing {}...".format(self.filename)) self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type)
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type)
text = document_parser.get_text() text = document_parser.get_text()
date = document_parser.get_date() date = document_parser.get_date()
if not date:
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path() archive_path = document_parser.get_archive_path()
except ParseError as e: except ParseError as e:
document_parser.cleanup() document_parser.cleanup()
raise ConsumerError(e) raise ConsumerError(e)

View File

@ -107,59 +107,7 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args)) raise ParseError("Convert failed at {}".format(args))
class ParseError(Exception): def parse_date(filename, text):
pass
class DocumentParser(LoggingMixin):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group):
    """
    Set up a parser for the document at ``path``.

    Stores the document path and the logging group on the instance and
    creates a scratch directory (prefix ``paperless-``) inside
    ``settings.SCRATCH_DIR``; its location is kept in ``self.tempdir``.
    """
    super().__init__()
    self.logging_group = logging_group
    self.document_path = path
    scratch_root = settings.SCRATCH_DIR
    self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)
def get_archive_path(self):
    """
    Return the path of an archive version of the document.

    This implementation has none to offer and always returns ``None``.
    """
    no_archive = None
    return no_archive
def get_thumbnail(self):
    """
    Return the path to a file that can serve as the thumbnail for this
    document.

    Not implemented here; this version always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
def optimise_thumbnail(self, in_path):
    """
    Run optipng over the thumbnail at ``in_path`` when enabled.

    If ``settings.OPTIMIZE_THUMBNAILS`` is falsy, ``in_path`` is returned
    untouched. Otherwise ``settings.OPTIPNG_BINARY`` is executed and the
    result is written to ``optipng.png`` inside ``self.tempdir``; that
    path is returned.

    Raises ``ParseError`` when the optipng process exits with a non-zero
    status.
    """
    if not settings.OPTIMIZE_THUMBNAILS:
        return in_path

    out_path = os.path.join(self.tempdir, "optipng.png")
    args = (
        settings.OPTIPNG_BINARY,
        "-silent",
        "-o5",
        in_path,
        "-out",
        out_path,
    )

    self.log('debug', f"Execute: {' '.join(args)}")

    exit_code = subprocess.Popen(args).wait()
    if exit_code != 0:
        raise ParseError("Optipng failed at {}".format(args))

    return out_path
def get_optimised_thumbnail(self):
    """
    Produce the thumbnail via ``get_thumbnail()`` and hand it to
    ``optimise_thumbnail()``, returning whatever the latter returns.
    """
    raw_thumbnail = self.get_thumbnail()
    return self.optimise_thumbnail(raw_thumbnail)
def get_text(self):
    """
    Return the text of the document, and nothing but the text.

    Not implemented here; this version always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
def get_date(self):
""" """
Returns the date of the document. Returns the date of the document.
""" """
@ -179,15 +127,12 @@ class DocumentParser(LoggingMixin):
) )
date = None date = None
date_string = None
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
title = os.path.basename(self.document_path)
# if filename date parsing is enabled, search there first: # if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER: if settings.FILENAME_DATE_ORDER:
self.log("info", "Checking document title for date") for m in re.finditer(DATE_REGEX, filename):
for m in re.finditer(DATE_REGEX, title):
date_string = m.group(0) date_string = m.group(0)
try: try:
@ -197,21 +142,8 @@ class DocumentParser(LoggingMixin):
continue continue
if date is not None and next_year > date.year > 1900: if date is not None and next_year > date.year > 1900:
self.log(
"info",
"Detected document date {} based on string {} "
"from document title"
"".format(date.isoformat(), date_string)
)
return date return date
try:
# getting text after checking filename will save time if only
# looking at the filename instead of the whole text
text = self.get_text()
except ParseError:
return None
# Iterate through all regex matches in text and try to parse the date # Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text): for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0) date_string = m.group(0)
@ -227,19 +159,64 @@ class DocumentParser(LoggingMixin):
else: else:
date = None date = None
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
date_string
)
)
else:
self.log("info", "Unable to detect date for document")
return date return date
class ParseError(Exception):
    """Error type raised by the parsing code when a step fails."""
class DocumentParser(LoggingMixin):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, logging_group):
    """
    Set up a parser that is not yet bound to any document.

    Stores the logging group, creates a scratch directory (prefix
    ``paperless-``) inside ``settings.SCRATCH_DIR`` and keeps its location
    in ``self.tempdir``. The result attributes ``archive_path``, ``text``
    and ``date`` all start out as ``None``.
    """
    super().__init__()
    self.logging_group = logging_group

    scratch_root = settings.SCRATCH_DIR
    self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=scratch_root)

    # Nothing has been parsed yet, so there are no results to report.
    self.archive_path = self.text = self.date = None
def parse(self, document_path, mime_type):
    """
    Parse the document at ``document_path`` with the given ``mime_type``.

    Not implemented here; this version always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
def get_archive_path(self):
    """
    Return the value currently stored in ``self.archive_path``.
    """
    stored_path = self.archive_path
    return stored_path
def get_thumbnail(self, document_path, mime_type):
    """
    Return the path to a file that can serve as the thumbnail for the
    document at ``document_path``.

    Not implemented here; this version always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
def get_optimised_thumbnail(self, document_path, mime_type):
    """
    Create a thumbnail for the document and optimise it when enabled.

    The thumbnail comes from ``get_thumbnail(document_path, mime_type)``.
    If ``settings.OPTIMIZE_THUMBNAILS`` is falsy, that path is returned
    as-is. Otherwise ``settings.OPTIPNG_BINARY`` is executed on it and the
    path of the result, ``thumb_optipng.png`` inside ``self.tempdir``, is
    returned.

    Raises ``ParseError`` when the optipng process exits with a non-zero
    status.
    """
    raw_thumbnail = self.get_thumbnail(document_path, mime_type)
    if not settings.OPTIMIZE_THUMBNAILS:
        return raw_thumbnail

    optimised_path = os.path.join(self.tempdir, "thumb_optipng.png")
    args = (
        settings.OPTIPNG_BINARY,
        "-silent",
        "-o5",
        raw_thumbnail,
        "-out",
        optimised_path,
    )

    self.log('debug', f"Execute: {' '.join(args)}")

    exit_code = subprocess.Popen(args).wait()
    if exit_code != 0:
        raise ParseError("Optipng failed at {}".format(args))

    return optimised_path
def get_text(self):
    """
    Return the value currently stored in ``self.text``.
    """
    stored_text = self.text
    return stored_text
def get_date(self):
    """
    Return the value currently stored in ``self.date``.
    """
    stored_date = self.date
    return stored_date
def cleanup(self): def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir)) self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir) shutil.rmtree(self.tempdir)

View File

@ -2,7 +2,6 @@ import os
import re import re
import subprocess import subprocess
import langdetect
import ocrmypdf import ocrmypdf
import pdftotext import pdftotext
from django.conf import settings from django.conf import settings
@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
""" """
def __init__(self, path, logging_group): def get_thumbnail(self, document_path, mime_type):
super().__init__(path, logging_group)
self._text = None
self._archive_path = None
def get_thumbnail(self):
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
""" """
@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove", alpha="remove",
strip=True, strip=True,
trim=True, trim=True,
input_file="{}[0]".format(self.document_path), input_file="{}[0]".format(document_path),
output_file=out_path, output_file=out_path,
logging_group=self.logging_group) logging_group=self.logging_group)
except ParseError: except ParseError:
@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser):
"-q", "-q",
"-sDEVICE=pngalpha", "-sDEVICE=pngalpha",
"-o", gs_out_path, "-o", gs_out_path,
self.document_path] document_path]
if not subprocess.Popen(cmd).wait() == 0: if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs # then run convert on the output from gs
@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser):
if self._text: if self._text:
return self._text return self._text
def parse(self, document_path, mime_type):
archive_path = os.path.join(self.tempdir, "archive.pdf") archive_path = os.path.join(self.tempdir, "archive.pdf")
ocr_args = { ocr_args = {
'input_file': self.document_path, 'input_file': document_path,
'output_file': archive_path, 'output_file': archive_path,
'use_threads': True, 'use_threads': True,
'jobs': settings.THREADS_PER_WORKER, 'jobs': settings.THREADS_PER_WORKER,
@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser):
try: try:
ocrmypdf.ocr(**ocr_args) ocrmypdf.ocr(**ocr_args)
# success! announce that we have an archive document # success! announce results
self._archive_path = archive_path self.archive_path = archive_path
self._text = get_text_from_pdf(self._archive_path) self.text = get_text_from_pdf(archive_path)
except InputFileError as e: except InputFileError as e:
# This happens with some PDFs when used with the redo_ocr option. # This happens with some PDFs when used with the redo_ocr option.
# This is not the end of the world, we'll just use what we already # This is not the end of the world, we'll just use what we already
# have in the document. # have in the document.
self._text = get_text_from_pdf(self.document_path) self.text = get_text_from_pdf(document_path)
# Also, no archived file. # Also, no archived file.
if not self._text: if not self.text:
# However, if we don't have anything, fail: # However, if we don't have anything, fail:
raise ParseError(e) raise ParseError(e)
@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser):
# Anything else is probably serious. # Anything else is probably serious.
raise ParseError(e) raise ParseError(e)
if not self._text: if not self.text:
# This may happen for files that don't have any text. # This may happen for files that don't have any text.
self.log( self.log(
'warning', 'warning',
f"Document {self.document_path} does not have any text." f"Document {document_path} does not have any text."
f"This is probably an error or you tried to add an image " f"This is probably an error or you tried to add an image "
f"without text.") f"without text.")
return "" self.text = ""
return self._text
def get_archive_path(self):
return self._archive_path
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
return guess
except Exception as e:
self.log('warning', f"Language detection failed with: {e}")
return None
def strip_excess_whitespace(text): def strip_excess_whitespace(text):

View File

@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
This parser directly parses a text document (.txt, .md, or .csv) This parser directly parses a text document (.txt, .md, or .csv)
""" """
def __init__(self, path, logging_group): def get_thumbnail(self, document_path, mime_type):
super().__init__(path, logging_group)
self._text = None
def get_thumbnail(self):
""" """
The thumbnail of a text file is just a 500px wide image of the text The thumbnail of a text file is just a 500px wide image of the text
rendered onto a letter-sized page. rendered onto a letter-sized page.
@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
) )
def read_text(): def read_text():
with open(self.document_path, 'r') as src: with open(document_path, 'r') as src:
lines = [line.strip() for line in src.readlines()] lines = [line.strip() for line in src.readlines()]
text = "\n".join([line for line in lines[:n_lines]]) text = "\n".join([line for line in lines[:n_lines]])
return text.replace('"', "'") return text.replace('"', "'")
@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
return out_path return out_path
def get_text(self): def parse(self, document_path, mime_type):
with open(document_path, 'r') as f:
if self._text is not None: self.text = f.read()
return self._text
with open(self.document_path, 'r') as f:
self._text = f.read()
return self._text
def run_command(*args): def run_command(*args):