diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b6a0a5912..fa61e9376 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError from .file_handling import generate_filename, create_source_path_directory from .loggers import LoggingMixin from .models import Document, FileInfo, Correspondent, DocumentType, Tag -from .parsers import ParseError, get_parser_class_for_mime_type +from .parsers import ParseError, get_parser_class_for_mime_type, parse_date from .signals import ( document_consumption_finished, document_consumption_started @@ -121,7 +121,7 @@ class Consumer(LoggingMixin): # This doesn't parse the document yet, but gives us a parser. - document_parser = parser_class(self.path, self.logging_group) + document_parser = parser_class(self.logging_group) # However, this already created working directories which we have to # clean up. @@ -129,12 +129,18 @@ class Consumer(LoggingMixin): # Parse the document. This may take some time. try: - self.log("debug", f"Generating thumbnail for {self.filename}...") - thumbnail = document_parser.get_optimised_thumbnail() self.log("debug", "Parsing {}...".format(self.filename)) + document_parser.parse(self.path, mime_type) + + self.log("debug", f"Generating thumbnail for {self.filename}...") + thumbnail = document_parser.get_optimised_thumbnail(self.path, mime_type) + text = document_parser.get_text() date = document_parser.get_date() + if not date: + date = parse_date(self.filename, text) archive_path = document_parser.get_archive_path() + except ParseError as e: document_parser.cleanup() raise ConsumerError(e) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 542a5dae9..4ae1d1a92 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -107,6 +107,61 @@ def run_convert(input_file, raise ParseError("Convert failed at {}".format(args)) +def parse_date(filename, text): + """ + Returns the date of the document. + """ + + def __parser(ds, date_order): + """ + Call dateparser.parse with a particular date ordering + """ + return dateparser.parse( + ds, + settings={ + "DATE_ORDER": date_order, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": + True + } + ) + + date = None + + next_year = timezone.now().year + 5 # Arbitrary 5 year future limit + + # if filename date parsing is enabled, search there first: + if settings.FILENAME_DATE_ORDER: + for m in re.finditer(DATE_REGEX, filename): + date_string = m.group(0) + + try: + date = __parser(date_string, settings.FILENAME_DATE_ORDER) + except (TypeError, ValueError): + # Skip all matches that do not parse to a proper date + continue + + if date is not None and next_year > date.year > 1900: + return date + + # Iterate through all regex matches in text and try to parse the date + for m in re.finditer(DATE_REGEX, text): + date_string = m.group(0) + + try: + date = __parser(date_string, settings.DATE_ORDER) + except (TypeError, ValueError): + # Skip all matches that do not parse to a proper date + continue + + if date is not None and next_year > date.year > 1900: + break + else: + date = None + + return date + + class ParseError(Exception): pass @@ -117,29 +172,35 @@ class DocumentParser(LoggingMixin): `paperless_tesseract.parsers` for inspiration. """ - def __init__(self, path, logging_group): + def __init__(self, logging_group): super().__init__() self.logging_group = logging_group - self.document_path = path self.tempdir = tempfile.mkdtemp( prefix="paperless-", dir=settings.SCRATCH_DIR) - def get_archive_path(self): - return None + self.archive_path = None + self.text = None + self.date = None - def get_thumbnail(self): + def parse(self, document_path, mime_type): + raise NotImplementedError() + + def get_archive_path(self): + return self.archive_path + + def get_thumbnail(self, document_path, mime_type): """ Returns the path to a file we can use as a thumbnail for this document. """ raise NotImplementedError() - def optimise_thumbnail(self, in_path): - + def get_optimised_thumbnail(self, document_path, mime_type): + thumbnail = self.get_thumbnail(document_path, mime_type) if settings.OPTIMIZE_THUMBNAILS: - out_path = os.path.join(self.tempdir, "optipng.png") + out_path = os.path.join(self.tempdir, "thumb_optipng.png") args = (settings.OPTIPNG_BINARY, - "-silent", "-o5", in_path, "-out", out_path) + "-silent", "-o5", thumbnail, "-out", out_path) self.log('debug', f"Execute: {' '.join(args)}") @@ -148,97 +209,13 @@ class DocumentParser(LoggingMixin): return out_path else: - return in_path - - def get_optimised_thumbnail(self): - return self.optimise_thumbnail(self.get_thumbnail()) + return thumbnail def get_text(self): - """ - Returns the text from the document and only the text. - """ - raise NotImplementedError() + return self.text def get_date(self): - """ - Returns the date of the document. - """ - - def __parser(ds, date_order): - """ - Call dateparser.parse with a particular date ordering - """ - return dateparser.parse( - ds, - settings={ - "DATE_ORDER": date_order, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": - True - } - ) - - date = None - date_string = None - - next_year = timezone.now().year + 5 # Arbitrary 5 year future limit - title = os.path.basename(self.document_path) - - # if filename date parsing is enabled, search there first: - if settings.FILENAME_DATE_ORDER: - self.log("info", "Checking document title for date") - for m in re.finditer(DATE_REGEX, title): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.FILENAME_DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - if date is not None and next_year > date.year > 1900: - self.log( - "info", - "Detected document date {} based on string {} " - "from document title" - "".format(date.isoformat(), date_string) - ) - return date - - try: - # getting text after checking filename will save time if only - # looking at the filename instead of the whole text - text = self.get_text() - except ParseError: - return None - - # Iterate through all regex matches in text and try to parse the date - for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - if date is not None and next_year > date.year > 1900: - break - else: - date = None - - if date is not None: - self.log( - "info", - "Detected document date {} based on string {}".format( - date.isoformat(), - date_string - ) - ) - else: - self.log("info", "Unable to detect date for document") - - return date + return self.date def cleanup(self): self.log("debug", "Deleting directory {}".format(self.tempdir)) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 8f694ef56..b72f95e2d 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -2,7 +2,6 @@ import os import re import subprocess -import langdetect import ocrmypdf import pdftotext from django.conf import settings @@ -17,12 +16,7 @@ class RasterisedDocumentParser(DocumentParser): image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) """ - def __init__(self, path, logging_group): - super().__init__(path, logging_group) - self._text = None - self._archive_path = None - - def get_thumbnail(self): + def get_thumbnail(self, document_path, mime_type): """ The thumbnail of a PDF is just a 500px wide image of the first page. """ @@ -36,7 +30,7 @@ class RasterisedDocumentParser(DocumentParser): alpha="remove", strip=True, trim=True, - input_file="{}[0]".format(self.document_path), + input_file="{}[0]".format(document_path), output_file=out_path, logging_group=self.logging_group) except ParseError: @@ -51,7 +45,7 @@ class RasterisedDocumentParser(DocumentParser): "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, - self.document_path] + document_path] if not subprocess.Popen(cmd).wait() == 0: raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) # then run convert on the output from gs @@ -71,10 +65,11 @@ class RasterisedDocumentParser(DocumentParser): if self._text: return self._text + def parse(self, document_path, mime_type): archive_path = os.path.join(self.tempdir, "archive.pdf") ocr_args = { - 'input_file': self.document_path, + 'input_file': document_path, 'output_file': archive_path, 'use_threads': True, 'jobs': settings.THREADS_PER_WORKER, @@ -96,17 +91,17 @@ class RasterisedDocumentParser(DocumentParser): try: ocrmypdf.ocr(**ocr_args) - # success! announce that we have an archive document - self._archive_path = archive_path - self._text = get_text_from_pdf(self._archive_path) + # success! announce results + self.archive_path = archive_path + self.text = get_text_from_pdf(archive_path) except InputFileError as e: # This happens with some PDFs when used with the redo_ocr option. # This is not the end of the world, we'll just use what we already # have in the document. - self._text = get_text_from_pdf(self.document_path) + self.text = get_text_from_pdf(document_path) # Also, no archived file. - if not self._text: + if not self.text: # However, if we don't have anything, fail: raise ParseError(e) @@ -114,27 +109,14 @@ class RasterisedDocumentParser(DocumentParser): # Anything else is probably serious. raise ParseError(e) - if not self._text: + if not self.text: # This may happen for files that don't have any text. self.log( 'warning', - f"Document {self.document_path} does not have any text." + f"Document {document_path} does not have any text." f"This is probably an error or you tried to add an image " f"without text.") - return "" - - return self._text - - def get_archive_path(self): - return self._archive_path - - def _guess_language(self, text): - try: - guess = langdetect.detect(text) - return guess - except Exception as e: - self.log('warning', f"Language detection failed with: {e}") - return None + self.text = "" def strip_excess_whitespace(text): diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 015016fb3..f8f369ab0 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): This parser directly parses a text document (.txt, .md, or .csv) """ - def __init__(self, path, logging_group): - super().__init__(path, logging_group) - self._text = None - - def get_thumbnail(self): + def get_thumbnail(self, document_path, mime_type): """ The thumbnail of a text file is just a 500px wide image of the text rendered onto a letter-sized page. @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): ) def read_text(): - with open(self.document_path, 'r') as src: + with open(document_path, 'r') as src: lines = [line.strip() for line in src.readlines()] text = "\n".join([line for line in lines[:n_lines]]) return text.replace('"', "'") @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): return out_path - def get_text(self): - - if self._text is not None: - return self._text - - with open(self.document_path, 'r') as f: - self._text = f.read() - - return self._text + def parse(self, document_path, mime_type): + with open(document_path, 'r') as f: + self.text = f.read() def run_command(*args):