diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 639152725..3920f2942 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -12,9 +12,8 @@ from django.utils import timezone from paperless.db import GnuPG from .classifier import DocumentClassifier from .models import Document, FileInfo -from .parsers import ParseError +from .parsers import ParseError, get_parser_class from .signals import ( - document_consumer_declaration, document_consumption_finished, document_consumption_started ) @@ -61,15 +60,6 @@ class Consumer: raise ConsumerError( "Consumption directory {} does not exist".format(self.consume)) - self.parsers = [] - for response in document_consumer_declaration.send(self): - self.parsers.append(response[1]) - - if not self.parsers: - raise ConsumerError( - "No parsers could be found, not even the default. " - "This is a problem." - ) def log(self, level, message): getattr(self.logger, level)(message, extra={ @@ -82,6 +72,8 @@ class Consumer: Return True if file was consumed """ + self.logging_group = uuid.uuid4() + if not re.match(FileInfo.REGEXES["title"], file): return False @@ -96,13 +88,13 @@ class Consumer: self.log("info", "Consuming {}".format(doc)) - parser_class = self._get_parser_class(doc) + parser_class = get_parser_class(doc) if not parser_class: self.log( "error", "No parsers could be found for {}".format(doc)) return False - - self.logging_group = uuid.uuid4() + else: + self.log("info", "Parser: {}".format(parser_class.__name__)) document_consumption_started.send( @@ -114,6 +106,7 @@ class Consumer: document_parser = parser_class(doc, self.logging_group) try: + self.log("info", "Generating thumbnail for {}...".format(doc)) thumbnail = document_parser.get_optimised_thumbnail() date = document_parser.get_date() document = self._store( @@ -154,31 +147,6 @@ class Consumer: ) return True - def _get_parser_class(self, doc): - """ - Determine the appropriate parser class based on the file - """ - - options = [] - for parser in self.parsers: - result = parser(doc) - if result: - options.append(result) - - self.log( - "info", - "Parsers available: {}".format( - ", ".join([str(o["parser"].__name__) for o in options]) - ) - ) - - if not options: - return None - - # Return the parser with the highest weight. - return sorted( - options, key=lambda _: _["weight"], reverse=True)[0]["parser"] - def _store(self, text, doc, thumbnail, date): file_info = FileInfo.from_path(doc) @@ -211,10 +179,9 @@ class Consumer: self._write(document, doc, document.source_path) self._write(document, thumbnail, document.thumbnail_path) + #TODO: why do we need to save the document again? document.save() - self.log("debug", "Completed") - return document def _write(self, document, source, target): diff --git a/src/documents/management/commands/document_rerun_ocr.py b/src/documents/management/commands/document_rerun_ocr.py new file mode 100644 index 000000000..794357420 --- /dev/null +++ b/src/documents/management/commands/document_rerun_ocr.py @@ -0,0 +1,60 @@ +import argparse +import threading +from multiprocessing import Pool +from multiprocessing.pool import ThreadPool + +from django.core.management.base import BaseCommand + +from documents.consumer import Consumer +from documents.models import Log, Document +from documents.parsers import get_parser_class + + +def process_document(doc): + parser_class = get_parser_class(doc.file_name) + if not parser_class: + print("no parser available") + else: + print("Parser: {}".format(parser_class.__name__)) + parser = parser_class(doc.source_path, None) + try: + text = parser.get_text() + doc.content = text + doc.save() + finally: + parser.cleanup() + + +def document_index(value): + ivalue = int(value) + if not (1 <= ivalue <= Document.objects.count()): + raise argparse.ArgumentTypeError( + "{} is not a valid document index (out of range)".format(value)) + + return ivalue + + +class Command(BaseCommand): + + help = "Performs OCR on all documents again!" + + + def add_arguments(self, parser): + parser.add_argument( + "-s", "--start_index", + default=None, + type=document_index + ) + + def handle(self, *args, **options): + + docs = Document.objects.all().order_by("added") + + indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs)) + + for i in indices: + doc = docs[i] + print("==================================") + print("{} out of {}: {}".format(i+1, len(docs), doc.file_name)) + print("==================================") + process_document(doc) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 63afa906d..60ad5cd7d 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -20,6 +20,8 @@ from django.utils import timezone # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +from documents.signals import document_consumer_declaration + DATE_REGEX = re.compile( r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 @@ -32,6 +34,31 @@ DATE_REGEX = re.compile( logger = logging.getLogger(__name__) +def get_parser_class(doc): + """ + Determine the appropriate parser class based on the file + """ + + parsers = [] + for response in document_consumer_declaration.send(None): + parsers.append(response[1]) + + #TODO: add a check that checks parser availability. + + options = [] + for parser in parsers: + result = parser(doc) + if result: + options.append(result) + + if not options: + return None + + # Return the parser with the highest weight. + return sorted( + options, key=lambda _: _["weight"], reverse=True)[0]["parser"] + + def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: