A handy script to redo OCR on all documents

Jonas Winkler 2020-11-03 14:04:11 +01:00
parent 6f3d25d7b1
commit f4cebda085
3 changed files with 95 additions and 41 deletions

View File

@@ -12,9 +12,8 @@ from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier
 from .models import Document, FileInfo
-from .parsers import ParseError
+from .parsers import ParseError, get_parser_class
 from .signals import (
-    document_consumer_declaration,
     document_consumption_finished,
     document_consumption_started
 )
@@ -61,15 +60,6 @@ class Consumer:
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.consume))
 
-        self.parsers = []
-        for response in document_consumer_declaration.send(self):
-            self.parsers.append(response[1])
-
-        if not self.parsers:
-            raise ConsumerError(
-                "No parsers could be found, not even the default. "
-                "This is a problem."
-            )
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={
@@ -82,6 +72,8 @@ class Consumer:
         Return True if file was consumed
         """
 
+        self.logging_group = uuid.uuid4()
+
         if not re.match(FileInfo.REGEXES["title"], file):
             return False
@@ -96,13 +88,13 @@ class Consumer:
         self.log("info", "Consuming {}".format(doc))
 
-        parser_class = self._get_parser_class(doc)
+        parser_class = get_parser_class(doc)
         if not parser_class:
             self.log(
                 "error", "No parsers could be found for {}".format(doc))
             return False
-        else:
-            self.logging_group = uuid.uuid4()
+
+        self.log("info", "Parser: {}".format(parser_class.__name__))
 
         document_consumption_started.send(
@@ -114,6 +106,7 @@ class Consumer:
         document_parser = parser_class(doc, self.logging_group)
 
         try:
+            self.log("info", "Generating thumbnail for {}...".format(doc))
             thumbnail = document_parser.get_optimised_thumbnail()
             date = document_parser.get_date()
             document = self._store(
@@ -154,31 +147,6 @@ class Consumer:
         )
 
         return True
 
-    def _get_parser_class(self, doc):
-        """
-        Determine the appropriate parser class based on the file
-        """
-
-        options = []
-        for parser in self.parsers:
-            result = parser(doc)
-            if result:
-                options.append(result)
-
-        self.log(
-            "info",
-            "Parsers available: {}".format(
-                ", ".join([str(o["parser"].__name__) for o in options])
-            )
-        )
-
-        if not options:
-            return None
-
-        # Return the parser with the highest weight.
-        return sorted(
-            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
-
     def _store(self, text, doc, thumbnail, date):
         file_info = FileInfo.from_path(doc)
@@ -211,10 +179,9 @@ class Consumer:
         self._write(document, doc, document.source_path)
         self._write(document, thumbnail, document.thumbnail_path)
 
+        #TODO: why do we need to save the document again?
         document.save()
-        self.log("debug", "Completed")
 
         return document
 
     def _write(self, document, source, target):

View File

@@ -0,0 +1,60 @@
+import argparse
+import threading
+from multiprocessing import Pool
+from multiprocessing.pool import ThreadPool
+
+from django.core.management.base import BaseCommand
+
+from documents.consumer import Consumer
+from documents.models import Log, Document
+from documents.parsers import get_parser_class
+
+
+def process_document(doc):
+    parser_class = get_parser_class(doc.file_name)
+
+    if not parser_class:
+        print("no parser available")
+    else:
+        print("Parser: {}".format(parser_class.__name__))
+
+        parser = parser_class(doc.source_path, None)
+
+        try:
+            text = parser.get_text()
+            doc.content = text
+            doc.save()
+        finally:
+            parser.cleanup()
+
+
+def document_index(value):
+    ivalue = int(value)
+    if not (1 <= ivalue <= Document.objects.count()):
+        raise argparse.ArgumentTypeError(
+            "{} is not a valid document index (out of range)".format(value))
+    return ivalue
+
+
+class Command(BaseCommand):
+
+    help = "Performs OCR on all documents again!"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-s", "--start_index",
+            default=None,
+            type=document_index
+        )
+
+    def handle(self, *args, **options):
+        docs = Document.objects.all().order_by("added")
+
+        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
+
+        for i in indices:
+            doc = docs[i]
+            print("==================================")
+            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
+            print("==================================")
+            process_document(doc)
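
For context, the new file above is a Django management command, so once deployed it can be run through manage.py or invoked programmatically. The sketch below is illustrative only: the command name "redo_ocr" is a placeholder, because the real name is taken from the new file's name under documents/management/commands/, which this view does not show.

# Illustrative sketch, not part of the commit. Assumes the paperless settings
# module is importable and that the new command file is named redo_ocr.py
# (placeholder -- substitute the actual file name).
import os

import django
from django.core.management import call_command

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
django.setup()

# Redo OCR on every document, resuming at the 100th one (1-based, as enforced
# by the document_index validator above).
call_command("redo_ocr", start_index=100)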

View File

@@ -20,6 +20,8 @@ from django.utils import timezone
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+from documents.signals import document_consumer_declaration
+
 DATE_REGEX = re.compile(
     r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
     r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
@@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)
 
+def get_parser_class(doc):
+    """
+    Determine the appropriate parser class based on the file
+    """
+
+    parsers = []
+    for response in document_consumer_declaration.send(None):
+        parsers.append(response[1])
+
+    #TODO: add a check that checks parser availability.
+
+    options = []
+    for parser in parsers:
+        result = parser(doc)
+        if result:
+            options.append(result)
+
+    if not options:
+        return None
+
+    # Return the parser with the highest weight.
+    return sorted(
+        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
+
+
 def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
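
As a side note, the weight-based selection performed by get_parser_class is easy to exercise outside Django: every consumer-declaration handler returns either None or a dict with "parser" and "weight" keys, and the option with the highest weight wins. A minimal self-contained sketch follows; the handler and parser names are illustrative stand-ins, not names taken from this commit.

# Stand-alone sketch of the selection logic used by get_parser_class.
class ImageParser:       # illustrative stand-in
    pass


class PlainTextParser:   # illustrative stand-in
    pass


def image_declaration(path):
    # Claims common scan formats with a low weight.
    if path.lower().endswith((".pdf", ".png", ".jpg")):
        return {"parser": ImageParser, "weight": 0}
    return None


def text_declaration(path):
    # Claims plain-text files and outbids the image parser for them.
    if path.lower().endswith((".txt", ".md")):
        return {"parser": PlainTextParser, "weight": 10}
    return None


def pick_parser(path, declarations):
    options = [o for o in (d(path) for d in declarations) if o]
    if not options:
        return None
    # Highest weight wins, exactly as in get_parser_class above.
    return sorted(options, key=lambda o: o["weight"], reverse=True)[0]["parser"]


print(pick_parser("scan.pdf", [image_declaration, text_declaration]))   # ImageParser
print(pick_parser("notes.txt", [image_declaration, text_declaration]))  # PlainTextParser
print(pick_parser("video.mp4", [image_declaration, text_declaration]))  # None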