A handy script to redo ocr on all documents,

This commit is contained in:
Jonas Winkler
2020-11-03 14:04:11 +01:00
parent 6f3d25d7b1
commit f4cebda085
3 changed files with 95 additions and 41 deletions

View File

@@ -20,6 +20,8 @@ from django.utils import timezone
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.signals import document_consumer_declaration
DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
@@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
logger = logging.getLogger(__name__)
def get_parser_class(doc):
"""
Determine the appropriate parser class based on the file
"""
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
#TODO: add a check that checks parser availability.
options = []
for parser in parsers:
result = parser(doc)
if result:
options.append(result)
if not options:
return None
# Return the parser with the highest weight.
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT: