Merge branch 'dev' into feature-websockets-status

This commit is contained in:
jonaswinkler
2020-12-06 22:53:54 +01:00
179 changed files with 5678 additions and 2460 deletions

View File

@@ -1,4 +1,5 @@
import logging
import mimetypes
import os
import re
import shutil
@@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type):
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type):
    """
    Returns the file extension to use for the given mime type.

    Each parser declaration is checked first: when the mime type appears in
    a declaration's "mime_types" mapping, the value stored there is returned.
    Otherwise the extension guessed by the mimetypes module is used, and an
    empty string is returned when there is no guess.
    """
    for reply in document_consumer_declaration.send(None):
        # The declaration is the second item of each reply.
        declared = reply[1]["mime_types"]
        if mime_type in declared:
            return declared[mime_type]

    guessed = mimetypes.guess_extension(mime_type)
    return guessed if guessed else ""
def is_file_ext_supported(ext):
    """
    Returns True if the lower-cased extension is one of the supported file
    extensions. An empty or missing extension is never supported.
    """
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()
def get_supported_file_extensions():
    """
    Returns the set of all file extensions that the mimetypes module
    associates with any mime type listed in a parser declaration.
    """
    found = set()
    for reply in document_consumer_declaration.send(None):
        # The declaration is the second item of each reply.
        for declared_type in reply[1]["mime_types"]:
            found |= set(mimetypes.guess_all_extensions(declared_type))
    return found
def get_parser_class_for_mime_type(mime_type):
options = []
@@ -107,21 +142,59 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def run_unpaper(pnm, logging_group=None):
    """
    Runs the unpaper binary on the given file and returns the output path.

    The output path is the input path with ".pnm" replaced by
    ".unpaper.pnm". The command line is logged at debug level under the
    given logging group.

    Raises ParseError if unpaper exits with a non-zero status.
    """
    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)

    logger.debug(f"Execute: {' '.join(command_args)}",
                 extra={'group': logging_group})

    # The tool's own output is discarded; only the exit status matters.
    exit_code = subprocess.Popen(command_args,
                                 stdout=subprocess.DEVNULL,
                                 stderr=subprocess.DEVNULL).wait()
    if exit_code != 0:
        raise ParseError(f"Unpaper failed at {command_args}")

    return pnm_out


def parse_date(filename, text):
    """
    Returns the date of the document.

    The filename is searched first, but only when
    settings.FILENAME_DATE_ORDER is set; after that the text is searched
    using settings.DATE_ORDER. The first DATE_REGEX match that parses to a
    date with a year after 1900 and before the current year plus five is
    returned. Returns None when no match qualifies.
    """

    def _parse_with_order(ds, date_order):
        """
        Call dateparser.parse with a particular date ordering
        """
        return dateparser.parse(
            ds,
            settings={
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE":
                True
            }
        )

    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit

    def _first_valid_date(haystack, date_order):
        """
        Return the first plausible date found in haystack, or None.
        """
        for m in re.finditer(DATE_REGEX, haystack):
            try:
                date = _parse_with_order(m.group(0), date_order)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue
            if date is not None and next_year > date.year > 1900:
                return date
        return None

    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
        date = _first_valid_date(filename, settings.FILENAME_DATE_ORDER)
        if date is not None:
            return date

    # Fall back to the regex matches in the text
    return _first_valid_date(text, settings.DATE_ORDER)
class ParseError(Exception):
@@ -134,27 +207,36 @@ class DocumentParser(LoggingMixin):
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, logging_group, progress_callback):
    """
    Sets up the parser state and creates a private scratch directory.

    logging_group and progress_callback are stored on the instance as
    given. A temporary directory with the prefix "paperless-" is created
    inside settings.SCRATCH_DIR and kept in self.tempdir. The archive path,
    text and date all start out as None.
    """
    super().__init__()
    self.logging_group = logging_group
    self.tempdir = tempfile.mkdtemp(
        prefix="paperless-", dir=settings.SCRATCH_DIR)

    # Values handed out by get_archive_path(), get_text() and get_date().
    # NOTE(review): presumably filled in by parse() implementations - confirm.
    self.archive_path = None
    self.text = None
    self.date = None
    self.progress_callback = progress_callback
def get_thumbnail(self):
def parse(self, document_path, mime_type):
    """
    Not implemented in this class; always raises NotImplementedError.
    """
    raise NotImplementedError()
def get_archive_path(self):
    """
    Returns the archive path currently stored on this parser.
    """
    return self.archive_path
def get_thumbnail(self, document_path, mime_type):
    """
    Returns the path to a file that can serve as the thumbnail for this
    document. Not implemented in this class; always raises
    NotImplementedError.
    """
    raise NotImplementedError()
def optimise_thumbnail(self, in_path):
def get_optimised_thumbnail(self, document_path, mime_type):
thumbnail = self.get_thumbnail(document_path, mime_type)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
args = (settings.OPTIPNG_BINARY,
"-silent", "-o5", in_path, "-out", out_path)
"-silent", "-o5", thumbnail, "-out", out_path)
self.log('debug', f"Execute: {' '.join(args)}")
@@ -163,97 +245,13 @@ class DocumentParser(LoggingMixin):
return out_path
else:
return in_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
return thumbnail
def get_text(self):
    """
    Returns the text from the document and only the text.

    This is the value stored in self.text.
    """
    # An unconditional raise used to sit in front of this return and made
    # it unreachable.
    return self.text
def get_date(self):
    """
    Returns the date of the document.

    This is the value stored in self.date. Searching a filename and text
    for a date is done by the module-level parse_date function, not here.
    """
    return self.date
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))