mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
160 lines
5.5 KiB
Python
160 lines
5.5 KiB
Python
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
|
|
import dateparser
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
|
|
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
#
# Every alternative must start at a word boundary or directly after "_" / "-"
# and must end at a word boundary or directly before "_" / "-". The explicit
# "_" handling is needed because "_" counts as a word character, so \b alone
# would not match between "_" and a digit (e.g. "scan_12.03.2020_invoice").
# The leading check is a lookbehind "(?<=...)"; the capturing groups inside
# the look-around assertions are kept so that group numbering is stable.
DATE_REGEX = re.compile(
    r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +
    r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +
    r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +
    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
|
|
|
|
|
|
class ParseError(Exception):
    """
    Raised by document parsers when processing a document fails.
    """
|
|
|
|
|
|
class DocumentParser:
    """
    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.

    Subclasses have to implement `get_thumbnail()` and `get_text()`; the
    remaining methods build on those two.  Every instance creates its own
    temporary working directory, which is removed again by `cleanup()`.
    """

    # Directory in which the per-instance temporary directory is created
    SCRATCH = settings.SCRATCH_DIR
    # Date order handed to dateparser for dates found in the document text
    DATE_ORDER = settings.DATE_ORDER
    # Date order handed to dateparser for dates found in the file name; a
    # falsy value disables file name date detection altogether
    FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
    # Executable invoked by optimise_thumbnail()
    OPTIPNG = settings.OPTIPNG_BINARY

    def __init__(self, path):
        """
        Remembers the document at `path` and creates a fresh temporary
        directory (prefix "paperless-") inside `SCRATCH`.
        """
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=self.SCRATCH)
        self.logger = logging.getLogger(__name__)
        # Passed along as `extra["group"]` with every message sent by log()
        self.logging_group = None

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def optimise_thumbnail(self, in_path):
        """
        Runs optipng on `in_path` and returns the path of the optimised copy,
        which is written to "optipng.png" inside this parser's temporary
        directory.

        Raises ParseError if optipng exits with a non-zero status.
        """
        out_path = os.path.join(self.tempdir, "optipng.png")
        args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)

        # subprocess.call() waits for the process and returns its exit status
        if subprocess.call(args) != 0:
            raise ParseError("Optipng failed at {}".format(args))

        return out_path

    def get_optimised_thumbnail(self):
        """
        Returns the path to an optimised version of this document's thumbnail.
        """
        return self.optimise_thumbnail(self.get_thumbnail())

    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()

    def _find_date(self, text, date_order, max_year):
        """
        Searches `text` for the first DATE_REGEX match that dateparser turns
        into a date whose year lies strictly between 1900 and `max_year`.

        Returns a `(date, date_string)` tuple, or `(None, None)` if `text` is
        empty or contains no acceptable date.
        """
        if not text:
            return None, None

        for match in DATE_REGEX.finditer(text):
            date_string = match.group(0)

            try:
                date = dateparser.parse(
                    date_string,
                    settings={
                        "DATE_ORDER": date_order,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True,
                    },
                )
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and 1900 < date.year < max_year:
                return date, date_string

        return None, None

    def get_date(self):
        """
        Returns the date of the document, or None if no date was detected.

        If `FILENAME_DATE_ORDER` is set, the base name of the document path is
        searched first and a date found there is returned right away.
        Otherwise the text from `get_text()` is searched; if `get_text()`
        raises ParseError, None is returned.
        """
        # Arbitrary 5 year future limit (exclusive upper bound for the year)
        max_year = timezone.now().year + 5

        # if filename date parsing is enabled, search there first:
        if self.FILENAME_DATE_ORDER:
            self.log("info", "Checking document title for date")
            title = os.path.basename(self.document_path)
            date, date_string = self._find_date(
                title, self.FILENAME_DATE_ORDER, max_year
            )
            if date is not None:
                self.log(
                    "info",
                    "Detected document date {} based on string {} "
                    "from document title".format(
                        date.isoformat(), date_string
                    )
                )
                return date

        try:
            # getting text after checking filename will save time if only
            # looking at the filename instead of the whole text
            text = self.get_text()
        except ParseError:
            return None

        # The result is taken from the text search alone, so a date rejected
        # during the file name search can never be returned from here.
        date, date_string = self._find_date(text, self.DATE_ORDER, max_year)

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    date_string
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date

    def log(self, level, message):
        """
        Logs `message` through the logger method named by `level` (for example
        "info" or "debug"), attaching the current logging group as extra data.
        """
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })

    def cleanup(self):
        """
        Deletes this parser's temporary directory and everything inside it.
        """
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
|