mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-01 11:19:32 -05:00
Add support for a heuristic that extracts the document date from its text
This commit is contained in:
parent
c16c9a1325
commit
21fc51c09a
@ -13,6 +13,7 @@ python-dateutil>=2.6.0
|
|||||||
python-dotenv>=0.6.2
|
python-dotenv>=0.6.2
|
||||||
python-gnupg>=0.3.9
|
python-gnupg>=0.3.9
|
||||||
pytz>=2016.10
|
pytz>=2016.10
|
||||||
|
dateparser>=0.6.0
|
||||||
gunicorn==19.7.1
|
gunicorn==19.7.1
|
||||||
|
|
||||||
# For the tests
|
# For the tests
|
||||||
|
@ -118,12 +118,14 @@ class Consumer(object):
|
|||||||
|
|
||||||
parsed_document = parser_class(doc)
|
parsed_document = parser_class(doc)
|
||||||
thumbnail = parsed_document.get_thumbnail()
|
thumbnail = parsed_document.get_thumbnail()
|
||||||
|
date = parsed_document.get_date()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
document = self._store(
|
document = self._store(
|
||||||
parsed_document.get_text(),
|
parsed_document.get_text(),
|
||||||
doc,
|
doc,
|
||||||
thumbnail
|
thumbnail,
|
||||||
|
date
|
||||||
)
|
)
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
|
|
||||||
@ -174,7 +176,7 @@ class Consumer(object):
|
|||||||
return sorted(
|
return sorted(
|
||||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail, date):
|
||||||
|
|
||||||
file_info = FileInfo.from_path(doc)
|
file_info = FileInfo.from_path(doc)
|
||||||
|
|
||||||
@ -182,7 +184,7 @@ class Consumer(object):
|
|||||||
|
|
||||||
self.log("debug", "Saving record to database")
|
self.log("debug", "Saving record to database")
|
||||||
|
|
||||||
created = file_info.created or timezone.make_aware(
|
created = file_info.created or date or timezone.make_aware(
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||||
|
|
||||||
with open(doc, "rb") as f:
|
with open(doc, "rb") as f:
|
||||||
|
@ -35,6 +35,12 @@ class DocumentParser(object):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_date(self):
|
||||||
|
"""
|
||||||
|
Returns the date of the document.
The base implementation always raises NotImplementedError.
The consumer uses the returned value as a fallback for the document's
creation date (``file_info.created or date or <file mtime>``), so a
falsy result such as None makes it fall through to the file's mtime.
NOTE(review): presumably every parser subclass is expected to override
this -- confirm against the other parsers.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def log(self, level, message):
|
def log(self, level, message):
|
||||||
getattr(self.logger, level)(message, extra={
|
getattr(self.logger, level)(message, extra={
|
||||||
"group": self.logging_group
|
"group": self.logging_group
|
||||||
|
@ -258,3 +258,6 @@ PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
|
|||||||
|
|
||||||
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
|
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
|
||||||
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||||
|
|
||||||
|
# Specify the default date order (for autodetected dates)
|
||||||
|
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||||
|
@ -3,6 +3,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
|
import dateparser
|
||||||
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import pyocr
|
import pyocr
|
||||||
@ -30,6 +31,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
|
DATE_ORDER = settings.DATE_ORDER
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
@ -175,6 +177,29 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def get_date(self):
    """
    Guess the document's date from its extracted text.

    The text is scanned for date-like strings in the following formats:

      - XX.YY.ZZZZ, XX/YY/ZZZZ or XX-YY-ZZZZ, where XX and YY are 1 or 2
        digits and ZZZZ is 2 or 4 digits
      - XX. MONTH ZZZZ, where XX is 1 or 2 digits and ZZZZ is 2 or 4 digits
      - MONTH ZZZZ, where ZZZZ is 4 digits

    Candidates are tried in document order and the first one that
    dateparser can interpret is returned.  Ambiguous numeric dates are
    resolved using ``self.DATE_ORDER``; a missing day defaults to the
    first of the month.

    Returns the parsed, timezone-aware datetime, or None if the text is
    empty or no candidate could be parsed.
    """
    text = self.get_text()
    if not text:
        # Nothing to search; also avoids a TypeError from the regex engine
        # should the text ever be None.
        return None

    date_pattern = (
        r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|'
        r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|'
        r'\b([^ ]{3,9} [0-9]{4})\b'
    )

    parser_settings = {
        'DATE_ORDER': self.DATE_ORDER,
        'PREFER_DAY_OF_MONTH': 'first',
        'RETURN_AS_TIMEZONE_AWARE': True,
    }

    # The MONTH alternatives accept any 3-9 character non-space token, so
    # they also hit things like "Invoice 2017".  dateparser returns None
    # for those, so keep looking instead of giving up on the first match.
    for candidate in re.finditer(date_pattern, text):
        date = dateparser.parse(candidate.group(0), settings=parser_settings)
        if date is not None:
            return date

    return None
|
||||||
|
|
||||||
|
|
||||||
def run_convert(*args):
|
def run_convert(*args):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user