Add support for a heuristic that extracts the document date from its text

This commit is contained in:
Wolf-Bastian Pöttner
2018-01-28 19:09:52 +01:00
parent c16c9a1325
commit 21fc51c09a
5 changed files with 40 additions and 3 deletions

View File

@@ -118,12 +118,14 @@ class Consumer(object):
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
try:
document = self._store(
parsed_document.get_text(),
doc,
thumbnail
thumbnail,
date
)
except ParseError as e:
@@ -174,7 +176,7 @@ class Consumer(object):
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def _store(self, text, doc, thumbnail):
def _store(self, text, doc, thumbnail, date):
file_info = FileInfo.from_path(doc)
@@ -182,7 +184,7 @@ class Consumer(object):
self.log("debug", "Saving record to database")
created = file_info.created or timezone.make_aware(
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:

View File

@@ -35,6 +35,12 @@ class DocumentParser(object):
"""
raise NotImplementedError()
def get_date(self):
    """
    Returns the date of the document.

    This base implementation does no work of its own: it always raises
    NotImplementedError.

    NOTE(review): presumably concrete parser subclasses are expected to
    override this method and return the date they extract from the
    document -- confirm against the parser implementations.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group