Adds additional testing for both date parsing and consumed document created date

This commit is contained in:
Trenton Holmes
2022-04-12 19:52:56 -07:00
parent ce32089cc4
commit 8a6aaf4e2d
9 changed files with 345 additions and 42 deletions

View File

@@ -3,6 +3,8 @@ import hashlib
import os
import uuid
from subprocess import Popen
from typing import Optional
from typing import Type
import magic
from asgiref.sync import async_to_sync
@@ -23,6 +25,7 @@ from .models import Document
from .models import DocumentType
from .models import FileInfo
from .models import Tag
from .parsers import DocumentParser
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date
from .parsers import ParseError
@@ -186,7 +189,7 @@ class Consumer(LoggingMixin):
override_document_type_id=None,
override_tag_ids=None,
task_id=None,
):
) -> Document:
"""
Return the document object if it was successfully created.
"""
@@ -220,7 +223,10 @@ class Consumer(LoggingMixin):
self.log("debug", f"Detected mime type: {mime_type}")
parser_class = get_parser_class_for_mime_type(mime_type)
# Based on the mime type, get the parser for that type
parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
mime_type,
)
if not parser_class:
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
@@ -241,7 +247,10 @@ class Consumer(LoggingMixin):
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group, progress_callback)
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
)
self.log("debug", f"Parser: {type(document_parser).__name__}")
@@ -270,7 +279,7 @@ class Consumer(LoggingMixin):
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
if date is None:
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
@@ -342,7 +351,7 @@ class Consumer(LoggingMixin):
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to aquire the lock as well.
# renaming logic to acquire the lock as well.
document.save()
# Delete the file only if it was successfully consumed
@@ -362,7 +371,8 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
str(e),
f"The following error occured while consuming " f"{self.filename}: {e}",
f"The following error occurred while consuming "
f"{self.filename}: {e}",
exc_info=True,
)
finally:
@@ -376,21 +386,26 @@ class Consumer(LoggingMixin):
return document
def _store(self, text, date, mime_type):
def _store(self, text, date, mime_type) -> Document:
# If someone gave us the original filename, use it instead of doc.
file_info = FileInfo.from_filename(self.filename)
stats = os.stat(self.path)
self.log("debug", "Saving record to database")
created = (
file_info.created
or date
or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime))
)
# Pick the creation date by priority: the value parsed from the filename
# (file_info), then the date passed in by the caller, and finally the
# file's modification time on disk.
if file_info.created is not None:
    create_date = file_info.created
    self.log("debug", f"Creation date from FileInfo: {create_date}")
elif date is not None:
    create_date = date
    self.log("debug", f"Creation date from parse_date: {create_date}")
else:
    # Only stat the file when the fallback is actually needed.
    stats = os.stat(self.path)
    create_date = timezone.make_aware(
        datetime.datetime.fromtimestamp(stats.st_mtime),
    )
    # The f-prefix is required here; without it the literal text
    # "{create_date}" is logged instead of the computed date.
    self.log("debug", f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@@ -400,8 +415,8 @@ class Consumer(LoggingMixin):
content=text,
mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
created=create_date,
modified=create_date,
storage_type=storage_type,
)