Merge pull request #721 from paperless-ngx/bug-fix-date-ignore

Fix Ignore Date Parsing
This commit is contained in:
shamoon
2022-05-10 16:45:58 -07:00
committed by GitHub
10 changed files with 373 additions and 49 deletions

View File

@@ -3,6 +3,8 @@ import hashlib
import os
import uuid
from subprocess import Popen
from typing import Optional
from typing import Type
import magic
from asgiref.sync import async_to_sync
@@ -23,6 +25,7 @@ from .models import Document
from .models import DocumentType
from .models import FileInfo
from .models import Tag
from .parsers import DocumentParser
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date
from .parsers import ParseError
@@ -186,7 +189,7 @@ class Consumer(LoggingMixin):
override_document_type_id=None,
override_tag_ids=None,
task_id=None,
):
) -> Document:
"""
Return the document object if it was successfully created.
"""
@@ -220,7 +223,10 @@ class Consumer(LoggingMixin):
self.log("debug", f"Detected mime type: {mime_type}")
parser_class = get_parser_class_for_mime_type(mime_type)
# Based on the mime type, get the parser for that type
parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
mime_type,
)
if not parser_class:
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
@@ -241,7 +247,10 @@ class Consumer(LoggingMixin):
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group, progress_callback)
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
)
self.log("debug", f"Parser: {type(document_parser).__name__}")
@@ -270,7 +279,7 @@ class Consumer(LoggingMixin):
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
if date is None:
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
@@ -342,7 +351,7 @@ class Consumer(LoggingMixin):
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to aquire the lock as well.
# renaming logic to acquire the lock as well.
document.save()
# Delete the file only if it was successfully consumed
@@ -362,7 +371,8 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
str(e),
f"The following error occured while consuming " f"{self.filename}: {e}",
f"The following error occurred while consuming "
f"{self.filename}: {e}",
exc_info=True,
)
finally:
@@ -376,21 +386,26 @@ class Consumer(LoggingMixin):
return document
def _store(self, text, date, mime_type):
def _store(self, text, date, mime_type) -> Document:
# If someone gave us the original filename, use it instead of doc.
file_info = FileInfo.from_filename(self.filename)
stats = os.stat(self.path)
self.log("debug", "Saving record to database")
created = (
file_info.created
or date
or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime))
)
if file_info.created is not None:
create_date = file_info.created
self.log("debug", f"Creation date from FileInfo: {create_date}")
elif date is not None:
create_date = date
self.log("debug", f"Creation date from parse_date: {create_date}")
else:
stats = os.stat(self.path)
create_date = timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime),
)
self.log("debug", f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@@ -400,8 +415,8 @@ class Consumer(LoggingMixin):
content=text,
mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
created=create_date,
modified=create_date,
storage_type=storage_type,
)

View File

@@ -379,6 +379,10 @@ class SavedViewFilterRule(models.Model):
# TODO: why is this in the models file?
# TODO: how about, what is this and where is it documented?
# It appears to be parsing JSON from an environment variable to get a title and date from
# the filename, if possible, as a higher priority than either document filename or
# content parsing
class FileInfo:
REGEXES = OrderedDict(
@@ -386,8 +390,7 @@ class FileInfo:
(
"created-title",
re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)$",
r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$",
flags=re.IGNORECASE,
),
),
@@ -427,7 +430,7 @@ class FileInfo:
properties[name] = getattr(cls, f"_get_{name}")(properties[name])
@classmethod
def from_filename(cls, filename):
def from_filename(cls, filename) -> "FileInfo":
# Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:

View File

@@ -1,3 +1,4 @@
import datetime
import logging
import mimetypes
import os
@@ -5,6 +6,8 @@ import re
import shutil
import subprocess
import tempfile
from typing import Optional
from typing import Set
import magic
from django.conf import settings
@@ -40,11 +43,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type):
def is_mime_type_supported(mime_type) -> bool:
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type):
def get_default_file_extension(mime_type) -> str:
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
@@ -59,14 +62,14 @@ def get_default_file_extension(mime_type):
return ""
def is_file_ext_supported(ext):
def is_file_ext_supported(ext) -> bool:
if ext:
return ext.lower() in get_supported_file_extensions()
else:
return False
def get_supported_file_extensions():
def get_supported_file_extensions() -> Set[str]:
extensions = set()
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
@@ -121,7 +124,7 @@ def run_convert(
auto_orient=False,
extra=None,
logging_group=None,
):
) -> None:
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
@@ -146,11 +149,11 @@ def run_convert(
raise ParseError(f"Convert failed at {args}")
def get_default_thumbnail():
def get_default_thumbnail() -> str:
return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
out_path = os.path.join(temp_dir, "convert_gs.png")
# if convert fails, fall back to extracting
@@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
return get_default_thumbnail()
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
@@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
return out_path
def parse_date(filename, text):
def parse_date(filename, text) -> Optional[datetime.datetime]:
"""
Returns the date of the document.
"""
def __parser(ds, date_order):
def __parser(ds: str, date_order: str) -> datetime.datetime:
"""
Call dateparser.parse with a particular date ordering
"""
@@ -230,9 +233,9 @@ def parse_date(filename, text):
},
)
def __filter(date):
def __filter(date: datetime.datetime) -> Optional[datetime.datetime]:
if (
date
date is not None
and date.year > 1900
and date <= timezone.now()
and date.date() not in settings.IGNORE_DATES
@@ -269,7 +272,7 @@ def parse_date(filename, text):
date = __filter(date)
if date is not None:
break
return date
return date
@@ -294,7 +297,7 @@ class DocumentParser(LoggingMixin):
self.archive_path = None
self.text = None
self.date = None
self.date: Optional[datetime.datetime] = None
self.progress_callback = progress_callback
def progress(self, current_progress, max_progress):
@@ -342,7 +345,7 @@ class DocumentParser(LoggingMixin):
def get_text(self):
return self.text
def get_date(self):
def get_date(self) -> Optional[datetime.datetime]:
return self.date
def cleanup(self):

Binary file not shown.

Binary file not shown.

View File

@@ -1,3 +1,4 @@
import datetime
import os
import re
import shutil
@@ -5,6 +6,8 @@ import tempfile
from unittest import mock
from unittest.mock import MagicMock
from dateutil import tz
try:
import zoneinfo
except ImportError:
@@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: The following error occured while consuming sample.pdf: NO.",
"sample.pdf: The following error occurred while consuming sample.pdf: NO.",
self.consumer.try_consume_file,
filename,
)
@@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase):
sanity_check()
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, TestCase):
    """
    Consumes sample PDFs and checks the ``created`` value of the resulting
    document against the date expected from the file content or filename.
    """

    def setUp(self):
        super().setUp()

        # Keep websocket progress reports from being sent during testing.
        progress_patch = mock.patch("documents.consumer.Consumer._send_progress")
        self._send_progress = progress_patch.start()
        self.addCleanup(progress_patch.stop)

        self.consumer = Consumer()

    def _consume_sample(self, sample_name, target_name):
        """
        Copy the named sample original into the scratch directory under
        ``target_name``, consume the copy and return the created document.
        """
        sample_path = os.path.join(
            os.path.dirname(__file__),
            "samples",
            "documents",
            "originals",
            sample_name,
        )
        scratch_copy = os.path.join(self.dirs.scratch_dir, target_name)
        shutil.copy(sample_path, scratch_copy)

        return self.consumer.try_consume_file(scratch_copy)

    def test_consume_date_from_content(self):
        """
        GIVEN:
            - File content with date in DMY (default) format

        THEN:
            - Should parse the date from the file content
        """
        document = self._consume_sample("0000005.pdf", "sample.pdf")

        expected = datetime.datetime(
            1996,
            2,
            20,
            tzinfo=tz.gettz(settings.TIME_ZONE),
        )
        self.assertEqual(document.created, expected)

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_consume_date_from_filename(self):
        """
        GIVEN:
            - File content with date in DMY (default) format
            - Filename with date in YMD format
            - Filename date order set to YMD

        THEN:
            - Should parse the date from the filename
        """
        document = self._consume_sample("0000005.pdf", "Scan - 2022-02-01.pdf")

        expected = datetime.datetime(
            2022,
            2,
            1,
            tzinfo=tz.gettz(settings.TIME_ZONE),
        )
        self.assertEqual(document.created, expected)

    def test_consume_date_filename_date_use_content(self):
        """
        GIVEN:
            - File content with date in DMY (default) format
            - Filename date parsing disabled (no FILENAME_DATE_ORDER override)
            - Filename with date in YMD format

        THEN:
            - Should parse the date from the content
        """
        document = self._consume_sample("0000005.pdf", "Scan - 2022-02-01.pdf")

        expected = datetime.datetime(
            1996,
            2,
            20,
            tzinfo=tz.gettz(settings.TIME_ZONE),
        )
        self.assertEqual(document.created, expected)

    @override_settings(
        IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)),
    )
    def test_consume_date_use_content_with_ignore(self):
        """
        GIVEN:
            - File content with dates in DMY (default) format
            - File content includes ignored dates

        THEN:
            - Should skip the ignored dates and take the created date from
              the content
        """
        document = self._consume_sample("0000006.pdf", "0000006.pdf")

        expected = datetime.datetime(
            1997,
            2,
            20,
            tzinfo=tz.gettz(settings.TIME_ZONE),
        )
        self.assertEqual(document.created, expected)
class PreConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen")
@override_settings(PRE_CONSUME_SCRIPT=None)

View File

@@ -8,6 +8,7 @@ from django.conf import settings
from django.test import override_settings
from django.test import TestCase
from documents.parsers import parse_date
from paperless.settings import DATE_ORDER
class TestDate(TestCase):
@@ -160,19 +161,112 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475"))
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_valid_ymd(self, *args):
    """
    GIVEN:
        - Date parsing from the filename is enabled
        - Filename date order is Year Month Day (YMD)
        - Filename contains a date matching that order
    THEN:
        - Should parse the date from the filename
    """
    parsed = parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here")
    expected = datetime.datetime(
        2022,
        4,
        1,
        0,
        0,
        tzinfo=tz.gettz(settings.TIME_ZONE),
    )

    self.assertEqual(parsed, expected)
@override_settings(FILENAME_DATE_ORDER="DMY")
def test_filename_date_parse_valid_dmy(self, *args):
    """
    GIVEN:
        - Date parsing from the filename is enabled
        - Filename date order is Day Month Year (DMY)
        - Filename contains a date matching that order
    THEN:
        - Should parse the date from the filename
    """
    parsed = parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here")
    expected = datetime.datetime(
        2021,
        1,
        10,
        0,
        0,
        tzinfo=tz.gettz(settings.TIME_ZONE),
    )

    self.assertEqual(parsed, expected)
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_invalid(self, *args):
    """
    GIVEN:
        - Date parsing from the filename is enabled
        - Filename includes no date
        - File content includes no date
    THEN:
        - No date is parsed
    """
    parsed = parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")

    self.assertIsNone(parsed)
@override_settings(
    FILENAME_DATE_ORDER="YMD",
    IGNORE_DATES=(datetime.date(2022, 4, 1),),
)
def test_filename_date_ignored_use_content(self, *args):
    """
    GIVEN:
        - Date parsing from the filename is enabled
        - Filename date order is Year Month Day (YMD)
        - Content date order is Day Month Year (DMY, the default)
        - Filename contains a date matching the filename order
        - Filename date is an ignored date
        - File content includes a date
    THEN:
        - Should parse the date from the content, not the filename
    """
    parsed = parse_date(
        "/tmp/Scan-2022-04-01.pdf",
        "The matching date is 24.03.2022",
    )
    expected = datetime.datetime(
        2022,
        3,
        24,
        0,
        0,
        tzinfo=tz.gettz(settings.TIME_ZONE),
    )

    self.assertEqual(parsed, expected)
@override_settings(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
)
def test_ignored_dates(self, *args):
def test_ignored_dates_default_order(self, *args):
"""
GIVEN:
- Ignore dates have been set
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the non-ignored date from content
"""
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum"
date = parse_date("", text)
self.assertEqual(
date,
parse_date("", text),
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(
    IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
    DATE_ORDER="YMD",
)
def test_ignored_dates_order_ymd(self, *args):
    """
    GIVEN:
        - Ignore dates have been set
        - Date order is Year Month Day (YMD)
        - File content includes ignored dates
        - File content includes 1 non-ignored date
    THEN:
        - Should parse the non-ignored date from content
    """
    text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum"

    parsed = parse_date("", text)
    expected = datetime.datetime(
        2018,
        2,
        13,
        0,
        0,
        tzinfo=tz.gettz(settings.TIME_ZONE),
    )

    self.assertEqual(parsed, expected)