Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-04-02 13:45:10 -05:00)
Merge pull request #721 from paperless-ngx/bug-fix-date-ignore
Fix Ignore Date Parsing
This commit is contained in commit 536576518e.
@@ -650,7 +650,6 @@ PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
 
     Defaults to "PATCHT"
 
-
 PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
     On smaller systems, or even in the case of Very Large Documents, the consumer
    may explode, complaining about how it's "unable to extend pixel cache". In
@@ -696,6 +695,9 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
     The filename will be checked first, and if nothing is found, the document
     text will be checked as normal.
 
+    A date in a filename must have some separators (`.`, `-`, `/`, etc)
+    for it to be parsed.
+
     Defaults to none, which disables this feature.
 
 PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
@@ -713,10 +715,7 @@ PAPERLESS_IGNORE_DATES=<string>
     this process. This is useful for special dates (like date of birth) that appear
     in documents regularly but are very unlikely to be the documents creation date.
 
-    You may specify dates in a multitude of formats supported by dateparser (see
-    https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates
-    need to be comma separated, the options are limited.
-    Example: "2020-12-02,22.04.1999"
+    The date is parsed using the order specified in PAPERLESS_DATE_ORDER
 
     Defaults to an empty string to not ignore any dates.
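A minimal sketch of how such a comma-separated value is interpreted under a given date order, mirroring the _parse_ignore_dates helper added to settings.py later in this diff. It assumes the dateparser package is installed; the example values are taken from the new test cases.

    import dateparser

    def ignore_dates_from(env_value: str, date_order: str = "DMY") -> set:
        # Split the comma-separated string and parse each chunk with the given order.
        ignored = set()
        for chunk in env_value.split(","):
            parsed = dateparser.parse(chunk, settings={"DATE_ORDER": date_order})
            if parsed:
                ignored.add(parsed.date())
        return ignored

    print(ignore_dates_from("11.01.2001,15-06-1996", "DMY"))
    # {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)}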
@@ -3,6 +3,8 @@ import hashlib
 import os
 import uuid
 from subprocess import Popen
+from typing import Optional
+from typing import Type
 
 import magic
 from asgiref.sync import async_to_sync
@@ -23,6 +25,7 @@ from .models import Document
 from .models import DocumentType
 from .models import FileInfo
 from .models import Tag
+from .parsers import DocumentParser
 from .parsers import get_parser_class_for_mime_type
 from .parsers import parse_date
 from .parsers import ParseError
@@ -186,7 +189,7 @@ class Consumer(LoggingMixin):
         override_document_type_id=None,
         override_tag_ids=None,
         task_id=None,
-    ):
+    ) -> Document:
         """
         Return the document object if it was successfully created.
         """
@@ -220,7 +223,10 @@ class Consumer(LoggingMixin):
 
         self.log("debug", f"Detected mime type: {mime_type}")
 
-        parser_class = get_parser_class_for_mime_type(mime_type)
+        # Based on the mime type, get the parser for that type
+        parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
+            mime_type,
+        )
         if not parser_class:
             self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
 
@@ -241,7 +247,10 @@ class Consumer(LoggingMixin):
 
         # This doesn't parse the document yet, but gives us a parser.
 
-        document_parser = parser_class(self.logging_group, progress_callback)
+        document_parser: DocumentParser = parser_class(
+            self.logging_group,
+            progress_callback,
+        )
 
         self.log("debug", f"Parser: {type(document_parser).__name__}")
 
@@ -270,7 +279,7 @@ class Consumer(LoggingMixin):
 
         text = document_parser.get_text()
         date = document_parser.get_date()
-        if not date:
+        if date is None:
             self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
             date = parse_date(self.filename, text)
         archive_path = document_parser.get_archive_path()
@@ -342,7 +351,7 @@ class Consumer(LoggingMixin):
             ).hexdigest()
 
             # Don't save with the lock active. Saving will cause the file
-            # renaming logic to aquire the lock as well.
+            # renaming logic to acquire the lock as well.
             document.save()
 
             # Delete the file only if it was successfully consumed
@@ -362,7 +371,8 @@ class Consumer(LoggingMixin):
         except Exception as e:
             self._fail(
                 str(e),
-                f"The following error occured while consuming " f"{self.filename}: {e}",
+                f"The following error occurred while consuming "
+                f"{self.filename}: {e}",
                 exc_info=True,
             )
         finally:
@@ -376,21 +386,26 @@ class Consumer(LoggingMixin):
 
         return document
 
-    def _store(self, text, date, mime_type):
+    def _store(self, text, date, mime_type) -> Document:
 
         # If someone gave us the original filename, use it instead of doc.
 
         file_info = FileInfo.from_filename(self.filename)
 
-        stats = os.stat(self.path)
-
         self.log("debug", "Saving record to database")
 
-        created = (
-            file_info.created
-            or date
-            or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime))
-        )
+        if file_info.created is not None:
+            create_date = file_info.created
+            self.log("debug", f"Creation date from FileInfo: {create_date}")
+        elif date is not None:
+            create_date = date
+            self.log("debug", f"Creation date from parse_date: {create_date}")
+        else:
+            stats = os.stat(self.path)
+            create_date = timezone.make_aware(
+                datetime.datetime.fromtimestamp(stats.st_mtime),
+            )
+            self.log("debug", f"Creation date from st_mtime: {create_date}")
 
         storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 
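In short, the hunk above replaces the old `created = (a or b or c)` expression with an explicit precedence chain plus a debug log per branch. A standalone sketch of that precedence follows; the names are stand-ins for the consumer's attributes, and the real code additionally wraps the mtime fallback in django.utils.timezone.make_aware.

    import datetime
    import os
    from typing import Optional

    def pick_create_date(
        filename_date: Optional[datetime.datetime],
        content_date: Optional[datetime.datetime],
        path: str,
    ) -> datetime.datetime:
        # 1) a date found in the filename (FileInfo) wins,
        # 2) otherwise the date parsed from the document text,
        # 3) otherwise fall back to the file's modification time.
        if filename_date is not None:
            return filename_date
        if content_date is not None:
            return content_date
        return datetime.datetime.fromtimestamp(os.stat(path).st_mtime)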
@@ -400,8 +415,8 @@ class Consumer(LoggingMixin):
                 content=text,
                 mime_type=mime_type,
                 checksum=hashlib.md5(f.read()).hexdigest(),
-                created=created,
-                modified=created,
+                created=create_date,
+                modified=create_date,
                 storage_type=storage_type,
             )
 
@@ -379,6 +379,10 @@ class SavedViewFilterRule(models.Model):
 
 
 # TODO: why is this in the models file?
+# TODO: how about, what is this and where is it documented?
+# It appears to parsing JSON from an environment variable to get a title and date from
+# the filename, if possible, as a higher priority than either document filename or
+# content parsing
 class FileInfo:
 
     REGEXES = OrderedDict(
@@ -386,8 +390,7 @@ class FileInfo:
         (
             "created-title",
             re.compile(
-                r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
-                r"(?P<title>.*)$",
+                r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$",
                 flags=re.IGNORECASE,
             ),
         ),
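The `\d{8}(\d{6})?Z` form is equivalent to the old run of `\d\d…` but easier to read. A quick check against a made-up filename in the `YYYYMMDD[HHMMSS]Z - title` shape this regex targets:

    import re

    CREATED_TITLE = re.compile(
        r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$",
        flags=re.IGNORECASE,
    )

    m = CREATED_TITLE.match("20220401120000Z - Quarterly report.pdf")
    print(m.group("created"), m.group("title"))  # 20220401120000Z Quarterly report.pdf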
@@ -427,7 +430,7 @@ class FileInfo:
             properties[name] = getattr(cls, f"_get_{name}")(properties[name])
 
     @classmethod
-    def from_filename(cls, filename):
+    def from_filename(cls, filename) -> "FileInfo":
         # Mutate filename in-place before parsing its components
         # by applying at most one of the configured transformations.
         for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import mimetypes
 import os
@@ -5,6 +6,8 @@ import re
 import shutil
 import subprocess
 import tempfile
+from typing import Optional
+from typing import Set
 
 import magic
 from django.conf import settings
@@ -40,11 +43,11 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger("paperless.parsing")
 
 
-def is_mime_type_supported(mime_type):
+def is_mime_type_supported(mime_type) -> bool:
     return get_parser_class_for_mime_type(mime_type) is not None
 
 
-def get_default_file_extension(mime_type):
+def get_default_file_extension(mime_type) -> str:
     for response in document_consumer_declaration.send(None):
         parser_declaration = response[1]
         supported_mime_types = parser_declaration["mime_types"]
@@ -59,14 +62,14 @@ def get_default_file_extension(mime_type):
     return ""
 
 
-def is_file_ext_supported(ext):
+def is_file_ext_supported(ext) -> bool:
     if ext:
         return ext.lower() in get_supported_file_extensions()
     else:
         return False
 
 
-def get_supported_file_extensions():
+def get_supported_file_extensions() -> Set[str]:
     extensions = set()
     for response in document_consumer_declaration.send(None):
         parser_declaration = response[1]
@@ -121,7 +124,7 @@ def run_convert(
     auto_orient=False,
     extra=None,
     logging_group=None,
-):
+) -> None:
 
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
@@ -146,11 +149,11 @@ def run_convert(
         raise ParseError(f"Convert failed at {args}")
 
 
-def get_default_thumbnail():
+def get_default_thumbnail() -> str:
     return os.path.join(os.path.dirname(__file__), "resources", "document.png")
 
 
-def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
+def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
     out_path = os.path.join(temp_dir, "convert_gs.png")
 
     # if convert fails, fall back to extracting
@@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
         return get_default_thumbnail()
 
 
-def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
+def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
     """
     The thumbnail of a PDF is just a 500px wide image of the first page.
     """
@@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
     return out_path
 
 
-def parse_date(filename, text):
+def parse_date(filename, text) -> Optional[datetime.datetime]:
     """
     Returns the date of the document.
     """
 
-    def __parser(ds, date_order):
+    def __parser(ds: str, date_order: str) -> datetime.datetime:
         """
         Call dateparser.parse with a particular date ordering
         """
@@ -230,9 +233,9 @@ def parse_date(filename, text):
             },
         )
 
-    def __filter(date):
+    def __filter(date: datetime.datetime) -> Optional[datetime.datetime]:
         if (
-            date
+            date is not None
             and date.year > 1900
             and date <= timezone.now()
             and date.date() not in settings.IGNORE_DATES
@@ -269,7 +272,7 @@ def parse_date(filename, text):
 
         date = __filter(date)
         if date is not None:
-            break
+            return date
 
     return date
 
@@ -294,7 +297,7 @@ class DocumentParser(LoggingMixin):
 
         self.archive_path = None
         self.text = None
-        self.date = None
+        self.date: Optional[datetime.datetime] = None
         self.progress_callback = progress_callback
 
     def progress(self, current_progress, max_progress):
@@ -342,7 +345,7 @@ class DocumentParser(LoggingMixin):
     def get_text(self):
         return self.text
 
-    def get_date(self):
+    def get_date(self) -> Optional[datetime.datetime]:
         return self.date
 
     def cleanup(self):
BIN  src/documents/tests/samples/documents/originals/0000005.pdf (new executable file; binary file not shown)
BIN  src/documents/tests/samples/documents/originals/0000006.pdf (new executable file; binary file not shown)
@@ -1,3 +1,4 @@
+import datetime
 import os
 import re
 import shutil
@@ -5,6 +6,8 @@ import tempfile
 from unittest import mock
 from unittest.mock import MagicMock
 
+from dateutil import tz
+
 try:
     import zoneinfo
 except ImportError:
@@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
 
         self.assertRaisesMessage(
             ConsumerError,
-            "sample.pdf: The following error occured while consuming sample.pdf: NO.",
+            "sample.pdf: The following error occurred while consuming sample.pdf: NO.",
             self.consumer.try_consume_file,
             filename,
         )
@@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase):
         sanity_check()
 
 
+@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
+class TestConsumerCreatedDate(DirectoriesMixin, TestCase):
+    def setUp(self):
+        super(TestConsumerCreatedDate, self).setUp()
+
+        # this prevents websocket message reports during testing.
+        patcher = mock.patch("documents.consumer.Consumer._send_progress")
+        self._send_progress = patcher.start()
+        self.addCleanup(patcher.stop)
+
+        self.consumer = Consumer()
+
+    def test_consume_date_from_content(self):
+        """
+        GIVEN:
+            - File content with date in DMY (default) format
+
+        THEN:
+            - Should parse the date from the file content
+        """
+        src = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "documents",
+            "originals",
+            "0000005.pdf",
+        )
+        dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
+        shutil.copy(src, dst)
+
+        document = self.consumer.try_consume_file(dst)
+
+        self.assertEqual(
+            document.created,
+            datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+    @override_settings(FILENAME_DATE_ORDER="YMD")
+    def test_consume_date_from_filename(self):
+        """
+        GIVEN:
+            - File content with date in DMY (default) format
+            - Filename with date in YMD format
+
+        THEN:
+            - Should parse the date from the filename
+        """
+        src = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "documents",
+            "originals",
+            "0000005.pdf",
+        )
+        dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf")
+        shutil.copy(src, dst)
+
+        document = self.consumer.try_consume_file(dst)
+
+        self.assertEqual(
+            document.created,
+            datetime.datetime(2022, 2, 1, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+    def test_consume_date_filename_date_use_content(self):
+        """
+        GIVEN:
+            - File content with date in DMY (default) format
+            - Filename date parsing disabled
+            - Filename with date in YMD format
+
+        THEN:
+            - Should parse the date from the content
+        """
+        src = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "documents",
+            "originals",
+            "0000005.pdf",
+        )
+        dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf")
+        shutil.copy(src, dst)
+
+        document = self.consumer.try_consume_file(dst)
+
+        self.assertEqual(
+            document.created,
+            datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+    @override_settings(
+        IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)),
+    )
+    def test_consume_date_use_content_with_ignore(self):
+        """
+        GIVEN:
+            - File content with dates in DMY (default) format
+            - File content includes ignored dates
+
+        THEN:
+            - Should parse the date from the filename
+        """
+        src = os.path.join(
+            os.path.dirname(__file__),
+            "samples",
+            "documents",
+            "originals",
+            "0000006.pdf",
+        )
+        dst = os.path.join(self.dirs.scratch_dir, "0000006.pdf")
+        shutil.copy(src, dst)
+
+        document = self.consumer.try_consume_file(dst)
+
+        self.assertEqual(
+            document.created,
+            datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+
 class PreConsumeTestCase(TestCase):
     @mock.patch("documents.consumer.Popen")
     @override_settings(PRE_CONSUME_SCRIPT=None)
@@ -8,6 +8,7 @@ from django.conf import settings
 from django.test import override_settings
 from django.test import TestCase
 from documents.parsers import parse_date
+from paperless.settings import DATE_ORDER
 
 
 class TestDate(TestCase):
@@ -160,19 +161,112 @@ class TestDate(TestCase):
     def test_crazy_date_with_spaces(self, *args):
         self.assertIsNone(parse_date("", "20 408000l 2475"))
 
+    @override_settings(FILENAME_DATE_ORDER="YMD")
+    def test_filename_date_parse_valid_ymd(self, *args):
+        """
+        GIVEN:
+            - Date parsing from the filename is enabled
+            - Filename date format is with Year Month Day (YMD)
+            - Filename contains date matching the format
+
+        THEN:
+            - Should parse the date from the filename
+        """
+        self.assertEqual(
+            parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
+            datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+    @override_settings(FILENAME_DATE_ORDER="DMY")
+    def test_filename_date_parse_valid_dmy(self, *args):
+        """
+        GIVEN:
+            - Date parsing from the filename is enabled
+            - Filename date format is with Day Month Year (DMY)
+            - Filename contains date matching the format
+
+        THEN:
+            - Should parse the date from the filename
+        """
+        self.assertEqual(
+            parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"),
+            datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
     @override_settings(FILENAME_DATE_ORDER="YMD")
     def test_filename_date_parse_invalid(self, *args):
+        """
+        GIVEN:
+            - Date parsing from the filename is enabled
+            - Filename includes no date
+            - File content includes no date
+
+        THEN:
+            - No date is parsed
+        """
         self.assertIsNone(
             parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"),
         )
 
+    @override_settings(
+        FILENAME_DATE_ORDER="YMD",
+        IGNORE_DATES=(datetime.date(2022, 4, 1),),
+    )
+    def test_filename_date_ignored_use_content(self, *args):
+        """
+        GIVEN:
+            - Date parsing from the filename is enabled
+            - Filename date format is with Day Month Year (YMD)
+            - Date order is Day Month Year (DMY, the default)
+            - Filename contains date matching the format
+            - Filename date is an ignored date
+            - File content includes a date
+
+        THEN:
+            - Should parse the date from the content not filename
+        """
+        self.assertEqual(
+            parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"),
+            datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
     @override_settings(
         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
     )
-    def test_ignored_dates(self, *args):
+    def test_ignored_dates_default_order(self, *args):
+        """
+        GIVEN:
+            - Ignore dates have been set
+            - File content includes ignored dates
+            - File content includes 1 non-ignored date
+
+        THEN:
+            - Should parse the date non-ignored date from content
+        """
         text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum"
-        date = parse_date("", text)
         self.assertEqual(
-            date,
+            parse_date("", text),
+            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
+        )
+
+    @override_settings(
+        IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
+        DATE_ORDER="YMD",
+    )
+    def test_ignored_dates_order_ymd(self, *args):
+        """
+        GIVEN:
+            - Ignore dates have been set
+            - Date order is Year Month Date (YMD)
+            - File content includes ignored dates
+            - File content includes 1 non-ignored date
+
+        THEN:
+            - Should parse the date non-ignored date from content
+        """
+        text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum"
+
+        self.assertEqual(
+            parse_date("", text),
             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
         )
@@ -1,9 +1,11 @@
+import datetime
 import json
 import math
 import multiprocessing
 import os
 import re
 from typing import Final
+from typing import Set
 from urllib.parse import urlparse
 
 from concurrent_log_handler.queue import setup_logging_queues
@@ -603,16 +605,42 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
 if PAPERLESS_TIKA_ENABLED:
     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
 
-# List dates that should be ignored when trying to parse date from document text
-IGNORE_DATES = set()
 
-if os.getenv("PAPERLESS_IGNORE_DATES", ""):
+def _parse_ignore_dates(
+    env_ignore: str,
+    date_order: str = DATE_ORDER,
+) -> Set[datetime.datetime]:
+    """
+    If the PAPERLESS_IGNORE_DATES environment variable is set, parse the
+    user provided string(s) into dates
+
+    Args:
+        env_ignore (str): The value of the environment variable, comma seperated dates
+        date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER.
+
+    Returns:
+        Set[datetime.datetime]: The set of parsed date objects
+    """
     import dateparser
 
-    for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
-        d = dateparser.parse(s)
+    ignored_dates = set()
+    for s in env_ignore.split(","):
+        d = dateparser.parse(
+            s,
+            settings={
+                "DATE_ORDER": date_order,
+            },
+        )
         if d:
-            IGNORE_DATES.add(d.date())
+            ignored_dates.add(d.date())
+    return ignored_dates
+
+
+# List dates that should be ignored when trying to parse date from document text
+IGNORE_DATES: Set[datetime.date] = set()
+
+if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
+    IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES"))
 
 ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
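A quick usage sketch for the new helper; the inputs and expected sets below are lifted from the test cases added in src/paperless/tests/test_settings.py, and the import works the same way that test module does it (it requires a configured settings environment).

    from paperless.settings import _parse_ignore_dates

    # Comma separated values, honouring the passed date order:
    _parse_ignore_dates("1985-05-01,1991-12-05", "YMD")
    # -> {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)}
    _parse_ignore_dates("11.01.10", "DMY")
    # -> {datetime.date(2010, 1, 11)}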
NEW  src/paperless/tests/test_settings.py (58 lines)
@@ -0,0 +1,58 @@
+import datetime
+from unittest import TestCase
+
+from paperless.settings import _parse_ignore_dates
+
+
+class TestIgnoreDateParsing(TestCase):
+    """
+    Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
+    """
+
+    def _parse_checker(self, test_cases):
+        """
+        Helper function to check ignore date parsing
+
+        Args:
+            test_cases (_type_): _description_
+        """
+        for env_str, date_format, expected_date_set in test_cases:
+
+            self.assertSetEqual(
+                _parse_ignore_dates(env_str, date_format),
+                expected_date_set,
+            )
+
+    def test_no_ignore_dates_set(self):
+        """
+        GIVEN:
+            - No ignore dates are set
+        THEN:
+            - No ignore dates are parsed
+        """
+        self.assertSetEqual(_parse_ignore_dates(""), set())
+
+    def test_single_ignore_dates_set(self):
+        """
+        GIVEN:
+            - Ignore dates are set per certain inputs
+        THEN:
+            - All ignore dates are parsed
+        """
+        test_cases = [
+            ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}),
+            (
+                "1985-05-01,1991-12-05",
+                "YMD",
+                {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)},
+            ),
+            ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}),
+            ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}),
+            (
+                "11.01.2001,15-06-1996",
+                "DMY",
+                {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)},
+            ),
+        ]
+
+        self._parse_checker(test_cases)