mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Hooks up the new date parser plugin class and fixes up the old tests. Also includes the OCR-language to dateparser-language conversion we now do
This commit is contained in:
@@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger
|
|||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import parse_date
|
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
from documents.plugins.base import AlwaysRunPluginMixin
|
from documents.plugins.base import AlwaysRunPluginMixin
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
from documents.plugins.base import NoCleanupPluginMixin
|
from documents.plugins.base import NoCleanupPluginMixin
|
||||||
from documents.plugins.base import NoSetupPluginMixin
|
from documents.plugins.base import NoSetupPluginMixin
|
||||||
|
from documents.plugins.date_parsing import get_date_parser
|
||||||
from documents.plugins.helpers import ProgressManager
|
from documents.plugins.helpers import ProgressManager
|
||||||
from documents.plugins.helpers import ProgressStatusOptions
|
from documents.plugins.helpers import ProgressStatusOptions
|
||||||
from documents.signals import document_consumption_finished
|
from documents.signals import document_consumption_finished
|
||||||
@@ -426,7 +426,8 @@ class ConsumerPlugin(
|
|||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.PARSE_DATE,
|
ConsumerStatusShortMessage.PARSE_DATE,
|
||||||
)
|
)
|
||||||
date = parse_date(self.filename, text)
|
date_parser = get_date_parser()
|
||||||
|
date = next(date_parser.parse(self.filename, text), None)
|
||||||
archive_path = document_parser.get_archive_path()
|
archive_path = document_parser.get_archive_path()
|
||||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||||
|
|
||||||
|
|||||||
@@ -9,22 +9,17 @@ import subprocess
|
|||||||
import tempfile
|
import tempfile
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from re import Match
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
from documents.loggers import LoggingMixin
|
from documents.loggers import LoggingMixin
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.config import OcrConfig
|
|
||||||
from paperless.utils import ocr_to_dateparser_languages
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import datetime
|
import datetime
|
||||||
from collections.abc import Iterator
|
|
||||||
|
|
||||||
# This regular expression will try to find dates in the document at
|
# This regular expression will try to find dates in the document at
|
||||||
# hand and will match the following formats:
|
# hand and will match the following formats:
|
||||||
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
|
|||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
|
|
||||||
def parse_date(filename, text) -> datetime.datetime | None:
|
|
||||||
return next(parse_date_generator(filename, text), None)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|
||||||
"""
|
|
||||||
Returns the date of the document.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __parser(ds: str, date_order: str) -> datetime.datetime:
|
|
||||||
"""
|
|
||||||
Call dateparser.parse with a particular date ordering
|
|
||||||
"""
|
|
||||||
import dateparser
|
|
||||||
|
|
||||||
ocr_config = OcrConfig()
|
|
||||||
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
|
||||||
ocr_config.language,
|
|
||||||
)
|
|
||||||
|
|
||||||
return dateparser.parse(
|
|
||||||
ds,
|
|
||||||
settings={
|
|
||||||
"DATE_ORDER": date_order,
|
|
||||||
"PREFER_DAY_OF_MONTH": "first",
|
|
||||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
|
||||||
"TIMEZONE": settings.TIME_ZONE,
|
|
||||||
},
|
|
||||||
locales=languages,
|
|
||||||
)
|
|
||||||
|
|
||||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
|
||||||
if (
|
|
||||||
date is not None
|
|
||||||
and date.year > 1900
|
|
||||||
and date <= timezone.now()
|
|
||||||
and date.date() not in settings.IGNORE_DATES
|
|
||||||
):
|
|
||||||
return date
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __process_match(
|
|
||||||
match: Match[str],
|
|
||||||
date_order: str,
|
|
||||||
) -> datetime.datetime | None:
|
|
||||||
date_string = match.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = __parser(date_string, date_order)
|
|
||||||
except Exception:
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
date = None
|
|
||||||
|
|
||||||
return __filter(date)
|
|
||||||
|
|
||||||
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
|
||||||
for m in re.finditer(DATE_REGEX, content):
|
|
||||||
date = __process_match(m, date_order)
|
|
||||||
if date is not None:
|
|
||||||
yield date
|
|
||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
|
||||||
if settings.FILENAME_DATE_ORDER:
|
|
||||||
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
|
|
||||||
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
|
||||||
yield from __process_content(text, settings.DATE_ORDER)
|
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
|
class ParseError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from django.utils import timezone
|
|||||||
from documents.plugins.date_parsing.base import DateParserConfig
|
from documents.plugins.date_parsing.base import DateParserConfig
|
||||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||||
|
from paperless.utils import ocr_to_dateparser_languages
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -73,7 +74,10 @@ def get_date_parser() -> DateParserPluginBase:
|
|||||||
|
|
||||||
# 2. Load configuration from settings
|
# 2. Load configuration from settings
|
||||||
# TODO: Get the language from the settings and/or configuration object, depending
|
# TODO: Get the language from the settings and/or configuration object, depending
|
||||||
languages = settings.DATE_PARSER_LANGUAGES
|
languages = languages = (
|
||||||
|
settings.DATE_PARSER_LANGUAGES
|
||||||
|
or ocr_to_dateparser_languages(settings.OCR_LANGUAGE)
|
||||||
|
)
|
||||||
|
|
||||||
config = DateParserConfig(
|
config = DateParserConfig(
|
||||||
languages=languages,
|
languages=languages,
|
||||||
|
|||||||
@@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
|||||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
|
||||||
@mock.patch("documents.parsers.parse_date_generator")
|
@mock.patch("documents.views.get_date_parser")
|
||||||
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
||||||
def test_get_suggestions_dates_disabled(
|
def test_get_suggestions_dates_disabled(
|
||||||
self,
|
self,
|
||||||
parse_date_generator,
|
mock_get_date_parser: mock.MagicMock,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -1999,7 +1999,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||||
self.assertFalse(parse_date_generator.called)
|
|
||||||
|
mock_get_date_parser.assert_not_called()
|
||||||
|
|
||||||
def test_saved_views(self):
|
def test_saved_views(self):
|
||||||
u1 = User.objects.create_superuser("user1")
|
u1 = User.objects.create_superuser("user1")
|
||||||
|
|||||||
@@ -1,538 +0,0 @@
|
|||||||
import datetime
|
|
||||||
from zoneinfo import ZoneInfo
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pytest_django.fixtures import SettingsWrapper
|
|
||||||
|
|
||||||
from documents.parsers import parse_date
|
|
||||||
from documents.parsers import parse_date_generator
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db()
|
|
||||||
class TestDate:
|
|
||||||
def test_date_format_1(self):
|
|
||||||
text = "lorem ipsum 130218 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_2(self):
|
|
||||||
text = "lorem ipsum 2018 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_3(self):
|
|
||||||
text = "lorem ipsum 20180213 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_4(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_5(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_6(self):
|
|
||||||
text = (
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"Wohnort\n"
|
|
||||||
"3100\n"
|
|
||||||
"IBAN\n"
|
|
||||||
"AT87 4534\n"
|
|
||||||
"1234\n"
|
|
||||||
"1234 5678\n"
|
|
||||||
"BIC\n"
|
|
||||||
"lorem ipsum"
|
|
||||||
)
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_7(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_8(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = (
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"Wohnort\n"
|
|
||||||
"3100\n"
|
|
||||||
"IBAN\n"
|
|
||||||
"AT87 4534\n"
|
|
||||||
"1234\n"
|
|
||||||
"1234 5678\n"
|
|
||||||
"BIC\n"
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"März 2020"
|
|
||||||
)
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2020,
|
|
||||||
3,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_9(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2020,
|
|
||||||
3,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_10(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_11(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_12(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_13(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_14(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_15(self):
|
|
||||||
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_16(self):
|
|
||||||
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_17(self):
|
|
||||||
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_18(self):
|
|
||||||
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_19(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
21,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_20(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_21(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
2,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_22(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
23,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_23(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
24,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_24(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
21,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_25(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
25,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_26(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2019,
|
|
||||||
9,
|
|
||||||
25,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_crazy_date_past(self):
|
|
||||||
assert parse_date("", "01-07-0590 00:00:00") is None
|
|
||||||
|
|
||||||
def test_crazy_date_future(self):
|
|
||||||
assert parse_date("", "01-07-2350 00:00:00") is None
|
|
||||||
|
|
||||||
def test_crazy_date_with_spaces(self):
|
|
||||||
assert parse_date("", "20 408000l 2475") is None
|
|
||||||
|
|
||||||
def test_utf_month_names(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
|
|
||||||
assert parse_date("", "13 décembre 2023") == datetime.datetime(
|
|
||||||
2023,
|
|
||||||
12,
|
|
||||||
13,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "13 août 2022") == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
8,
|
|
||||||
13,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "11 März 2020") == datetime.datetime(
|
|
||||||
2020,
|
|
||||||
3,
|
|
||||||
11,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
|
|
||||||
2018,
|
|
||||||
3,
|
|
||||||
17,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "1. veljače 2016.") == datetime.datetime(
|
|
||||||
2016,
|
|
||||||
2,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "15. února 1985") == datetime.datetime(
|
|
||||||
1985,
|
|
||||||
2,
|
|
||||||
15,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "30. září 2011") == datetime.datetime(
|
|
||||||
2011,
|
|
||||||
9,
|
|
||||||
30,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "28. května 1990") == datetime.datetime(
|
|
||||||
1990,
|
|
||||||
5,
|
|
||||||
28,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "1. grudzień 1997") == datetime.datetime(
|
|
||||||
1997,
|
|
||||||
12,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "17 Şubat 2024") == datetime.datetime(
|
|
||||||
2024,
|
|
||||||
2,
|
|
||||||
17,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
|
|
||||||
2012,
|
|
||||||
8,
|
|
||||||
30,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "17 Eylül 2000") == datetime.datetime(
|
|
||||||
2000,
|
|
||||||
9,
|
|
||||||
17,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
assert parse_date("", "5. október 1992") == datetime.datetime(
|
|
||||||
1992,
|
|
||||||
10,
|
|
||||||
5,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_multiple_dates(self, settings_timezone: ZoneInfo):
|
|
||||||
text = """This text has multiple dates.
|
|
||||||
For example 02.02.2018, 22 July 2022 and December 2021.
|
|
||||||
But not 24-12-9999 because it's in the future..."""
|
|
||||||
dates = list(parse_date_generator("", text))
|
|
||||||
|
|
||||||
assert dates == [
|
|
||||||
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
|
|
||||||
datetime.datetime(
|
|
||||||
2022,
|
|
||||||
7,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
),
|
|
||||||
datetime.datetime(
|
|
||||||
2021,
|
|
||||||
12,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
def test_filename_date_parse_valid_ymd(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Date parsing from the filename is enabled
|
|
||||||
- Filename date format is with Year Month Day (YMD)
|
|
||||||
- Filename contains date matching the format
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- Should parse the date from the filename
|
|
||||||
"""
|
|
||||||
settings.FILENAME_DATE_ORDER = "YMD"
|
|
||||||
|
|
||||||
assert parse_date(
|
|
||||||
"/tmp/Scan-2022-04-01.pdf",
|
|
||||||
"No date in here",
|
|
||||||
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_filename_date_parse_valid_dmy(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Date parsing from the filename is enabled
|
|
||||||
- Filename date format is with Day Month Year (DMY)
|
|
||||||
- Filename contains date matching the format
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- Should parse the date from the filename
|
|
||||||
"""
|
|
||||||
settings.FILENAME_DATE_ORDER = "DMY"
|
|
||||||
assert parse_date(
|
|
||||||
"/tmp/Scan-10.01.2021.pdf",
|
|
||||||
"No date in here",
|
|
||||||
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Date parsing from the filename is enabled
|
|
||||||
- Filename includes no date
|
|
||||||
- File content includes no date
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- No date is parsed
|
|
||||||
"""
|
|
||||||
settings.FILENAME_DATE_ORDER = "YMD"
|
|
||||||
assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
|
|
||||||
|
|
||||||
def test_filename_date_ignored_use_content(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Date parsing from the filename is enabled
|
|
||||||
- Filename date format is with Day Month Year (YMD)
|
|
||||||
- Date order is Day Month Year (DMY, the default)
|
|
||||||
- Filename contains date matching the format
|
|
||||||
- Filename date is an ignored date
|
|
||||||
- File content includes a date
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- Should parse the date from the content not filename
|
|
||||||
"""
|
|
||||||
settings.FILENAME_DATE_ORDER = "YMD"
|
|
||||||
settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
|
|
||||||
assert parse_date(
|
|
||||||
"/tmp/Scan-2022-04-01.pdf",
|
|
||||||
"The matching date is 24.03.2022",
|
|
||||||
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_ignored_dates_default_order(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Ignore dates have been set
|
|
||||||
- File content includes ignored dates
|
|
||||||
- File content includes 1 non-ignored date
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- Should parse the date non-ignored date from content
|
|
||||||
"""
|
|
||||||
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
|
||||||
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2018,
|
|
||||||
2,
|
|
||||||
13,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_ignored_dates_order_ymd(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- Ignore dates have been set
|
|
||||||
- Date order is Year Month Date (YMD)
|
|
||||||
- File content includes ignored dates
|
|
||||||
- File content includes 1 non-ignored date
|
|
||||||
|
|
||||||
THEN:
|
|
||||||
- Should parse the date non-ignored date from content
|
|
||||||
"""
|
|
||||||
|
|
||||||
settings.FILENAME_DATE_ORDER = "YMD"
|
|
||||||
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
|
||||||
|
|
||||||
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
|
|
||||||
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2018,
|
|
||||||
2,
|
|
||||||
13,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
@@ -148,7 +148,6 @@ from documents.models import Workflow
|
|||||||
from documents.models import WorkflowAction
|
from documents.models import WorkflowAction
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import parse_date_generator
|
|
||||||
from documents.permissions import AcknowledgeTasksPermissions
|
from documents.permissions import AcknowledgeTasksPermissions
|
||||||
from documents.permissions import PaperlessAdminPermissions
|
from documents.permissions import PaperlessAdminPermissions
|
||||||
from documents.permissions import PaperlessNotePermissions
|
from documents.permissions import PaperlessNotePermissions
|
||||||
@@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user
|
|||||||
from documents.permissions import get_objects_for_user_owner_aware
|
from documents.permissions import get_objects_for_user_owner_aware
|
||||||
from documents.permissions import has_perms_owner_aware
|
from documents.permissions import has_perms_owner_aware
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
|
from documents.plugins.date_parsing import get_date_parser
|
||||||
from documents.schema import generate_object_with_permissions_schema
|
from documents.schema import generate_object_with_permissions_schema
|
||||||
from documents.serialisers import AcknowledgeTasksViewSerializer
|
from documents.serialisers import AcknowledgeTasksViewSerializer
|
||||||
from documents.serialisers import BulkDownloadSerializer
|
from documents.serialisers import BulkDownloadSerializer
|
||||||
@@ -1023,7 +1023,8 @@ class DocumentViewSet(
|
|||||||
|
|
||||||
dates = []
|
dates = []
|
||||||
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
||||||
gen = parse_date_generator(doc.filename, doc.content)
|
date_parser = get_date_parser()
|
||||||
|
gen = date_parser.parse(doc.filename, doc.content)
|
||||||
dates = sorted(
|
dates = sorted(
|
||||||
{
|
{
|
||||||
i
|
i
|
||||||
|
|||||||
Reference in New Issue
Block a user