Merge pull request #1367 from Eckii24/feat/date-suggestions

Adding date suggestions to the document details view
This commit is contained in:
shamoon
2022-08-25 11:47:37 -07:00
committed by GitHub
11 changed files with 114 additions and 34 deletions

View File

@@ -6,6 +6,8 @@ import re
import shutil
import subprocess
import tempfile
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
@@ -216,6 +218,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
# Return the first date produced by parse_date_generator for the given
# filename and text, or None when the generator yields nothing.
def parse_date(filename, text) -> Optional[datetime.datetime]:
# next() with a default of None avoids StopIteration on an empty generator
return next(parse_date_generator(filename, text), None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
Returns the date of the document.
"""
@@ -246,38 +252,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
return date
return None
date = None
# Turn a single regex match into a date result.
# The full matched text is parsed with __parser using the given date_order,
# and whatever __filter returns for the parsed value is the result.
def __process_match(
match: Match[str],
date_order: str,
) -> Optional[datetime.datetime]:
# group(0) is the complete text matched by the pattern
date_string = match.group(0)
try:
date = __parser(date_string, date_order)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
date = None
# NOTE(review): __filter is also called with None after a parse failure;
# its body is not fully visible here -- presumably it returns None, confirm.
return __filter(date)
# Lazily yield a date for every DATE_REGEX match in content, in match order.
# Matches for which __process_match returns None are dropped, so this
# generator never yields None.
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
for m in re.finditer(DATE_REGEX, content):
date = __process_match(m, date_order)
if date is not None:
yield date
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
return date
yield from __process_content(text, settings.DATE_ORDER)
class ParseError(Exception):

View File

@@ -1107,6 +1107,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [],
"document_types": [],
"storage_paths": [],
"dates": [],
},
)
@@ -1118,6 +1119,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_correspondents")
@override_settings(NUMBER_OF_SUGGESTED_DATES=10)
def test_get_suggestions(
self,
match_correspondents,
@@ -1128,7 +1130,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is an invoice!",
content="this is an invoice from 12.04.2022!",
)
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
@@ -1144,6 +1146,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [56, 123],
"document_types": [23],
"storage_paths": [99, 77],
"dates": ["2022-04-12"],
},
)

View File

@@ -8,6 +8,7 @@ from django.conf import settings
from django.test import override_settings
from django.test import TestCase
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
from paperless.settings import DATE_ORDER
@@ -161,6 +162,25 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475"))
# parse_date_generator should yield every accepted date found in the text,
# in the order the dates appear, and leave out the one in the year 9999.
def test_multiple_dates(self):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and Dezember 2021.
But not 24-12-9999 because its in the future..."""
# Drain the generator so that both the count and the order can be checked
dates = list(parse_date_generator("", text))
# Four date-like strings are present, only three are expected back
self.assertEqual(len(dates), 3)
self.assertEqual(
dates[0],
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[1],
datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
# "Dezember 2021" carries no day; the expected value uses the 1st of the month
self.assertEqual(
dates[2],
datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_valid_ymd(self, *args):
"""

View File

@@ -1,3 +1,4 @@
import itertools
import json
import logging
import os
@@ -21,6 +22,7 @@ from django.db.models.functions import Lower
from django.http import Http404
from django.http import HttpResponse
from django.http import HttpResponseBadRequest
from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
@@ -70,6 +72,7 @@ from .models import SavedView
from .models import StoragePath
from .models import Tag
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer
@@ -330,13 +333,15 @@ class DocumentViewSet(
@action(methods=["get"], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
doc = get_object_or_404(Document, pk=pk)
classifier = load_classifier()
gen = parse_date_generator(doc.filename, doc.content)
dates = sorted(
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
)
return Response(
{
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
@@ -345,6 +350,9 @@ class DocumentViewSet(
dt.id for dt in match_document_types(doc, classifier)
],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
"dates": [
date.strftime("%Y-%m-%d") for date in dates if date is not None
],
},
)