diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 721346fb0..26a4e11c6 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -6,6 +6,7 @@ import re import shutil import subprocess import tempfile +from typing import Iterator from typing import Optional from typing import Set @@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: def parse_date(filename, text) -> Optional[datetime.datetime]: + return next(parse_date_generator(filename, text), None) + + +def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: """ Returns the date of the document. """ @@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: return date return None - date = None + def __process_match( + match: re.Match[str], + date_order: str, + ) -> Optional[datetime.datetime]: + date_string = match.group(0) + + try: + date = __parser(date_string, date_order) + except (TypeError, ValueError): + # Skip all matches that do not parse to a proper date + date = None + + return __filter(date) + + def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]: + for m in re.finditer(DATE_REGEX, content): + date = __process_match(m, date_order) + if date is not None: + yield date # if filename date parsing is enabled, search there first: if settings.FILENAME_DATE_ORDER: - for m in re.finditer(DATE_REGEX, filename): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.FILENAME_DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - date = __filter(date) - if date is not None: - return date + yield from __process_content(filename, settings.FILENAME_DATE_ORDER) # Iterate through all regex matches in text and try to parse the date - for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - date = __filter(date) - if date is not None: - return date - - return date + yield from __process_content(text, settings.DATE_ORDER) class ParseError(Exception): diff --git a/src/documents/views.py b/src/documents/views.py index 84fc38a38..f0061cf2b 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,3 +1,4 @@ +import itertools import json import logging import os @@ -21,6 +22,7 @@ from django.db.models.functions import Lower from django.http import Http404 from django.http import HttpResponse from django.http import HttpResponseBadRequest +from django.shortcuts import get_object_or_404 from django.utils.decorators import method_decorator from django.utils.translation import get_language from django.views.decorators.cache import cache_control @@ -70,6 +72,7 @@ from .models import SavedView from .models import StoragePath from .models import Tag from .parsers import get_parser_class_for_mime_type +from .parsers import parse_date_generator from .serialisers import AcknowledgeTasksViewSerializer from .serialisers import BulkDownloadSerializer from .serialisers import BulkEditSerializer @@ -329,13 +332,13 @@ class DocumentViewSet( @action(methods=["get"], detail=True) def suggestions(self, request, pk=None): - try: - doc = Document.objects.get(pk=pk) - except Document.DoesNotExist: - raise Http404() + doc = get_object_or_404(Document, pk=pk) classifier = load_classifier() + gen = parse_date_generator(doc.filename, doc.content) + dates = {i for i in itertools.islice(gen, 5)} + return Response( { "correspondents": [c.id for c in match_correspondents(doc, classifier)], @@ -344,6 +347,9 @@ class DocumentViewSet( dt.id for dt in match_document_types(doc, classifier) ], "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], + "dates": [ + date.strftime("%Y-%m-%d") for date in dates if date is not None + ], }, )