feat(parsers): add generator for date parsing

This commit is contained in:
Matthias Eck 2022-08-06 13:02:08 +02:00
parent ca75fb5664
commit a5d2ae2588
2 changed files with 36 additions and 31 deletions

View File

@ -6,6 +6,7 @@ import re
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
from typing import Iterator
from typing import Optional from typing import Optional
from typing import Set from typing import Set
@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
def parse_date(filename, text) -> Optional[datetime.datetime]:
    """
    Returns the first date produced by parse_date_generator for the given
    filename and text, or None when it produces nothing.
    """
    candidates = parse_date_generator(filename, text)
    return next(candidates, None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
""" """
Returns the date of the document. Returns the date of the document.
""" """
@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
return date return date
return None return None
def __process_match(
    match: "re.Match[str]",
    date_order: str,
) -> Optional[datetime.datetime]:
    """
    Converts a single regex match into a date.

    The matched text is handed to __parser together with date_order; the
    outcome is then passed through __filter, whose result is returned.
    A match for which __parser raises TypeError or ValueError is forwarded
    to __filter as None.
    """
    # The annotation above is quoted so it is never evaluated: subscripting
    # re.Match at definition time raises TypeError before Python 3.9.
    date_string = match.group(0)

    try:
        date = __parser(date_string, date_order)
    except (TypeError, ValueError):
        # Skip all matches that do not parse to a proper date
        date = None

    return __filter(date)
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
    """
    Yields, in match order, every date obtained from the DATE_REGEX matches
    in content for which __process_match returns something other than None.
    """
    # Both stages are lazy, so matches are only processed as the caller
    # pulls dates from this generator.
    processed = (
        __process_match(hit, date_order) for hit in re.finditer(DATE_REGEX, content)
    )
    yield from (found for found in processed if found is not None)
# if filename date parsing is enabled, search there first: # if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER: if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename): yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
# Iterate through all regex matches in text and try to parse the date # Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text): yield from __process_content(text, settings.DATE_ORDER)
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
return date
class ParseError(Exception): class ParseError(Exception):

View File

@ -1,3 +1,4 @@
import itertools
import json import json
import logging import logging
import os import os
@ -21,6 +22,7 @@ from django.db.models.functions import Lower
from django.http import Http404 from django.http import Http404
from django.http import HttpResponse from django.http import HttpResponse
from django.http import HttpResponseBadRequest from django.http import HttpResponseBadRequest
from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator from django.utils.decorators import method_decorator
from django.utils.translation import get_language from django.utils.translation import get_language
from django.views.decorators.cache import cache_control from django.views.decorators.cache import cache_control
@ -70,6 +72,7 @@ from .models import SavedView
from .models import StoragePath from .models import StoragePath
from .models import Tag from .models import Tag
from .parsers import get_parser_class_for_mime_type from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer from .serialisers import BulkEditSerializer
@ -329,13 +332,13 @@ class DocumentViewSet(
@action(methods=["get"], detail=True) @action(methods=["get"], detail=True)
def suggestions(self, request, pk=None): def suggestions(self, request, pk=None):
try: doc = get_object_or_404(Document, pk=pk)
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
classifier = load_classifier() classifier = load_classifier()
gen = parse_date_generator(doc.filename, doc.content)
dates = {i for i in itertools.islice(gen, 5)}
return Response( return Response(
{ {
"correspondents": [c.id for c in match_correspondents(doc, classifier)], "correspondents": [c.id for c in match_correspondents(doc, classifier)],
@ -344,6 +347,9 @@ class DocumentViewSet(
dt.id for dt in match_document_types(doc, classifier) dt.id for dt in match_document_types(doc, classifier)
], ],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
"dates": [
date.strftime("%Y-%m-%d") for date in dates if date is not None
],
}, },
) )