feat(parsers): add generator for date parsing

This commit is contained in:
Matthias Eck 2022-08-06 13:02:08 +02:00
parent ca75fb5664
commit a5d2ae2588
2 changed files with 36 additions and 31 deletions

View File

@ -6,6 +6,7 @@ import re
import shutil
import subprocess
import tempfile
from typing import Iterator
from typing import Optional
from typing import Set
@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
def parse_date(filename, text) -> Optional[datetime.datetime]:
    """
    Returns the first date that parse_date_generator produces for the given
    filename and text, or None if it produces no date at all.
    """
    candidates = parse_date_generator(filename, text)
    return next(candidates, None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
Returns the date of the document.
"""
@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
return date
return None
date = None
def __process_match(
    match: "re.Match[str]",
    date_order: str,
) -> Optional[datetime.datetime]:
    """
    Turns a single regex match into a date.

    The full matched text is handed to __parser together with date_order,
    and whatever comes back is passed through __filter. If __parser raises
    TypeError or ValueError, None is passed to __filter in its place.
    """
    # The annotation above is a string on purpose: re.Match can only be
    # subscripted on Python 3.9+, and an unquoted annotation is evaluated
    # when the function is defined, which would raise TypeError on older
    # interpreters.
    date_string = match.group(0)

    try:
        date = __parser(date_string, date_order)
    except (TypeError, ValueError):
        # Skip all matches that do not parse to a proper date
        date = None

    return __filter(date)
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
    """
    Lazily yields a date for every DATE_REGEX match in content, in match
    order, leaving out matches for which __process_match returns None.
    """
    parsed = (
        __process_match(hit, date_order)
        for hit in re.finditer(DATE_REGEX, content)
    )
    yield from (found for found in parsed if found is not None)
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
return date
yield from __process_content(text, settings.DATE_ORDER)
class ParseError(Exception):

View File

@ -1,3 +1,4 @@
import itertools
import json
import logging
import os
@ -21,6 +22,7 @@ from django.db.models.functions import Lower
from django.http import Http404
from django.http import HttpResponse
from django.http import HttpResponseBadRequest
from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
@ -70,6 +72,7 @@ from .models import SavedView
from .models import StoragePath
from .models import Tag
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer
@ -329,13 +332,13 @@ class DocumentViewSet(
@action(methods=["get"], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
doc = get_object_or_404(Document, pk=pk)
classifier = load_classifier()
gen = parse_date_generator(doc.filename, doc.content)
dates = {i for i in itertools.islice(gen, 5)}
return Response(
{
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
@ -344,6 +347,9 @@ class DocumentViewSet(
dt.id for dt in match_document_types(doc, classifier)
],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
"dates": [
date.strftime("%Y-%m-%d") for date in dates if date is not None
],
},
)