mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
feat(parsers): add generator for date parsing
This commit is contained in:
parent
ca75fb5664
commit
a5d2ae2588
@ -6,6 +6,7 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from typing import Iterator
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from typing import Set
|
from typing import Set
|
||||||
|
|
||||||
@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def parse_date(filename, text) -> Optional[datetime.datetime]:
|
def parse_date(filename, text) -> Optional[datetime.datetime]:
    """
    Returns the first date produced by parse_date_generator for the given
    filename and text, or None when the generator yields nothing.
    """
    candidates = parse_date_generator(filename, text)
    # Only the first candidate is consumed; the generator is not advanced
    # any further.
    return next(candidates, None)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
||||||
"""
|
"""
|
||||||
Returns the date of the document.
|
Returns the date of the document.
|
||||||
"""
|
"""
|
||||||
@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
|
|||||||
return date
|
return date
|
||||||
return None
|
return None
|
||||||
|
|
||||||
date = None
|
def __process_match(
    match: re.Match[str],
    date_order: str,
) -> Optional[datetime.datetime]:
    """
    Turns a single regex match into a date, if possible.

    The complete matched text is handed to __parser together with
    date_order. A TypeError or ValueError raised while parsing is treated
    as "no date". Whatever comes out (including None) is then passed
    through __filter, and that result is returned.
    """
    matched_text = match.group(0)

    try:
        parsed = __parser(matched_text, date_order)
    except (TypeError, ValueError):
        # Skip all matches that do not parse to a proper date
        parsed = None

    # NOTE(review): __filter is defined outside the visible hunk; it is
    # called with None here as well, so it is assumed to accept None.
    return __filter(parsed)
|
||||||
|
|
||||||
|
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
    """
    Lazily yields every usable date found in content.

    Each DATE_REGEX match in content is run through __process_match with
    the given date_order; matches for which it returns None are dropped,
    all others are yielded in match order.
    """
    candidates = (
        __process_match(hit, date_order) for hit in re.finditer(DATE_REGEX, content)
    )
    yield from (found for found in candidates if found is not None)
|
||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
# if filename date parsing is enabled, search there first:
|
||||||
if settings.FILENAME_DATE_ORDER:
|
if settings.FILENAME_DATE_ORDER:
|
||||||
for m in re.finditer(DATE_REGEX, filename):
|
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
|
||||||
date_string = m.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
continue
|
|
||||||
|
|
||||||
date = __filter(date)
|
|
||||||
if date is not None:
|
|
||||||
return date
|
|
||||||
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
# Iterate through all regex matches in text and try to parse the date
|
||||||
for m in re.finditer(DATE_REGEX, text):
|
yield from __process_content(text, settings.DATE_ORDER)
|
||||||
date_string = m.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = __parser(date_string, settings.DATE_ORDER)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
continue
|
|
||||||
|
|
||||||
date = __filter(date)
|
|
||||||
if date is not None:
|
|
||||||
return date
|
|
||||||
|
|
||||||
return date
|
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
|
class ParseError(Exception):
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import itertools
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -21,6 +22,7 @@ from django.db.models.functions import Lower
|
|||||||
from django.http import Http404
|
from django.http import Http404
|
||||||
from django.http import HttpResponse
|
from django.http import HttpResponse
|
||||||
from django.http import HttpResponseBadRequest
|
from django.http import HttpResponseBadRequest
|
||||||
|
from django.shortcuts import get_object_or_404
|
||||||
from django.utils.decorators import method_decorator
|
from django.utils.decorators import method_decorator
|
||||||
from django.utils.translation import get_language
|
from django.utils.translation import get_language
|
||||||
from django.views.decorators.cache import cache_control
|
from django.views.decorators.cache import cache_control
|
||||||
@ -70,6 +72,7 @@ from .models import SavedView
|
|||||||
from .models import StoragePath
|
from .models import StoragePath
|
||||||
from .models import Tag
|
from .models import Tag
|
||||||
from .parsers import get_parser_class_for_mime_type
|
from .parsers import get_parser_class_for_mime_type
|
||||||
|
from .parsers import parse_date_generator
|
||||||
from .serialisers import AcknowledgeTasksViewSerializer
|
from .serialisers import AcknowledgeTasksViewSerializer
|
||||||
from .serialisers import BulkDownloadSerializer
|
from .serialisers import BulkDownloadSerializer
|
||||||
from .serialisers import BulkEditSerializer
|
from .serialisers import BulkEditSerializer
|
||||||
@ -329,13 +332,13 @@ class DocumentViewSet(
|
|||||||
|
|
||||||
@action(methods=["get"], detail=True)
|
@action(methods=["get"], detail=True)
|
||||||
def suggestions(self, request, pk=None):
|
def suggestions(self, request, pk=None):
|
||||||
try:
|
doc = get_object_or_404(Document, pk=pk)
|
||||||
doc = Document.objects.get(pk=pk)
|
|
||||||
except Document.DoesNotExist:
|
|
||||||
raise Http404()
|
|
||||||
|
|
||||||
classifier = load_classifier()
|
classifier = load_classifier()
|
||||||
|
|
||||||
|
gen = parse_date_generator(doc.filename, doc.content)
|
||||||
|
dates = {i for i in itertools.islice(gen, 5)}
|
||||||
|
|
||||||
return Response(
|
return Response(
|
||||||
{
|
{
|
||||||
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
|
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
|
||||||
@ -344,6 +347,9 @@ class DocumentViewSet(
|
|||||||
dt.id for dt in match_document_types(doc, classifier)
|
dt.id for dt in match_document_types(doc, classifier)
|
||||||
],
|
],
|
||||||
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
|
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
|
||||||
|
"dates": [
|
||||||
|
date.strftime("%Y-%m-%d") for date in dates if date is not None
|
||||||
|
],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user