mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	feat(parsers): add generator for date parsing
This commit is contained in:
		| @@ -6,6 +6,7 @@ import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import tempfile | ||||
| from typing import Iterator | ||||
| from typing import Optional | ||||
| from typing import Set | ||||
|  | ||||
| @@ -216,6 +217,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: | ||||
|  | ||||
|  | ||||
| def parse_date(filename, text) -> Optional[datetime.datetime]: | ||||
|     return next(parse_date_generator(filename, text), None) | ||||
|  | ||||
|  | ||||
| def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|     """ | ||||
| @@ -246,38 +251,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: | ||||
|             return date | ||||
|         return None | ||||
|  | ||||
|     date = None | ||||
|     def __process_match( | ||||
|         match: re.Match[str], | ||||
|         date_order: str, | ||||
|     ) -> Optional[datetime.datetime]: | ||||
|         date_string = match.group(0) | ||||
|  | ||||
|         try: | ||||
|             date = __parser(date_string, date_order) | ||||
|         except (TypeError, ValueError): | ||||
|             # Skip all matches that do not parse to a proper date | ||||
|             date = None | ||||
|  | ||||
|         return __filter(date) | ||||
|  | ||||
|     def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]: | ||||
|         for m in re.finditer(DATE_REGEX, content): | ||||
|             date = __process_match(m, date_order) | ||||
|             if date is not None: | ||||
|                 yield date | ||||
|  | ||||
|     # if filename date parsing is enabled, search there first: | ||||
|     if settings.FILENAME_DATE_ORDER: | ||||
|         for m in re.finditer(DATE_REGEX, filename): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
|             except (TypeError, ValueError): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             date = __filter(date) | ||||
|             if date is not None: | ||||
|                 return date | ||||
|         yield from __process_content(filename, settings.FILENAME_DATE_ORDER) | ||||
|  | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
|     for m in re.finditer(DATE_REGEX, text): | ||||
|         date_string = m.group(0) | ||||
|  | ||||
|         try: | ||||
|             date = __parser(date_string, settings.DATE_ORDER) | ||||
|         except (TypeError, ValueError): | ||||
|             # Skip all matches that do not parse to a proper date | ||||
|             continue | ||||
|  | ||||
|         date = __filter(date) | ||||
|         if date is not None: | ||||
|             return date | ||||
|  | ||||
|     return date | ||||
|     yield from __process_content(text, settings.DATE_ORDER) | ||||
|  | ||||
|  | ||||
| class ParseError(Exception): | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| import itertools | ||||
| import json | ||||
| import logging | ||||
| import os | ||||
| @@ -21,6 +22,7 @@ from django.db.models.functions import Lower | ||||
| from django.http import Http404 | ||||
| from django.http import HttpResponse | ||||
| from django.http import HttpResponseBadRequest | ||||
| from django.shortcuts import get_object_or_404 | ||||
| from django.utils.decorators import method_decorator | ||||
| from django.utils.translation import get_language | ||||
| from django.views.decorators.cache import cache_control | ||||
| @@ -70,6 +72,7 @@ from .models import SavedView | ||||
| from .models import StoragePath | ||||
| from .models import Tag | ||||
| from .parsers import get_parser_class_for_mime_type | ||||
| from .parsers import parse_date_generator | ||||
| from .serialisers import AcknowledgeTasksViewSerializer | ||||
| from .serialisers import BulkDownloadSerializer | ||||
| from .serialisers import BulkEditSerializer | ||||
| @@ -329,13 +332,13 @@ class DocumentViewSet( | ||||
|  | ||||
|     @action(methods=["get"], detail=True) | ||||
|     def suggestions(self, request, pk=None): | ||||
|         try: | ||||
|             doc = Document.objects.get(pk=pk) | ||||
|         except Document.DoesNotExist: | ||||
|             raise Http404() | ||||
|         doc = get_object_or_404(Document, pk=pk) | ||||
|  | ||||
|         classifier = load_classifier() | ||||
|  | ||||
|         gen = parse_date_generator(doc.filename, doc.content) | ||||
|         dates = {i for i in itertools.islice(gen, 5)} | ||||
|  | ||||
|         return Response( | ||||
|             { | ||||
|                 "correspondents": [c.id for c in match_correspondents(doc, classifier)], | ||||
| @@ -344,6 +347,9 @@ class DocumentViewSet( | ||||
|                     dt.id for dt in match_document_types(doc, classifier) | ||||
|                 ], | ||||
|                 "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], | ||||
|                 "dates": [ | ||||
|                     date.strftime("%Y-%m-%d") for date in dates if date is not None | ||||
|                 ], | ||||
|             }, | ||||
|         ) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Matthias Eck
					Matthias Eck