diff --git a/docs/configuration.rst b/docs/configuration.rst index dce6b3a83..1c1c54806 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -741,6 +741,19 @@ PAPERLESS_FILENAME_DATE_ORDER=<format> Defaults to none, which disables this feature. +PAPERLESS_NUMBER_OF_SUGGESTED_DATES=<num> + Paperless searches an entire document for dates. The first date found will + be used as the initial value for the created date. When this variable is + greater than 0 (or left at its default value), paperless will also suggest + other dates found in the document, up to a maximum of this setting. Note that + duplicates will be removed, which can result in fewer dates displayed in the + frontend than this setting value. + + The task to find all dates can be time-consuming and increases with a higher + (maximum) number of suggested dates and slower hardware. + + Defaults to 3. Set to 0 to disable this feature. + PAPERLESS_THUMBNAIL_FONT_NAME=<filename> Paperless creates thumbnails for plain text files by rendering the content of the file on an image and uses a predefined font for that. 
This diff --git a/paperless.conf.example b/paperless.conf.example index d7319689b..26fc327c3 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -69,6 +69,7 @@ #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] +#PAPERLESS_NUMBER_OF_SUGGESTED_DATES=5 #PAPERLESS_THUMBNAIL_FONT_NAME= #PAPERLESS_IGNORE_DATES= #PAPERLESS_ENABLE_UPDATE_CHECK= diff --git a/src-ui/src/app/components/common/input/date/date.component.html b/src-ui/src/app/components/common/input/date/date.component.html index e742ead9b..926429a8d 100644 --- a/src-ui/src/app/components/common/input/date/date.component.html +++ b/src-ui/src/app/components/common/input/date/date.component.html @@ -12,4 +12,10 @@ </div> <div class="invalid-feedback" i18n>Invalid date.</div> <small *ngIf="hint" class="form-text text-muted">{{hint}}</small> + <small *ngIf="getSuggestions().length > 0"> + <span i18n>Suggestions:</span> + <ng-container *ngFor="let s of getSuggestions()"> + <a (click)="onSuggestionClick(s)" [routerLink]="[]">{{s}}</a> + </ng-container> + </small> </div> diff --git a/src-ui/src/app/components/common/input/date/date.component.ts b/src-ui/src/app/components/common/input/date/date.component.ts index 168745910..3f52dbd2e 100644 --- a/src-ui/src/app/components/common/input/date/date.component.ts +++ b/src-ui/src/app/components/common/input/date/date.component.ts @@ -1,4 +1,4 @@ -import { Component, forwardRef, OnInit } from '@angular/core' +import { Component, forwardRef, Input, OnInit } from '@angular/core' import { NG_VALUE_ACCESSOR } from '@angular/forms' import { NgbDateAdapter, @@ -31,6 +31,28 @@ export class DateComponent super() } + @Input() + suggestions: string[] + + getSuggestions() { + return this.suggestions == null + ? 
[] + : this.suggestions + .map((s) => this.ngbDateParserFormatter.parse(s)) + .filter( + (d) => + this.value === null || // if value is not set, take all suggestions + this.value != this.isoDateAdapter.toModel(d) // otherwise filter out current date + ) + .map((s) => this.ngbDateParserFormatter.format(s)) + } + + onSuggestionClick(dateString: string) { + const parsedDate = this.ngbDateParserFormatter.parse(dateString) + this.writeValue(this.isoDateAdapter.toModel(parsedDate)) + this.onChange(this.value) + } + ngOnInit(): void { super.ngOnInit() this.placeholder = this.settings.getLocalizedDateInputFormat() diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 764a587e0..b20f3facd 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -74,7 +74,8 @@ <app-input-text #inputTitle i18n-title title="Title" formControlName="title" (keyup)="titleKeyUp($event)" [error]="error?.title"></app-input-text> <app-input-number i18n-title title="Archive serial number" [error]="error?.archive_serial_number" formControlName='archive_serial_number'></app-input-number> - <app-input-date i18n-title title="Date created" formControlName="created_date" [error]="error?.created_date"></app-input-date> + <app-input-date i18n-title title="Date created" formControlName="created_date" [suggestions]="suggestions?.dates" + [error]="error?.created_date"></app-input-date> <app-input-select [items]="correspondents" i18n-title title="Correspondent" formControlName="correspondent" [allowNull]="true" (createNew)="createCorrespondent($event)" [suggestions]="suggestions?.correspondents"></app-input-select> <app-input-select [items]="documentTypes" i18n-title title="Document type" formControlName="document_type" [allowNull]="true" diff --git 
a/src-ui/src/app/data/paperless-document-suggestions.ts b/src-ui/src/app/data/paperless-document-suggestions.ts index 47d480985..295a1ab0e 100644 --- a/src-ui/src/app/data/paperless-document-suggestions.ts +++ b/src-ui/src/app/data/paperless-document-suggestions.ts @@ -6,4 +6,6 @@ export interface PaperlessDocumentSuggestions { document_types?: number[] storage_paths?: number[] + + dates?: string[] // ISO-formatted date string e.g. 2022-11-03 } diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 721346fb0..f62199677 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -6,6 +6,8 @@ import re import shutil import subprocess import tempfile +from typing import Iterator +from typing import Match from typing import Optional from typing import Set @@ -216,6 +218,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: def parse_date(filename, text) -> Optional[datetime.datetime]: + return next(parse_date_generator(filename, text), None) + + +def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: """ Returns the date of the document. 
""" @@ -246,38 +252,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: return date return None - date = None + def __process_match( + match: Match[str], + date_order: str, + ) -> Optional[datetime.datetime]: + date_string = match.group(0) + + try: + date = __parser(date_string, date_order) + except (TypeError, ValueError): + # Skip all matches that do not parse to a proper date + date = None + + return __filter(date) + + def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]: + for m in re.finditer(DATE_REGEX, content): + date = __process_match(m, date_order) + if date is not None: + yield date # if filename date parsing is enabled, search there first: if settings.FILENAME_DATE_ORDER: - for m in re.finditer(DATE_REGEX, filename): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.FILENAME_DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - date = __filter(date) - if date is not None: - return date + yield from __process_content(filename, settings.FILENAME_DATE_ORDER) # Iterate through all regex matches in text and try to parse the date - for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) - - try: - date = __parser(date_string, settings.DATE_ORDER) - except (TypeError, ValueError): - # Skip all matches that do not parse to a proper date - continue - - date = __filter(date) - if date is not None: - return date - - return date + yield from __process_content(text, settings.DATE_ORDER) class ParseError(Exception): diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 38fe6f07b..b6fa69699 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -1107,6 +1107,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): "tags": [], "document_types": [], "storage_paths": [], + "dates": [], }, ) @@ -1118,6 +1119,7 @@ class TestDocumentApi(DirectoriesMixin, 
APITestCase): @mock.patch("documents.views.match_document_types") @mock.patch("documents.views.match_tags") @mock.patch("documents.views.match_correspondents") + @override_settings(NUMBER_OF_SUGGESTED_DATES=10) def test_get_suggestions( self, match_correspondents, @@ -1128,7 +1130,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): doc = Document.objects.create( title="test", mime_type="application/pdf", - content="this is an invoice!", + content="this is an invoice from 12.04.2022!", ) match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)] @@ -1144,6 +1146,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): "tags": [56, 123], "document_types": [23], "storage_paths": [99, 77], + "dates": ["2022-04-12"], }, ) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 1019c572f..b9151a6f7 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -8,6 +8,7 @@ from django.conf import settings from django.test import override_settings from django.test import TestCase from documents.parsers import parse_date +from documents.parsers import parse_date_generator from paperless.settings import DATE_ORDER @@ -161,6 +162,25 @@ class TestDate(TestCase): def test_crazy_date_with_spaces(self, *args): self.assertIsNone(parse_date("", "20 408000l 2475")) + def test_multiple_dates(self): + text = """This text has multiple dates. + For example 02.02.2018, 22 July 2022 and Dezember 2021. 
+ But not 24-12-9999 because its in the future...""" + dates = list(parse_date_generator("", text)) + self.assertEqual(len(dates), 3) + self.assertEqual( + dates[0], + datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + self.assertEqual( + dates[1], + datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + self.assertEqual( + dates[2], + datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + @override_settings(FILENAME_DATE_ORDER="YMD") def test_filename_date_parse_valid_ymd(self, *args): """ diff --git a/src/documents/views.py b/src/documents/views.py index b261f37fd..51a6ed23a 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,3 +1,4 @@ +import itertools import json import logging import os @@ -21,6 +22,7 @@ from django.db.models.functions import Lower from django.http import Http404 from django.http import HttpResponse from django.http import HttpResponseBadRequest +from django.shortcuts import get_object_or_404 from django.utils.decorators import method_decorator from django.utils.translation import get_language from django.views.decorators.cache import cache_control @@ -70,6 +72,7 @@ from .models import SavedView from .models import StoragePath from .models import Tag from .parsers import get_parser_class_for_mime_type +from .parsers import parse_date_generator from .serialisers import AcknowledgeTasksViewSerializer from .serialisers import BulkDownloadSerializer from .serialisers import BulkEditSerializer @@ -330,13 +333,15 @@ class DocumentViewSet( @action(methods=["get"], detail=True) def suggestions(self, request, pk=None): - try: - doc = Document.objects.get(pk=pk) - except Document.DoesNotExist: - raise Http404() + doc = get_object_or_404(Document, pk=pk) classifier = load_classifier() + gen = parse_date_generator(doc.filename, doc.content) + dates = sorted( + {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, + ) + return Response( { 
"correspondents": [c.id for c in match_correspondents(doc, classifier)], @@ -345,6 +350,9 @@ class DocumentViewSet( dt.id for dt in match_document_types(doc, classifier) ], "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], + "dates": [ + date.strftime("%Y-%m-%d") for date in dates if date is not None + ], }, ) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 2ce99ac0e..7ec260b1a 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -588,6 +588,10 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT") DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") +# Maximum number of dates taken from document start to end to show as suggestions for +# `created` date in the frontend. Duplicates are removed, which can result in fewer dates shown. +NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) + # Transformations applied before filename parsing FILENAME_PARSE_TRANSFORMS = [] for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):