Merge pull request #1367 from Eckii24/feat/date-suggestions

Adding date suggestions to the documents details view
This commit is contained in:
shamoon 2022-08-25 11:47:37 -07:00 committed by GitHub
commit d40c13420d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 114 additions and 34 deletions

View File

@ -741,6 +741,19 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
Defaults to none, which disables this feature.
PAPERLESS_NUMBER_OF_SUGGESTED_DATES=<num>
Paperless searches an entire document for dates. The first date found will
be used as the initial value for the created date. When this variable is
greater than 0 (or left to it's default value), paperless will also suggest
other dates found in the document, up to a maximum of this setting. Note that
duplicates will be removed, which can result in fewer dates displayed in the
frontend than this setting value.
The task to find all dates can be time-consuming and increases with a higher
(maximum) number of suggested dates and slower hardware.
Defaults to 3. Set to 0 to disable this feature.
PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
Paperless creates thumbnails for plain text files by rendering the content
of the file on an image and uses a predefined font for that. This

View File

@ -69,6 +69,7 @@
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
#PAPERLESS_NUMBER_OF_SUGGESTED_DATES=5
#PAPERLESS_THUMBNAIL_FONT_NAME=
#PAPERLESS_IGNORE_DATES=
#PAPERLESS_ENABLE_UPDATE_CHECK=

View File

@ -12,4 +12,10 @@
</div>
<div class="invalid-feedback" i18n>Invalid date.</div>
<small *ngIf="hint" class="form-text text-muted">{{hint}}</small>
<small *ngIf="getSuggestions().length > 0">
<span i18n>Suggestions:</span>&nbsp;
<ng-container *ngFor="let s of getSuggestions()">
<a (click)="onSuggestionClick(s)" [routerLink]="[]">{{s}}</a>&nbsp;
</ng-container>
</small>
</div>

View File

@ -1,4 +1,4 @@
import { Component, forwardRef, OnInit } from '@angular/core'
import { Component, forwardRef, Input, OnInit } from '@angular/core'
import { NG_VALUE_ACCESSOR } from '@angular/forms'
import {
NgbDateAdapter,
@ -31,6 +31,28 @@ export class DateComponent
super()
}
@Input()
suggestions: string[]
getSuggestions() {
return this.suggestions == null
? []
: this.suggestions
.map((s) => this.ngbDateParserFormatter.parse(s))
.filter(
(d) =>
this.value === null || // if value is not set, take all suggestions
this.value != this.isoDateAdapter.toModel(d) // otherwise filter out current date
)
.map((s) => this.ngbDateParserFormatter.format(s))
}
onSuggestionClick(dateString: string) {
const parsedDate = this.ngbDateParserFormatter.parse(dateString)
this.writeValue(this.isoDateAdapter.toModel(parsedDate))
this.onChange(this.value)
}
ngOnInit(): void {
super.ngOnInit()
this.placeholder = this.settings.getLocalizedDateInputFormat()

View File

@ -74,7 +74,8 @@
<app-input-text #inputTitle i18n-title title="Title" formControlName="title" (keyup)="titleKeyUp($event)" [error]="error?.title"></app-input-text>
<app-input-number i18n-title title="Archive serial number" [error]="error?.archive_serial_number" formControlName='archive_serial_number'></app-input-number>
<app-input-date i18n-title title="Date created" formControlName="created_date" [error]="error?.created_date"></app-input-date>
<app-input-date i18n-title title="Date created" formControlName="created_date" [suggestions]="suggestions?.dates"
[error]="error?.created_date"></app-input-date>
<app-input-select [items]="correspondents" i18n-title title="Correspondent" formControlName="correspondent" [allowNull]="true"
(createNew)="createCorrespondent($event)" [suggestions]="suggestions?.correspondents"></app-input-select>
<app-input-select [items]="documentTypes" i18n-title title="Document type" formControlName="document_type" [allowNull]="true"

View File

@ -6,4 +6,6 @@ export interface PaperlessDocumentSuggestions {
document_types?: number[]
storage_paths?: number[]
dates?: string[] // ISO-formatted date string e.g. 2022-11-03
}

View File

@ -6,6 +6,8 @@ import re
import shutil
import subprocess
import tempfile
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
@ -216,6 +218,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
def parse_date(filename, text) -> Optional[datetime.datetime]:
return next(parse_date_generator(filename, text), None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
Returns the date of the document.
"""
@ -246,38 +252,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
return date
return None
date = None
def __process_match(
match: Match[str],
date_order: str,
) -> Optional[datetime.datetime]:
date_string = match.group(0)
try:
date = __parser(date_string, date_order)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
date = None
return __filter(date)
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
for m in re.finditer(DATE_REGEX, content):
date = __process_match(m, date_order)
if date is not None:
yield date
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
return date
yield from __process_content(text, settings.DATE_ORDER)
class ParseError(Exception):

View File

@ -1107,6 +1107,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [],
"document_types": [],
"storage_paths": [],
"dates": [],
},
)
@ -1118,6 +1119,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_correspondents")
@override_settings(NUMBER_OF_SUGGESTED_DATES=10)
def test_get_suggestions(
self,
match_correspondents,
@ -1128,7 +1130,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is an invoice!",
content="this is an invoice from 12.04.2022!",
)
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
@ -1144,6 +1146,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [56, 123],
"document_types": [23],
"storage_paths": [99, 77],
"dates": ["2022-04-12"],
},
)

View File

@ -8,6 +8,7 @@ from django.conf import settings
from django.test import override_settings
from django.test import TestCase
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
from paperless.settings import DATE_ORDER
@ -161,6 +162,25 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475"))
def test_multiple_dates(self):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and Dezember 2021.
But not 24-12-9999 because its in the future..."""
dates = list(parse_date_generator("", text))
self.assertEqual(len(dates), 3)
self.assertEqual(
dates[0],
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[1],
datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[2],
datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_valid_ymd(self, *args):
"""

View File

@ -1,3 +1,4 @@
import itertools
import json
import logging
import os
@ -21,6 +22,7 @@ from django.db.models.functions import Lower
from django.http import Http404
from django.http import HttpResponse
from django.http import HttpResponseBadRequest
from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
@ -70,6 +72,7 @@ from .models import SavedView
from .models import StoragePath
from .models import Tag
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer
@ -330,13 +333,15 @@ class DocumentViewSet(
@action(methods=["get"], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
doc = get_object_or_404(Document, pk=pk)
classifier = load_classifier()
gen = parse_date_generator(doc.filename, doc.content)
dates = sorted(
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
)
return Response(
{
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
@ -345,6 +350,9 @@ class DocumentViewSet(
dt.id for dt in match_document_types(doc, classifier)
],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
"dates": [
date.strftime("%Y-%m-%d") for date in dates if date is not None
],
},
)

View File

@ -588,6 +588,10 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
# Maximum number of dates taken from document start to end to show as suggestions for
# `created` date in the frontend. Duplicates are removed, which can result in fewer dates shown.
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3)
# Transformations applied before filename parsing
FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):