mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge remote-tracking branch 'upstream/master'
This commit is contained in:
@@ -64,12 +64,12 @@ class FinancialYearFilter(admin.SimpleListFilter):
|
||||
|
||||
# To keep it simple we use the same string for both
|
||||
# query parameter and the display.
|
||||
return (query, query)
|
||||
return query, query
|
||||
|
||||
else:
|
||||
query = "{0}-{0}".format(date.year)
|
||||
display = "{}".format(date.year)
|
||||
return (query, display)
|
||||
return query, display
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
if not settings.FY_START or not settings.FY_END:
|
||||
@@ -91,25 +91,24 @@ class FinancialYearFilter(admin.SimpleListFilter):
|
||||
|
||||
|
||||
class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.title = "correspondent (recent)"
|
||||
"""
|
||||
If PAPERLESS_RECENT_CORRESPONDENT_YEARS is set, we limit the available
|
||||
correspondents to documents sent our way over the past ``n`` years.
|
||||
"""
|
||||
|
||||
def field_choices(self, field, request, model_admin):
|
||||
|
||||
years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
|
||||
days = 365 * years
|
||||
correspondents = Correspondent.objects.all()
|
||||
|
||||
lookups = []
|
||||
if years and years > 0:
|
||||
correspondents = Correspondent.objects.filter(
|
||||
self.title = "Correspondent (Recent)"
|
||||
days = 365 * years
|
||||
correspondents = correspondents.filter(
|
||||
documents__created__gte=datetime.now() - timedelta(days=days)
|
||||
).distinct()
|
||||
for c in correspondents:
|
||||
lookups.append((c.id, c.name))
|
||||
|
||||
return lookups
|
||||
return [(c.id, c.name) for c in correspondents]
|
||||
|
||||
|
||||
class CommonAdmin(admin.ModelAdmin):
|
||||
@@ -124,7 +123,9 @@ class CorrespondentAdmin(CommonAdmin):
|
||||
"document_count",
|
||||
"last_correspondence"
|
||||
)
|
||||
list_editable = ("automatic_classification")
|
||||
list_editable = ("automatic_classification",)
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super(CorrespondentAdmin, self).get_queryset(request)
|
||||
@@ -149,6 +150,11 @@ class TagAdmin(CommonAdmin):
|
||||
list_filter = ("colour",)
|
||||
list_editable = ("colour", "automatic_classification")
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
|
||||
class Media:
|
||||
js = ("js/colours.js",)
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super(TagAdmin, self).get_queryset(request)
|
||||
qs = qs.annotate(document_count=models.Count("documents"))
|
||||
@@ -164,6 +170,8 @@ class DocumentTypeAdmin(CommonAdmin):
|
||||
list_display = ("name", "automatic_classification", "document_count")
|
||||
list_editable = ("automatic_classification",)
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super(DocumentTypeAdmin, self).get_queryset(request)
|
||||
qs = qs.annotate(document_count=models.Count("documents"))
|
||||
@@ -182,14 +190,13 @@ class DocumentAdmin(CommonAdmin):
|
||||
}
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content", "tags__name")
|
||||
readonly_fields = ("added",)
|
||||
readonly_fields = ("added", "file_type", "storage_type",)
|
||||
list_display = ("title", "created", "added", "thumbnail", "correspondent",
|
||||
"tags_", "archive_serial_number", "document_type")
|
||||
list_filter = (
|
||||
"document_type",
|
||||
"tags",
|
||||
("correspondent", RecentCorrespondentFilter),
|
||||
"correspondent",
|
||||
FinancialYearFilter
|
||||
)
|
||||
|
||||
|
@@ -1,3 +1,4 @@
|
||||
from django.db import transaction
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -111,8 +112,11 @@ class Consumer:
|
||||
if not self.try_consume_file(file):
|
||||
self._ignore.append((file, mtime))
|
||||
|
||||
@transaction.atomic
|
||||
def try_consume_file(self, file):
|
||||
"Return True if file was consumed"
|
||||
"""
|
||||
Return True if file was consumed
|
||||
"""
|
||||
|
||||
if not re.match(FileInfo.REGEXES["title"], file):
|
||||
return False
|
||||
@@ -145,7 +149,7 @@ class Consumer:
|
||||
parsed_document = parser_class(doc)
|
||||
|
||||
try:
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
thumbnail = parsed_document.get_optimised_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
|
@@ -1,4 +1,4 @@
|
||||
from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter, ModelChoiceFilter
|
||||
from django_filters.rest_framework import BooleanFilter, FilterSet
|
||||
|
||||
from .models import Correspondent, Document, Tag, DocumentType
|
||||
|
||||
|
52
src/documents/migrations/0022_auto_20181007_1420.py
Normal file
52
src/documents/migrations/0022_auto_20181007_1420.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# Generated by Django 2.0.8 on 2018-10-07 14:20
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
def re_slug_all_the_things(apps, schema_editor):
|
||||
"""
|
||||
Rewrite all slug values to make sure they're actually slugs before we brand
|
||||
them as uneditable.
|
||||
"""
|
||||
|
||||
Tag = apps.get_model("documents", "Tag")
|
||||
Correspondent = apps.get_model("documents", "Correspondent")
|
||||
|
||||
for klass in (Tag, Correspondent):
|
||||
for instance in klass.objects.all():
|
||||
klass.objects.filter(
|
||||
pk=instance.pk
|
||||
).update(
|
||||
slug=slugify(instance.slug)
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0021_document_storage_type'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='tag',
|
||||
options={'ordering': ('name',)},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='correspondent',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, editable=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='file_type',
|
||||
field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, editable=False),
|
||||
),
|
||||
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
|
||||
]
|
@@ -6,7 +6,7 @@ from django.db import migrations, models
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0021_document_storage_type'),
|
||||
('documents', '0022_auto_20181007_1420'),
|
||||
]
|
||||
|
||||
operations = [
|
@@ -7,7 +7,7 @@ import django.db.models.deletion
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0022_workflow_improvements'),
|
||||
('documents', '1001_workflow_improvements'),
|
||||
]
|
||||
|
||||
operations = [
|
@@ -18,7 +18,7 @@ def reverse_automatic_classification(apps, schema_editor):
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0023_auto_20180823_1155'),
|
||||
('documents', '1002_auto_20180823_1155'),
|
||||
]
|
||||
|
||||
operations = [
|
36
src/documents/migrations/1004_documenttype_slug.py
Normal file
36
src/documents/migrations/1004_documenttype_slug.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# Generated by Django 2.0.8 on 2018-10-07 14:20
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
def re_slug_all_the_things(apps, schema_editor):
|
||||
"""
|
||||
Rewrite all slug values to make sure they're actually slugs before we brand
|
||||
them as uneditable.
|
||||
"""
|
||||
|
||||
DocumentType = apps.get_model("documents", "DocumentType")
|
||||
|
||||
for instance in DocumentType.objects.all():
|
||||
DocumentType.objects.filter(
|
||||
pk=instance.pk
|
||||
).update(
|
||||
slug=slugify(instance.slug)
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1003_auto_20180904_1425'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='documenttype',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, editable=False),
|
||||
),
|
||||
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
|
||||
]
|
@@ -11,6 +11,7 @@ from django.conf import settings
|
||||
from django.db import models
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from .managers import LogManager
|
||||
@@ -24,7 +25,7 @@ except ImportError:
|
||||
class MatchingModel(models.Model):
|
||||
|
||||
name = models.CharField(max_length=128, unique=True)
|
||||
slug = models.SlugField(blank=True)
|
||||
slug = models.SlugField(blank=True, editable=False)
|
||||
|
||||
automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
|
||||
|
||||
@@ -37,8 +38,7 @@ class MatchingModel(models.Model):
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
|
||||
if not self.slug:
|
||||
self.slug = slugify(self.name)
|
||||
self.slug = slugify(self.name)
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
@@ -369,7 +369,7 @@ class FileInfo:
|
||||
r = []
|
||||
for t in tags.split(","):
|
||||
r.append(Tag.objects.get_or_create(
|
||||
slug=t.lower(),
|
||||
slug=slugify(t),
|
||||
defaults={"name": t}
|
||||
)[0])
|
||||
return tuple(r)
|
||||
|
@@ -1,23 +1,31 @@
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import dateparser
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
DATE_REGEX = re.compile(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
|
||||
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
|
||||
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
|
||||
)
|
||||
|
||||
|
||||
@@ -32,6 +40,9 @@ class DocumentParser:
|
||||
"""
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
|
||||
OPTIPNG = settings.OPTIPNG_BINARY
|
||||
|
||||
def __init__(self, path):
|
||||
self.document_path = path
|
||||
@@ -45,6 +56,19 @@ class DocumentParser:
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def optimise_thumbnail(self, in_path):
|
||||
|
||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||
|
||||
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
|
||||
if not subprocess.Popen(args).wait() == 0:
|
||||
raise ParseError("Optipng failed at {}".format(args))
|
||||
|
||||
return out_path
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
return self.optimise_thumbnail(self.get_thumbnail())
|
||||
|
||||
def get_text(self):
|
||||
"""
|
||||
Returns the text from the document and only the text.
|
||||
@@ -55,7 +79,82 @@ class DocumentParser:
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def __parser(ds, date_order):
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE":
|
||||
True
|
||||
}
|
||||
)
|
||||
|
||||
date = None
|
||||
date_string = None
|
||||
|
||||
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||
title = os.path.basename(self.document_path)
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if self.FILENAME_DATE_ORDER:
|
||||
self.log("info", "Checking document title for date")
|
||||
for m in re.finditer(DATE_REGEX, title):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, self.FILENAME_DATE_ORDER)
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
self.log(
|
||||
"info",
|
||||
"Detected document date {} based on string {} "
|
||||
"from document title"
|
||||
"".format(date.isoformat(), date_string)
|
||||
)
|
||||
return date
|
||||
|
||||
try:
|
||||
# getting text after checking filename will save time if only
|
||||
# looking at the filename instead of the whole text
|
||||
text = self.get_text()
|
||||
except ParseError:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, self.DATE_ORDER)
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
if date is not None:
|
||||
self.log(
|
||||
"info",
|
||||
"Detected document date {} based on string {}".format(
|
||||
date.isoformat(),
|
||||
date_string
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
|
66
src/documents/static/js/colours.js
Normal file
66
src/documents/static/js/colours.js
Normal file
@@ -0,0 +1,66 @@
|
||||
// The following jQuery snippet will add a small square next to the selection
|
||||
// drop-down on the `Add tag` page that will update to show the selected tag
|
||||
// color as the drop-down value is changed.
|
||||
|
||||
django.jQuery(document).ready(function(){
|
||||
|
||||
if (django.jQuery("#id_colour").length) {
|
||||
|
||||
let colour;
|
||||
let colour_num;
|
||||
|
||||
colour_num = django.jQuery("#id_colour").val() - 1;
|
||||
colour = django.jQuery('#id_colour')[0][colour_num].text;
|
||||
django.jQuery('#id_colour').after('<div class="colour_square"></div>');
|
||||
|
||||
django.jQuery('.colour_square').css({
|
||||
'float': 'left',
|
||||
'width': '20px',
|
||||
'height': '20px',
|
||||
'margin': '5px',
|
||||
'border': '1px solid rgba(0, 0, 0, .2)',
|
||||
'background': colour
|
||||
});
|
||||
|
||||
django.jQuery('#id_colour').change(function () {
|
||||
colour_num = django.jQuery("#id_colour").val() - 1;
|
||||
colour = django.jQuery('#id_colour')[0][colour_num].text;
|
||||
django.jQuery('.colour_square').css({'background': colour});
|
||||
});
|
||||
|
||||
} else if (django.jQuery("select[id*='colour']").length) {
|
||||
|
||||
django.jQuery('select[id*="-colour"]').each(function (index, element) {
|
||||
let id;
|
||||
let loop_colour_num;
|
||||
let loop_colour;
|
||||
|
||||
id = "colour_square_" + index;
|
||||
django.jQuery(element).after('<div class="colour_square" id="' + id + '"></div>');
|
||||
|
||||
loop_colour_num = django.jQuery(element).val() - 1;
|
||||
loop_colour = django.jQuery(element)[0][loop_colour_num].text;
|
||||
|
||||
django.jQuery("<style type='text/css'>\
|
||||
.colour_square{ \
|
||||
float: left; \
|
||||
width: 20px; \
|
||||
height: 20px; \
|
||||
margin: 5px; \
|
||||
border: 1px solid rgba(0,0,0,.2); \
|
||||
} </style>").appendTo("head");
|
||||
django.jQuery('#' + id).css({'background': loop_colour});
|
||||
|
||||
console.log(id, loop_colour_num, loop_colour);
|
||||
|
||||
django.jQuery(element).change(function () {
|
||||
loop_colour_num = django.jQuery(element).val() - 1;
|
||||
loop_colour = django.jQuery(element)[0][loop_colour_num].text;
|
||||
django.jQuery('#' + id).css({'background': loop_colour});
|
||||
console.log('#' + id, loop_colour)
|
||||
});
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
});
|
@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
|
||||
error = "Paperless can't find {}. Without it, consumption is impossible."
|
||||
hint = "Either it's not in your ${PATH} or it's not installed."
|
||||
|
||||
binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
|
||||
binaries = (
|
||||
settings.CONVERT_BINARY,
|
||||
settings.OPTIPNG_BINARY,
|
||||
settings.UNPAPER_BINARY,
|
||||
"tesseract"
|
||||
)
|
||||
|
||||
check_messages = []
|
||||
for binary in binaries:
|
||||
|
@@ -1,15 +1,20 @@
|
||||
from django.contrib.auth.models import User as DjangoUser
|
||||
|
||||
|
||||
class User:
|
||||
"""
|
||||
This is a dummy django User used with our middleware to disable
|
||||
login authentication if that is configured in paperless.conf
|
||||
This is a dummy django User used with our middleware to disable
|
||||
login authentication if that is configured in paperless.conf
|
||||
"""
|
||||
|
||||
is_superuser = True
|
||||
is_active = True
|
||||
is_staff = True
|
||||
is_authenticated = True
|
||||
|
||||
# Must be -1 to avoid colliding with real user ID's (which start at 1)
|
||||
id = -1
|
||||
@property
|
||||
def id(self):
|
||||
return DjangoUser.objects.order_by("pk").first().pk
|
||||
|
||||
@property
|
||||
def pk(self):
|
||||
@@ -17,9 +22,9 @@ class User:
|
||||
|
||||
|
||||
"""
|
||||
NOTE: These are here as a hack instead of being in the User definition
|
||||
above due to the way pycodestyle handles lamdbdas.
|
||||
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
|
||||
NOTE: These are here as a hack instead of being in the User definition
|
||||
NOTE: above due to the way pycodestyle handles lamdbdas.
|
||||
NOTE: See https://github.com/PyCQA/pycodestyle/issues/379 for more.
|
||||
"""
|
||||
|
||||
User.has_module_perms = lambda *_: True
|
||||
|
@@ -152,6 +152,10 @@ if os.getenv("PAPERLESS_DBENGINE"):
|
||||
}
|
||||
if os.getenv("PAPERLESS_DBPASS"):
|
||||
DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS")
|
||||
if os.getenv("PAPERLESS_DBHOST"):
|
||||
DATABASES["default"]["HOST"] = os.getenv("PAPERLESS_DBHOST")
|
||||
if os.getenv("PAPERLESS_DBPORT"):
|
||||
DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")
|
||||
|
||||
|
||||
# Password validation
|
||||
@@ -199,6 +203,16 @@ STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
|
||||
MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")
|
||||
|
||||
|
||||
# Other
|
||||
|
||||
# Disable Django's artificial limit on the number of form fields to submit at
|
||||
# once. This is a protection against overloading the server, but since this is
|
||||
# a self-hosted sort of gig, the benefits of being able to mass-delete a tonne
|
||||
# of log entries outweight the benefits of such a safeguard.
|
||||
|
||||
DATA_UPLOAD_MAX_NUMBER_FIELDS = None
|
||||
|
||||
|
||||
# Document classification models location
|
||||
MODEL_FILE = os.getenv(
|
||||
"PAPERLESS_MODEL_FILE", os.path.join(BASE_DIR, "..", "models", "model.pickle"))
|
||||
@@ -252,6 +266,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
||||
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
||||
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
|
||||
|
||||
# OptiPNG
|
||||
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
|
||||
|
||||
# Unpaper
|
||||
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||
|
||||
@@ -298,6 +315,7 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||
|
||||
# Specify the default date order (for autodetected dates)
|
||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
# Specify for how many years a correspondent is considered recent. Recent
|
||||
# correspondents will be shown in a separate "Recent correspondents" filter as
|
||||
|
@@ -1 +1 @@
|
||||
__version__ = (2, 3, 0)
|
||||
__version__ = (2, 6, 0)
|
||||
|
@@ -4,7 +4,6 @@ import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import dateparser
|
||||
import langdetect
|
||||
import pyocr
|
||||
from django.conf import settings
|
||||
@@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
import pdftotext
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
|
||||
from .languages import ISO639
|
||||
|
||||
@@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
@@ -46,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
|
||||
out_path = os.path.join(self.tempdir, "convert.png")
|
||||
|
||||
# Run convert to get a decent thumbnail
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-scale", "500x5000",
|
||||
"-alpha", "remove",
|
||||
"{}[0]".format(self.document_path),
|
||||
os.path.join(self.tempdir, "convert.png")
|
||||
out_path
|
||||
)
|
||||
|
||||
return os.path.join(self.tempdir, "convert.png")
|
||||
return out_path
|
||||
|
||||
def _is_ocred(self):
|
||||
|
||||
@@ -152,7 +153,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError("Language detection failed")
|
||||
error_msg = ("Language detection failed. Set "
|
||||
"PAPERLESS_FORGIVING_OCR in config file to continue "
|
||||
"anyway.")
|
||||
raise OCRError(error_msg)
|
||||
|
||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
@@ -202,40 +206,6 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
|
||||
def get_date(self):
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.isoformat() +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
|
||||
@@ -251,7 +221,8 @@ def run_convert(*args):
|
||||
|
||||
def run_unpaper(args):
|
||||
unpaper, pnm = args
|
||||
command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
|
||||
command_args = (unpaper, "--overwrite", pnm,
|
||||
pnm.replace(".pnm", ".unpaper.pnm"))
|
||||
if not subprocess.Popen(command_args).wait() == 0:
|
||||
raise ParseError("Unpaper failed at {}".format(command_args))
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 138 KiB After Width: | Height: | Size: 55 KiB |
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 138 KiB After Width: | Height: | Size: 53 KiB |
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
@@ -5,9 +5,10 @@ from unittest import mock
|
||||
from uuid import uuid4
|
||||
|
||||
from dateutil import tz
|
||||
from django.test import TestCase
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from ..parsers import RasterisedDocumentParser
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
@@ -59,9 +60,13 @@ class TestDate(TestCase):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
date = document.get_date()
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -72,10 +77,16 @@ class TestDate(TestCase):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = (
|
||||
"lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum")
|
||||
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
|
||||
"ipsum"
|
||||
)
|
||||
date = document.get_date()
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -110,9 +121,13 @@ class TestDate(TestCase):
|
||||
"März 2019\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
date = document.get_date()
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
date,
|
||||
datetime.datetime(
|
||||
2019, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -122,19 +137,25 @@ class TestDate(TestCase):
|
||||
def test_date_format_8(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = ("lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
document._text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020"
|
||||
)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(
|
||||
2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -143,13 +164,19 @@ class TestDate(TestCase):
|
||||
def test_date_format_9(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = ("lorem ipsum\n"
|
||||
"27. Nullmonth 2020\n"
|
||||
"März 2020\n"
|
||||
"lorem ipsum")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
document._text = (
|
||||
"lorem ipsum\n"
|
||||
"27. Nullmonth 2020\n"
|
||||
"März 2020\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(
|
||||
2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -158,11 +185,16 @@ class TestDate(TestCase):
|
||||
def test_get_text_1_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
date = document.get_date()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -172,11 +204,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_1_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -186,11 +222,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_2_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -200,67 +240,91 @@ class TestDate(TestCase):
|
||||
def test_get_text_2_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
@override_settings(OCR_LANGUAGE="deu")
|
||||
def test_get_text_3_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
@override_settings(OCR_LANGUAGE="deu")
|
||||
def test_get_text_3_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
@override_settings(OCR_LANGUAGE="eng")
|
||||
def test_get_text_4_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
@override_settings(OCR_LANGUAGE="eng")
|
||||
def test_get_text_4_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -270,11 +334,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_5_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -284,11 +352,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_5_png(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -303,7 +375,10 @@ class TestDate(TestCase):
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -318,7 +393,10 @@ class TestDate(TestCase):
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -328,6 +406,7 @@ class TestDate(TestCase):
|
||||
def test_get_text_6_pdf_eu(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(), None)
|
||||
@@ -339,6 +418,7 @@ class TestDate(TestCase):
|
||||
def test_get_text_6_png_eu(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(), None)
|
||||
@@ -350,11 +430,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_7_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -364,11 +448,15 @@ class TestDate(TestCase):
|
||||
def test_get_text_8_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2017, 12, 31, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
@@ -378,9 +466,137 @@ class TestDate(TestCase):
|
||||
def test_get_text_9_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.DATE_ORDER = 'DMY'
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
|
||||
datetime.datetime(
|
||||
2017, 12, 31, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_filename_date_1_pdf(self):
|
||||
input_file = os.path.join(
|
||||
self.SAMPLE_FILES,
|
||||
"tests_date_in_filename_2018-03-20_1.pdf"
|
||||
)
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.FILENAME_DATE_ORDER = 'YMD'
|
||||
document.get_text()
|
||||
date = document.get_date()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 3, 20, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_filename_date_1_png(self):
|
||||
input_file = os.path.join(
|
||||
self.SAMPLE_FILES,
|
||||
"tests_date_in_filename_2018-03-20_1.png"
|
||||
)
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.FILENAME_DATE_ORDER = 'YMD'
|
||||
date = document.get_date()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 3, 20, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_filename_date_2_pdf(self):
|
||||
input_file = os.path.join(
|
||||
self.SAMPLE_FILES,
|
||||
"2013-12-11_tests_date_in_filename_2.pdf"
|
||||
)
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.FILENAME_DATE_ORDER = 'YMD'
|
||||
date = document.get_date()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2013, 12, 11, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_filename_date_2_png(self):
|
||||
input_file = os.path.join(
|
||||
self.SAMPLE_FILES,
|
||||
"2013-12-11_tests_date_in_filename_2.png"
|
||||
)
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.FILENAME_DATE_ORDER = 'YMD'
|
||||
date = document.get_date()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2013, 12, 11, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-0590 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_past(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-2350 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_future(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-0590 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_past(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
@@ -1,11 +1,9 @@
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import dateparser
|
||||
from django.conf import settings
|
||||
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
|
||||
|
||||
class TextDocumentParser(DocumentParser):
|
||||
@@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
@@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
The thumbnail of a txt is just a 500px wide image of the text
|
||||
The thumbnail of a text file is just a 500px wide image of the text
|
||||
rendered onto a letter-sized page.
|
||||
"""
|
||||
# The below is heavily cribbed from https://askubuntu.com/a/590951
|
||||
@@ -35,7 +32,7 @@ class TextDocumentParser(DocumentParser):
|
||||
text_color = "black" # text color
|
||||
psize = [500, 647] # icon size
|
||||
n_lines = 50 # number of lines to show
|
||||
output_file = os.path.join(self.tempdir, "convert-txt.png")
|
||||
out_path = os.path.join(self.tempdir, "convert.png")
|
||||
|
||||
temp_bg = os.path.join(self.tempdir, "bg.png")
|
||||
temp_txlayer = os.path.join(self.tempdir, "tx.png")
|
||||
@@ -46,9 +43,13 @@ class TextDocumentParser(DocumentParser):
|
||||
work_size = ",".join([str(n - 1) for n in psize])
|
||||
r = str(round(psize[0] / 10))
|
||||
rounded = ",".join([r, r])
|
||||
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
|
||||
'"fill ', bg_color, ' roundrectangle 0,0,',
|
||||
work_size, ",", rounded, '" ', temp_bg)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
"-size ", picsize,
|
||||
' xc:none -draw ',
|
||||
'"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501
|
||||
temp_bg
|
||||
)
|
||||
|
||||
def read_text():
|
||||
with open(self.document_path, 'r') as src:
|
||||
@@ -57,22 +58,29 @@ class TextDocumentParser(DocumentParser):
|
||||
return text.replace('"', "'")
|
||||
|
||||
def create_txlayer():
|
||||
run_command(self.CONVERT,
|
||||
"-background none",
|
||||
"-fill",
|
||||
text_color,
|
||||
"-pointsize", "12",
|
||||
"-border 4 -bordercolor none",
|
||||
"-size ", txsize,
|
||||
' caption:"', read_text(), '" ',
|
||||
temp_txlayer)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
"-background none",
|
||||
"-fill",
|
||||
text_color,
|
||||
"-pointsize", "12",
|
||||
"-border 4 -bordercolor none",
|
||||
"-size ", txsize,
|
||||
' caption:"', read_text(), '" ',
|
||||
temp_txlayer
|
||||
)
|
||||
|
||||
create_txlayer()
|
||||
create_bg()
|
||||
run_command(self.CONVERT, temp_bg, temp_txlayer,
|
||||
"-background None -layers merge ", output_file)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
temp_bg,
|
||||
temp_txlayer,
|
||||
"-background None -layers merge ",
|
||||
out_path
|
||||
)
|
||||
|
||||
return output_file
|
||||
return out_path
|
||||
|
||||
def get_text(self):
|
||||
|
||||
@@ -84,40 +92,6 @@ class TextDocumentParser(DocumentParser):
|
||||
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.isoformat() +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_command(*args):
|
||||
environment = os.environ.copy()
|
||||
|
19
src/reminders/migrations/0002_auto_20181007_1420.py
Normal file
19
src/reminders/migrations/0002_auto_20181007_1420.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Generated by Django 2.0.8 on 2018-10-07 14:20
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('reminders', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='reminder',
|
||||
name='document',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='documents.Document'),
|
||||
),
|
||||
]
|
@@ -4,7 +4,6 @@ from django.db import models
|
||||
class Reminder(models.Model):
|
||||
|
||||
document = models.ForeignKey(
|
||||
"documents.Document", on_delete=models.PROTECT
|
||||
)
|
||||
"documents.Document", on_delete=models.PROTECT)
|
||||
date = models.DateTimeField()
|
||||
note = models.TextField(blank=True)
|
||||
|
Reference in New Issue
Block a user