mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'master' of github.com:danielquinn/paperless into ENH_filename_date_parsing
This commit is contained in:
146
src/documents/actions.py
Normal file
146
src/documents/actions.py
Normal file
@@ -0,0 +1,146 @@
|
||||
from django.contrib import messages
|
||||
from django.contrib.admin import helpers
|
||||
from django.contrib.admin.utils import model_ngettext
|
||||
from django.core.exceptions import PermissionDenied
|
||||
from django.template.response import TemplateResponse
|
||||
|
||||
from documents.models import Correspondent, Tag
|
||||
|
||||
|
||||
def select_action(
|
||||
modeladmin, request, queryset, title, action, modelclass,
|
||||
success_message="", document_action=None, queryset_action=None):
|
||||
|
||||
opts = modeladmin.model._meta
|
||||
app_label = opts.app_label
|
||||
|
||||
if not modeladmin.has_change_permission(request):
|
||||
raise PermissionDenied
|
||||
|
||||
if request.POST.get('post'):
|
||||
n = queryset.count()
|
||||
selected_object = modelclass.objects.get(id=request.POST.get('obj_id'))
|
||||
if n:
|
||||
for document in queryset:
|
||||
if document_action:
|
||||
document_action(document, selected_object)
|
||||
document_display = str(document)
|
||||
modeladmin.log_change(request, document, document_display)
|
||||
if queryset_action:
|
||||
queryset_action(queryset, selected_object)
|
||||
|
||||
modeladmin.message_user(request, success_message % {
|
||||
"selected_object": selected_object.name,
|
||||
"count": n,
|
||||
"items": model_ngettext(modeladmin.opts, n)
|
||||
}, messages.SUCCESS)
|
||||
|
||||
# Return None to display the change list page again.
|
||||
return None
|
||||
|
||||
context = dict(
|
||||
modeladmin.admin_site.each_context(request),
|
||||
title=title,
|
||||
queryset=queryset,
|
||||
opts=opts,
|
||||
action_checkbox_name=helpers.ACTION_CHECKBOX_NAME,
|
||||
media=modeladmin.media,
|
||||
action=action,
|
||||
objects=modelclass.objects.all(),
|
||||
itemname=model_ngettext(modelclass, 1)
|
||||
)
|
||||
|
||||
request.current_app = modeladmin.admin_site.name
|
||||
|
||||
return TemplateResponse(
|
||||
request,
|
||||
"admin/{}/{}/select_object.html".format(app_label, opts.model_name),
|
||||
context
|
||||
)
|
||||
|
||||
|
||||
def simple_action(
|
||||
modeladmin, request, queryset, success_message="",
|
||||
document_action=None, queryset_action=None):
|
||||
|
||||
if not modeladmin.has_change_permission(request):
|
||||
raise PermissionDenied
|
||||
|
||||
n = queryset.count()
|
||||
if n:
|
||||
for document in queryset:
|
||||
if document_action:
|
||||
document_action(document)
|
||||
document_display = str(document)
|
||||
modeladmin.log_change(request, document, document_display)
|
||||
if queryset_action:
|
||||
queryset_action(queryset)
|
||||
modeladmin.message_user(request, success_message % {
|
||||
"count": n, "items": model_ngettext(modeladmin.opts, n)
|
||||
}, messages.SUCCESS)
|
||||
|
||||
# Return None to display the change list page again.
|
||||
return None
|
||||
|
||||
|
||||
def add_tag_to_selected(modeladmin, request, queryset):
|
||||
return select_action(
|
||||
modeladmin=modeladmin,
|
||||
request=request,
|
||||
queryset=queryset,
|
||||
title="Add tag to multiple documents",
|
||||
action="add_tag_to_selected",
|
||||
modelclass=Tag,
|
||||
success_message="Successfully added tag %(selected_object)s to "
|
||||
"%(count)d %(items)s.",
|
||||
document_action=lambda doc, tag: doc.tags.add(tag)
|
||||
)
|
||||
|
||||
|
||||
def remove_tag_from_selected(modeladmin, request, queryset):
|
||||
return select_action(
|
||||
modeladmin=modeladmin,
|
||||
request=request,
|
||||
queryset=queryset,
|
||||
title="Remove tag from multiple documents",
|
||||
action="remove_tag_from_selected",
|
||||
modelclass=Tag,
|
||||
success_message="Successfully removed tag %(selected_object)s from "
|
||||
"%(count)d %(items)s.",
|
||||
document_action=lambda doc, tag: doc.tags.remove(tag)
|
||||
)
|
||||
|
||||
|
||||
def set_correspondent_on_selected(modeladmin, request, queryset):
|
||||
|
||||
return select_action(
|
||||
modeladmin=modeladmin,
|
||||
request=request,
|
||||
queryset=queryset,
|
||||
title="Set correspondent on multiple documents",
|
||||
action="set_correspondent_on_selected",
|
||||
modelclass=Correspondent,
|
||||
success_message="Successfully set correspondent %(selected_object)s "
|
||||
"on %(count)d %(items)s.",
|
||||
queryset_action=lambda qs, corr: qs.update(correspondent=corr)
|
||||
)
|
||||
|
||||
|
||||
def remove_correspondent_from_selected(modeladmin, request, queryset):
|
||||
return simple_action(
|
||||
modeladmin=modeladmin,
|
||||
request=request,
|
||||
queryset=queryset,
|
||||
success_message="Successfully removed correspondent from %(count)d "
|
||||
"%(items)s.",
|
||||
queryset_action=lambda qs: qs.update(correspondent=None)
|
||||
)
|
||||
|
||||
|
||||
add_tag_to_selected.short_description = "Add tag to selected documents"
|
||||
remove_tag_from_selected.short_description = \
|
||||
"Remove tag from selected documents"
|
||||
set_correspondent_on_selected.short_description = \
|
||||
"Set correspondent on selected documents"
|
||||
remove_correspondent_from_selected.short_description = \
|
||||
"Remove correspondent from selected documents"
|
@@ -1,42 +1,25 @@
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.models import User, Group
|
||||
try:
|
||||
from django.core.urlresolvers import reverse
|
||||
except ImportError:
|
||||
from django.urls import reverse
|
||||
from django.contrib import admin, messages
|
||||
from django.contrib.admin.templatetags.admin_urls import add_preserved_filters
|
||||
from django.contrib.auth.models import Group, User
|
||||
from django.db import models
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.templatetags.static import static
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.urls import reverse
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.utils.http import urlquote
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
from documents.actions import (
|
||||
add_tag_to_selected,
|
||||
remove_correspondent_from_selected,
|
||||
remove_tag_from_selected,
|
||||
set_correspondent_on_selected
|
||||
)
|
||||
|
||||
|
||||
class MonthListFilter(admin.SimpleListFilter):
|
||||
|
||||
title = "Month"
|
||||
|
||||
# Parameter for the filter that will be used in the URL query.
|
||||
parameter_name = "month"
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
r = []
|
||||
for document in Document.objects.all():
|
||||
r.append((
|
||||
document.created.strftime("%Y-%m"),
|
||||
document.created.strftime("%B %Y")
|
||||
))
|
||||
return sorted(set(r), key=lambda x: x[0], reverse=True)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
|
||||
if not self.value():
|
||||
return None
|
||||
|
||||
year, month = self.value().split("-")
|
||||
return queryset.filter(created__year=year, created__month=month)
|
||||
from .models import Correspondent, Document, Log, Tag
|
||||
|
||||
|
||||
class FinancialYearFilter(admin.SimpleListFilter):
|
||||
@@ -78,12 +61,12 @@ class FinancialYearFilter(admin.SimpleListFilter):
|
||||
|
||||
# To keep it simple we use the same string for both
|
||||
# query parameter and the display.
|
||||
return (query, query)
|
||||
return query, query
|
||||
|
||||
else:
|
||||
query = "{0}-{0}".format(date.year)
|
||||
display = "{}".format(date.year)
|
||||
return (query, display)
|
||||
return query, display
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
if not settings.FY_START or not settings.FY_END:
|
||||
@@ -104,29 +87,79 @@ class FinancialYearFilter(admin.SimpleListFilter):
|
||||
created__lte=self._fy_end(end))
|
||||
|
||||
|
||||
class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
|
||||
"""
|
||||
If PAPERLESS_RECENT_CORRESPONDENT_YEARS is set, we limit the available
|
||||
correspondents to documents sent our way over the past ``n`` years.
|
||||
"""
|
||||
|
||||
def field_choices(self, field, request, model_admin):
|
||||
|
||||
years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
|
||||
correspondents = Correspondent.objects.all()
|
||||
|
||||
if years and years > 0:
|
||||
self.title = "Correspondent (Recent)"
|
||||
days = 365 * years
|
||||
correspondents = correspondents.filter(
|
||||
documents__created__gte=datetime.now() - timedelta(days=days)
|
||||
).distinct()
|
||||
|
||||
return [(c.id, c.name) for c in correspondents]
|
||||
|
||||
|
||||
class CommonAdmin(admin.ModelAdmin):
|
||||
list_per_page = settings.PAPERLESS_LIST_PER_PAGE
|
||||
|
||||
|
||||
class CorrespondentAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "match", "matching_algorithm", "document_count")
|
||||
list_display = (
|
||||
"name",
|
||||
"match",
|
||||
"matching_algorithm",
|
||||
"document_count",
|
||||
"last_correspondence"
|
||||
)
|
||||
list_filter = ("matching_algorithm",)
|
||||
list_editable = ("match", "matching_algorithm")
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super(CorrespondentAdmin, self).get_queryset(request)
|
||||
qs = qs.annotate(
|
||||
document_count=models.Count("documents"),
|
||||
last_correspondence=models.Max("documents__created")
|
||||
)
|
||||
return qs
|
||||
|
||||
def document_count(self, obj):
|
||||
return obj.documents.count()
|
||||
return obj.document_count
|
||||
document_count.admin_order_field = "document_count"
|
||||
|
||||
def last_correspondence(self, obj):
|
||||
return obj.last_correspondence
|
||||
last_correspondence.admin_order_field = "last_correspondence"
|
||||
|
||||
|
||||
class TagAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "colour", "match", "matching_algorithm",
|
||||
"document_count")
|
||||
list_display = (
|
||||
"name", "colour", "match", "matching_algorithm", "document_count")
|
||||
list_filter = ("colour", "matching_algorithm")
|
||||
list_editable = ("colour", "match", "matching_algorithm")
|
||||
|
||||
readonly_fields = ("slug",)
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super(TagAdmin, self).get_queryset(request)
|
||||
qs = qs.annotate(document_count=models.Count("documents"))
|
||||
return qs
|
||||
|
||||
def document_count(self, obj):
|
||||
return obj.documents.count()
|
||||
return obj.document_count
|
||||
document_count.admin_order_field = "document_count"
|
||||
|
||||
|
||||
class DocumentAdmin(CommonAdmin):
|
||||
@@ -137,15 +170,32 @@ class DocumentAdmin(CommonAdmin):
|
||||
}
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content", "tags__name")
|
||||
readonly_fields = ("added",)
|
||||
readonly_fields = ("added", "file_type", "storage_type",)
|
||||
list_display = ("title", "created", "added", "thumbnail", "correspondent",
|
||||
"tags_")
|
||||
list_filter = ("tags", "correspondent", FinancialYearFilter,
|
||||
MonthListFilter)
|
||||
list_filter = (
|
||||
"tags",
|
||||
("correspondent", RecentCorrespondentFilter),
|
||||
FinancialYearFilter
|
||||
)
|
||||
|
||||
filter_horizontal = ("tags",)
|
||||
|
||||
ordering = ["-created", "correspondent"]
|
||||
|
||||
actions = [
|
||||
add_tag_to_selected,
|
||||
remove_tag_from_selected,
|
||||
set_correspondent_on_selected,
|
||||
remove_correspondent_from_selected
|
||||
]
|
||||
|
||||
date_hierarchy = "created"
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.document_queue = []
|
||||
|
||||
def has_add_permission(self, request):
|
||||
return False
|
||||
|
||||
@@ -153,6 +203,79 @@ class DocumentAdmin(CommonAdmin):
|
||||
return obj.created.date().strftime("%Y-%m-%d")
|
||||
created_.short_description = "Created"
|
||||
|
||||
def changelist_view(self, request, extra_context=None):
|
||||
|
||||
response = super().changelist_view(
|
||||
request,
|
||||
extra_context=extra_context
|
||||
)
|
||||
|
||||
if request.method == "GET":
|
||||
cl = self.get_changelist_instance(request)
|
||||
self.document_queue = [doc.id for doc in cl.queryset]
|
||||
|
||||
return response
|
||||
|
||||
def change_view(self, request, object_id=None, form_url='',
|
||||
extra_context=None):
|
||||
|
||||
extra_context = extra_context or {}
|
||||
|
||||
if self.document_queue and object_id:
|
||||
if int(object_id) in self.document_queue:
|
||||
# There is a queue of documents
|
||||
current_index = self.document_queue.index(int(object_id))
|
||||
if current_index < len(self.document_queue) - 1:
|
||||
# ... and there are still documents in the queue
|
||||
extra_context["next_object"] = self.document_queue[
|
||||
current_index + 1
|
||||
]
|
||||
|
||||
return super(DocumentAdmin, self).change_view(
|
||||
request,
|
||||
object_id,
|
||||
form_url,
|
||||
extra_context=extra_context,
|
||||
)
|
||||
|
||||
def response_change(self, request, obj):
|
||||
|
||||
# This is mostly copied from ModelAdmin.response_change()
|
||||
opts = self.model._meta
|
||||
preserved_filters = self.get_preserved_filters(request)
|
||||
|
||||
msg_dict = {
|
||||
"name": opts.verbose_name,
|
||||
"obj": format_html(
|
||||
'<a href="{}">{}</a>',
|
||||
urlquote(request.path),
|
||||
obj
|
||||
),
|
||||
}
|
||||
if "_saveandeditnext" in request.POST:
|
||||
msg = format_html(
|
||||
'The {name} "{obj}" was changed successfully. '
|
||||
'Editing next object.',
|
||||
**msg_dict
|
||||
)
|
||||
self.message_user(request, msg, messages.SUCCESS)
|
||||
redirect_url = reverse(
|
||||
"admin:{}_{}_change".format(opts.app_label, opts.model_name),
|
||||
args=(request.POST["_next_object"],),
|
||||
current_app=self.admin_site.name
|
||||
)
|
||||
redirect_url = add_preserved_filters(
|
||||
{
|
||||
"preserved_filters": preserved_filters,
|
||||
"opts": opts
|
||||
},
|
||||
redirect_url
|
||||
)
|
||||
return HttpResponseRedirect(redirect_url)
|
||||
|
||||
return super().response_change(request, obj)
|
||||
|
||||
@mark_safe
|
||||
def thumbnail(self, obj):
|
||||
return self._html_tag(
|
||||
"a",
|
||||
@@ -165,8 +288,8 @@ class DocumentAdmin(CommonAdmin):
|
||||
),
|
||||
href=obj.download_url
|
||||
)
|
||||
thumbnail.allow_tags = True
|
||||
|
||||
@mark_safe
|
||||
def tags_(self, obj):
|
||||
r = ""
|
||||
for tag in obj.tags.all():
|
||||
@@ -183,10 +306,11 @@ class DocumentAdmin(CommonAdmin):
|
||||
)
|
||||
}
|
||||
)
|
||||
return mark_safe(r)
|
||||
tags_.allow_tags = True
|
||||
return r
|
||||
|
||||
@mark_safe
|
||||
def document(self, obj):
|
||||
# TODO: is this method even used anymore?
|
||||
return self._html_tag(
|
||||
"a",
|
||||
self._html_tag(
|
||||
@@ -199,7 +323,6 @@ class DocumentAdmin(CommonAdmin):
|
||||
),
|
||||
href=obj.download_url
|
||||
)
|
||||
document.allow_tags = True
|
||||
|
||||
@staticmethod
|
||||
def _html_tag(kind, inside=None, **kwargs):
|
||||
|
@@ -1,3 +1,4 @@
|
||||
from django.db import transaction
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -111,8 +112,11 @@ class Consumer:
|
||||
if not self.try_consume_file(file):
|
||||
self._ignore.append((file, mtime))
|
||||
|
||||
@transaction.atomic
|
||||
def try_consume_file(self, file):
|
||||
"Return True if file was consumed"
|
||||
"""
|
||||
Return True if file was consumed
|
||||
"""
|
||||
|
||||
if not re.match(FileInfo.REGEXES["title"], file):
|
||||
return False
|
||||
@@ -145,7 +149,7 @@ class Consumer:
|
||||
parsed_document = parser_class(doc)
|
||||
|
||||
try:
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
thumbnail = parsed_document.get_optimised_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
|
@@ -1,8 +1,14 @@
|
||||
from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter
|
||||
from django_filters.rest_framework import BooleanFilter, FilterSet
|
||||
|
||||
from .models import Correspondent, Document, Tag
|
||||
|
||||
|
||||
CHAR_KWARGS = (
|
||||
"startswith", "endswith", "contains",
|
||||
"istartswith", "iendswith", "icontains"
|
||||
)
|
||||
|
||||
|
||||
class CorrespondentFilterSet(FilterSet):
|
||||
|
||||
class Meta:
|
||||
@@ -31,34 +37,24 @@ class TagFilterSet(FilterSet):
|
||||
|
||||
class DocumentFilterSet(FilterSet):
|
||||
|
||||
CHAR_KWARGS = {
|
||||
"lookup_expr": (
|
||||
"startswith",
|
||||
"endswith",
|
||||
"contains",
|
||||
"istartswith",
|
||||
"iendswith",
|
||||
"icontains"
|
||||
)
|
||||
}
|
||||
|
||||
correspondent__name = CharFilter(
|
||||
field_name="correspondent__name", **CHAR_KWARGS)
|
||||
correspondent__slug = CharFilter(
|
||||
field_name="correspondent__slug", **CHAR_KWARGS)
|
||||
tags__name = CharFilter(
|
||||
field_name="tags__name", **CHAR_KWARGS)
|
||||
tags__slug = CharFilter(
|
||||
field_name="tags__slug", **CHAR_KWARGS)
|
||||
tags__empty = BooleanFilter(
|
||||
field_name="tags", lookup_expr="isnull", distinct=True)
|
||||
tags_empty = BooleanFilter(
|
||||
label="Is tagged",
|
||||
field_name="tags",
|
||||
lookup_expr="isnull",
|
||||
exclude=True
|
||||
)
|
||||
|
||||
class Meta:
|
||||
model = Document
|
||||
fields = {
|
||||
"title": [
|
||||
"startswith", "endswith", "contains",
|
||||
"istartswith", "iendswith", "icontains"
|
||||
],
|
||||
"content": ["contains", "icontains"],
|
||||
|
||||
"title": CHAR_KWARGS,
|
||||
"content": ("contains", "icontains"),
|
||||
|
||||
"correspondent__name": CHAR_KWARGS,
|
||||
"correspondent__slug": CHAR_KWARGS,
|
||||
|
||||
"tags__name": CHAR_KWARGS,
|
||||
"tags__slug": CHAR_KWARGS,
|
||||
|
||||
}
|
||||
|
@@ -55,7 +55,12 @@ class Command(Renderable, BaseCommand):
|
||||
documents = Document.objects.all()
|
||||
document_map = {d.pk: d for d in documents}
|
||||
manifest = json.loads(serializers.serialize("json", documents))
|
||||
for document_dict in manifest:
|
||||
|
||||
for index, document_dict in enumerate(manifest):
|
||||
|
||||
# Force output to unencrypted as that will be the current state.
|
||||
# The importer will make the decision to encrypt or not.
|
||||
manifest[index]["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
|
||||
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
|
@@ -94,7 +94,7 @@ class Command(Renderable, BaseCommand):
|
||||
document_path = os.path.join(self.source, doc_file)
|
||||
thumbnail_path = os.path.join(self.source, thumb_file)
|
||||
|
||||
if document.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
if settings.PASSPHRASE:
|
||||
|
||||
with open(document_path, "rb") as unencrypted:
|
||||
with open(document.source_path, "wb") as encrypted:
|
||||
@@ -112,3 +112,15 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
shutil.copy(document_path, document.source_path)
|
||||
shutil.copy(thumbnail_path, document.thumbnail_path)
|
||||
|
||||
# Reset the storage type to whatever we've used while importing
|
||||
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
if settings.PASSPHRASE:
|
||||
storage_type = Document.STORAGE_TYPE_GPG
|
||||
|
||||
Document.objects.filter(
|
||||
pk__in=[r["pk"] for r in self.manifest]
|
||||
).update(
|
||||
storage_type=storage_type
|
||||
)
|
||||
|
@@ -158,9 +158,4 @@ class Migration(migrations.Migration):
|
||||
name='modified',
|
||||
field=models.DateTimeField(auto_now=True, db_index=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='checksum',
|
||||
field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
|
||||
),
|
||||
]
|
||||
|
@@ -12,6 +12,11 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='checksum',
|
||||
field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='correspondent',
|
||||
name='is_insensitive',
|
||||
|
52
src/documents/migrations/0022_auto_20181007_1420.py
Normal file
52
src/documents/migrations/0022_auto_20181007_1420.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# Generated by Django 2.0.8 on 2018-10-07 14:20
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.utils.text import slugify
|
||||
|
||||
|
||||
def re_slug_all_the_things(apps, schema_editor):
|
||||
"""
|
||||
Rewrite all slug values to make sure they're actually slugs before we brand
|
||||
them as uneditable.
|
||||
"""
|
||||
|
||||
Tag = apps.get_model("documents", "Tag")
|
||||
Correspondent = apps.get_model("documents", "Tag")
|
||||
|
||||
for klass in (Tag, Correspondent):
|
||||
for instance in klass.objects.all():
|
||||
klass.objects.filter(
|
||||
pk=instance.pk
|
||||
).update(
|
||||
slug=slugify(instance.slug)
|
||||
)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0021_document_storage_type'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='tag',
|
||||
options={'ordering': ('name',)},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='correspondent',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, editable=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='file_type',
|
||||
field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(blank=True, editable=False),
|
||||
),
|
||||
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
|
||||
]
|
@@ -11,6 +11,7 @@ from django.conf import settings
|
||||
from django.db import models
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from .managers import LogManager
|
||||
@@ -37,7 +38,7 @@ class MatchingModel(models.Model):
|
||||
)
|
||||
|
||||
name = models.CharField(max_length=128, unique=True)
|
||||
slug = models.SlugField(blank=True)
|
||||
slug = models.SlugField(blank=True, editable=False)
|
||||
|
||||
match = models.CharField(max_length=256, blank=True)
|
||||
matching_algorithm = models.PositiveIntegerField(
|
||||
@@ -147,9 +148,7 @@ class MatchingModel(models.Model):
|
||||
def save(self, *args, **kwargs):
|
||||
|
||||
self.match = self.match.lower()
|
||||
|
||||
if not self.slug:
|
||||
self.slug = slugify(self.name)
|
||||
self.slug = slugify(self.name)
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
@@ -452,7 +451,7 @@ class FileInfo:
|
||||
r = []
|
||||
for t in tags.split(","):
|
||||
r.append(Tag.objects.get_or_create(
|
||||
slug=t.lower(),
|
||||
slug=slugify(t),
|
||||
defaults={"name": t}
|
||||
)[0])
|
||||
return tuple(r)
|
||||
|
@@ -1,9 +1,13 @@
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
import dateparser
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@@ -36,6 +40,9 @@ class DocumentParser:
|
||||
"""
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
|
||||
OPTIPNG = settings.OPTIPNG_BINARY
|
||||
|
||||
def __init__(self, path):
|
||||
self.document_path = path
|
||||
@@ -49,6 +56,19 @@ class DocumentParser:
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def optimise_thumbnail(self, in_path):
|
||||
|
||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||
|
||||
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
|
||||
if not subprocess.Popen(args).wait() == 0:
|
||||
raise ParseError("Optipng failed at {}".format(args))
|
||||
|
||||
return out_path
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
return self.optimise_thumbnail(self.get_thumbnail())
|
||||
|
||||
def get_text(self):
|
||||
"""
|
||||
Returns the text from the document and only the text.
|
||||
@@ -59,7 +79,75 @@ class DocumentParser:
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def __parser__(ds, date_order):
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
return dateparser.parse(ds,
|
||||
settings={"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE":
|
||||
True})
|
||||
date = None
|
||||
date_string = None
|
||||
|
||||
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||
title = os.path.basename(self.document_path)
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if self.FILENAME_DATE_ORDER:
|
||||
self.log("info", "Checking document title for date")
|
||||
for m in re.finditer(DATE_REGEX, title):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser__(date_string, self.FILENAME_DATE_ORDER)
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
self.log("info",
|
||||
"Detected document date {} based on string {} "
|
||||
"from document title"
|
||||
"".format(date.isoformat(), date_string))
|
||||
return date
|
||||
|
||||
try:
|
||||
# getting text after checking filename will save time if only
|
||||
# looking at the filename instead of the whole text
|
||||
text = self.get_text()
|
||||
except ParseError:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser__(date_string, self.DATE_ORDER)
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
if date is not None:
|
||||
self.log(
|
||||
"info",
|
||||
"Detected document date {} based on string {}".format(
|
||||
date.isoformat(),
|
||||
date_string
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
|
@@ -1,5 +1,21 @@
|
||||
{% extends 'admin/change_form.html' %}
|
||||
|
||||
{% block content %}
|
||||
|
||||
{{ block.super }}
|
||||
|
||||
{% if next_object %}
|
||||
<script type="text/javascript">//<![CDATA[
|
||||
(function($){
|
||||
$('<input type="submit" value="Save and edit next" name="_saveandeditnext" />')
|
||||
.prependTo('div.submit-row');
|
||||
$('<input type="hidden" value="{{next_object}}" name="_next_object" />')
|
||||
.prependTo('div.submit-row');
|
||||
})(django.jQuery);
|
||||
//]]></script>
|
||||
{% endif %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block footer %}
|
||||
|
||||
@@ -10,4 +26,4 @@
|
||||
django.jQuery(".field-created input").first().attr("type", "date")
|
||||
</script>
|
||||
|
||||
{% endblock footer %}
|
||||
{% endblock footer %}
|
||||
|
@@ -0,0 +1,50 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
|
||||
|
||||
{% load i18n l10n admin_urls static %}
|
||||
{% load staticfiles %}
|
||||
|
||||
|
||||
{% block extrahead %}
|
||||
{{ block.super }}
|
||||
{{ media }}
|
||||
<script type="text/javascript" src="{% static 'admin/js/cancel.js' %}"></script>
|
||||
{% endblock %}
|
||||
|
||||
|
||||
{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} delete-confirmation delete-selected-confirmation{% endblock %}
|
||||
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:app_list' app_label=opts.app_label %}">{{ opts.app_config.verbose_name }}</a>
|
||||
› <a href="{% url opts|admin_urlname:'changelist' %}">{{ opts.verbose_name_plural|capfirst }}</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<p>Please select the {{itemname}}.</p>
|
||||
<form method="post">{% csrf_token %}
|
||||
<div>
|
||||
{% for obj in queryset %}
|
||||
<input type="hidden" name="{{ action_checkbox_name }}" value="{{ obj.pk|unlocalize }}"/>
|
||||
{% endfor %}
|
||||
<p>
|
||||
<select name="obj_id">
|
||||
{% for obj in objects %}
|
||||
<option value="{{ obj.id }}">{{ obj.name }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</p>
|
||||
|
||||
<input type="hidden" name="action" value="{{ action }}"/>
|
||||
<input type="hidden" name="post" value="yes" />
|
||||
<p>
|
||||
<input type="submit" value="{% trans 'Confirm' %}" />
|
||||
<a href="#" class="button cancel-link">{% trans "Go back" %}</a>
|
||||
</p>
|
||||
</div>
|
||||
</form>
|
||||
{% endblock %}
|
@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
|
||||
error = "Paperless can't find {}. Without it, consumption is impossible."
|
||||
hint = "Either it's not in your ${PATH} or it's not installed."
|
||||
|
||||
binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
|
||||
binaries = (
|
||||
settings.CONVERT_BINARY,
|
||||
settings.OPTIPNG_BINARY,
|
||||
settings.UNPAPER_BINARY,
|
||||
"tesseract"
|
||||
)
|
||||
|
||||
check_messages = []
|
||||
for binary in binaries:
|
||||
|
@@ -1,15 +1,20 @@
|
||||
from django.contrib.auth.models import User as DjangoUser
|
||||
|
||||
|
||||
class User:
|
||||
"""
|
||||
This is a dummy django User used with our middleware to disable
|
||||
login authentication if that is configured in paperless.conf
|
||||
This is a dummy django User used with our middleware to disable
|
||||
login authentication if that is configured in paperless.conf
|
||||
"""
|
||||
|
||||
is_superuser = True
|
||||
is_active = True
|
||||
is_staff = True
|
||||
is_authenticated = True
|
||||
|
||||
# Must be -1 to avoid colliding with real user ID's (which start at 1)
|
||||
id = -1
|
||||
@property
|
||||
def id(self):
|
||||
return DjangoUser.objects.order_by("pk").first().pk
|
||||
|
||||
@property
|
||||
def pk(self):
|
||||
@@ -17,9 +22,9 @@ class User:
|
||||
|
||||
|
||||
"""
|
||||
NOTE: These are here as a hack instead of being in the User definition
|
||||
above due to the way pycodestyle handles lamdbdas.
|
||||
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
|
||||
NOTE: These are here as a hack instead of being in the User definition
|
||||
NOTE: above due to the way pycodestyle handles lamdbdas.
|
||||
NOTE: See https://github.com/PyCQA/pycodestyle/issues/379 for more.
|
||||
"""
|
||||
|
||||
User.has_module_perms = lambda *_: True
|
||||
|
@@ -22,12 +22,12 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
|
||||
load_dotenv("/usr/local/etc/paperless.conf")
|
||||
|
||||
|
||||
def __get_boolean(key):
|
||||
def __get_boolean(key, default="NO"):
|
||||
"""
|
||||
Return a boolean value based on whatever the user has supplied in the
|
||||
environment based on whether the value "looks like" it's True or not.
|
||||
"""
|
||||
return bool(os.getenv(key, "NO").lower() in ("yes", "y", "1", "t", "true"))
|
||||
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
|
||||
@@ -47,7 +47,7 @@ SECRET_KEY = os.getenv(
|
||||
|
||||
|
||||
# SECURITY WARNING: don't run with debug turned on in production!
|
||||
DEBUG = True
|
||||
DEBUG = __get_boolean("PAPERLESS_DEBUG", "YES")
|
||||
|
||||
LOGIN_URL = "admin:login"
|
||||
|
||||
@@ -144,13 +144,14 @@ DATABASES = {
|
||||
}
|
||||
}
|
||||
|
||||
if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
|
||||
if os.getenv("PAPERLESS_DBUSER"):
|
||||
DATABASES["default"] = {
|
||||
"ENGINE": "django.db.backends.postgresql_psycopg2",
|
||||
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
|
||||
"USER": os.getenv("PAPERLESS_DBUSER"),
|
||||
"PASSWORD": os.getenv("PAPERLESS_DBPASS")
|
||||
}
|
||||
if os.getenv("PAPERLESS_DBPASS"):
|
||||
DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS")
|
||||
|
||||
|
||||
# Password validation
|
||||
@@ -198,6 +199,16 @@ STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
|
||||
MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")
|
||||
|
||||
|
||||
# Other
|
||||
|
||||
# Disable Django's artificial limit on the number of form fields to submit at
|
||||
# once. This is a protection against overloading the server, but since this is
|
||||
# a self-hosted sort of gig, the benefits of being able to mass-delete a tonne
|
||||
# of log entries outweight the benefits of such a safeguard.
|
||||
|
||||
DATA_UPLOAD_MAX_NUMBER_FIELDS = None
|
||||
|
||||
|
||||
# Paperless-specific stuff
|
||||
# You shouldn't have to edit any of these values. Rather, you can set these
|
||||
# values in /etc/paperless.conf instead.
|
||||
@@ -246,6 +257,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
||||
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
||||
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
|
||||
|
||||
# OptiPNG
|
||||
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
|
||||
|
||||
# Unpaper
|
||||
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||
|
||||
@@ -293,3 +307,9 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
|
||||
# Specify the default date order (for autodetected dates)
|
||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
# Specify for how many years a correspondent is considered recent. Recent
|
||||
# correspondents will be shown in a separate "Recent correspondents" filter as
|
||||
# well. Set to 0 to disable this filter.
|
||||
PAPERLESS_RECENT_CORRESPONDENT_YEARS = int(os.getenv(
|
||||
"PAPERLESS_RECENT_CORRESPONDENT_YEARS", 0))
|
||||
|
@@ -1 +1 @@
|
||||
__version__ = (2, 3, 0)
|
||||
__version__ = (2, 5, 0)
|
||||
|
@@ -4,7 +4,6 @@ import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import dateparser
|
||||
import langdetect
|
||||
import pyocr
|
||||
from django.conf import settings
|
||||
@@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
import pdftotext
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
|
||||
from .languages import ISO639
|
||||
|
||||
@@ -33,8 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
@@ -47,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
|
||||
out_path = os.path.join(self.tempdir, "convert.png")
|
||||
|
||||
# Run convert to get a decent thumbnail
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-scale", "500x5000",
|
||||
"-alpha", "remove",
|
||||
"{}[0]".format(self.document_path),
|
||||
os.path.join(self.tempdir, "convert.png")
|
||||
out_path
|
||||
)
|
||||
|
||||
return os.path.join(self.tempdir, "convert.png")
|
||||
return out_path
|
||||
|
||||
def _is_ocred(self):
|
||||
|
||||
@@ -153,7 +153,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError("Language detection failed")
|
||||
error_msg = ("Language detection failed. Set "
|
||||
"PAPERLESS_FORGIVING_OCR in config file to continue "
|
||||
"anyway.")
|
||||
raise OCRError(error_msg)
|
||||
|
||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
@@ -173,8 +176,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError(
|
||||
"The guessed language is not available in this instance of "
|
||||
"Tesseract."
|
||||
"The guessed language ({}) is not available in this instance "
|
||||
"of Tesseract.".format(guessed_language)
|
||||
)
|
||||
|
||||
def _ocr(self, imgs, lang):
|
||||
@@ -203,63 +206,6 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
|
||||
def get_date(self):
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
if self.FILENAME_DATE_ORDER:
|
||||
self.log("info", "Checking document title for date")
|
||||
text = os.path.basename(self.document_path)
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.FILENAME_DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
self.log("info",
|
||||
"Detected document date {} based on string {} "
|
||||
"from document title"
|
||||
"".format(date.isoformat(), datestring))
|
||||
return date
|
||||
|
||||
try:
|
||||
self.log('info', "Checking document text for date")
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.isoformat() +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
|
||||
@@ -275,7 +221,8 @@ def run_convert(*args):
|
||||
|
||||
def run_unpaper(args):
|
||||
unpaper, pnm = args
|
||||
command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
|
||||
command_args = (unpaper, "--overwrite", pnm,
|
||||
pnm.replace(".pnm", ".unpaper.pnm"))
|
||||
if not subprocess.Popen(command_args).wait() == 0:
|
||||
raise ParseError("Unpaper failed at {}".format(command_args))
|
||||
|
||||
|
@@ -494,3 +494,42 @@ class TestDate(TestCase):
|
||||
datetime.datetime(2013, 12, 11, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE))
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-0590 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_past(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-2350 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_future(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||
return_value="01-07-0590 00:00:00"
|
||||
)
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_crazy_date_past(self, *args):
|
||||
document = RasterisedDocumentParser("/dev/null")
|
||||
document.get_text()
|
||||
self.assertIsNone(document.get_date())
|
||||
|
@@ -1,11 +1,9 @@
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import dateparser
|
||||
from django.conf import settings
|
||||
|
||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
|
||||
|
||||
class TextDocumentParser(DocumentParser):
|
||||
@@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
|
||||
@@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
The thumbnail of a txt is just a 500px wide image of the text
|
||||
The thumbnail of a text file is just a 500px wide image of the text
|
||||
rendered onto a letter-sized page.
|
||||
"""
|
||||
# The below is heavily cribbed from https://askubuntu.com/a/590951
|
||||
@@ -35,7 +32,7 @@ class TextDocumentParser(DocumentParser):
|
||||
text_color = "black" # text color
|
||||
psize = [500, 647] # icon size
|
||||
n_lines = 50 # number of lines to show
|
||||
output_file = os.path.join(self.tempdir, "convert-txt.png")
|
||||
out_path = os.path.join(self.tempdir, "convert.png")
|
||||
|
||||
temp_bg = os.path.join(self.tempdir, "bg.png")
|
||||
temp_txlayer = os.path.join(self.tempdir, "tx.png")
|
||||
@@ -46,9 +43,13 @@ class TextDocumentParser(DocumentParser):
|
||||
work_size = ",".join([str(n - 1) for n in psize])
|
||||
r = str(round(psize[0] / 10))
|
||||
rounded = ",".join([r, r])
|
||||
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
|
||||
'"fill ', bg_color, ' roundrectangle 0,0,',
|
||||
work_size, ",", rounded, '" ', temp_bg)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
"-size ", picsize,
|
||||
' xc:none -draw ',
|
||||
'"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501
|
||||
temp_bg
|
||||
)
|
||||
|
||||
def read_text():
|
||||
with open(self.document_path, 'r') as src:
|
||||
@@ -57,22 +58,29 @@ class TextDocumentParser(DocumentParser):
|
||||
return text.replace('"', "'")
|
||||
|
||||
def create_txlayer():
|
||||
run_command(self.CONVERT,
|
||||
"-background none",
|
||||
"-fill",
|
||||
text_color,
|
||||
"-pointsize", "12",
|
||||
"-border 4 -bordercolor none",
|
||||
"-size ", txsize,
|
||||
' caption:"', read_text(), '" ',
|
||||
temp_txlayer)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
"-background none",
|
||||
"-fill",
|
||||
text_color,
|
||||
"-pointsize", "12",
|
||||
"-border 4 -bordercolor none",
|
||||
"-size ", txsize,
|
||||
' caption:"', read_text(), '" ',
|
||||
temp_txlayer
|
||||
)
|
||||
|
||||
create_txlayer()
|
||||
create_bg()
|
||||
run_command(self.CONVERT, temp_bg, temp_txlayer,
|
||||
"-background None -layers merge ", output_file)
|
||||
run_command(
|
||||
self.CONVERT,
|
||||
temp_bg,
|
||||
temp_txlayer,
|
||||
"-background None -layers merge ",
|
||||
out_path
|
||||
)
|
||||
|
||||
return output_file
|
||||
return out_path
|
||||
|
||||
def get_text(self):
|
||||
|
||||
@@ -84,40 +92,6 @@ class TextDocumentParser(DocumentParser):
|
||||
|
||||
return self._text
|
||||
|
||||
def get_date(self):
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.isoformat() +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_command(*args):
|
||||
environment = os.environ.copy()
|
||||
|
19
src/reminders/migrations/0002_auto_20181007_1420.py
Normal file
19
src/reminders/migrations/0002_auto_20181007_1420.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Generated by Django 2.0.8 on 2018-10-07 14:20
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('reminders', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='reminder',
|
||||
name='document',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='documents.Document'),
|
||||
),
|
||||
]
|
@@ -4,7 +4,6 @@ from django.db import models
|
||||
class Reminder(models.Model):
|
||||
|
||||
document = models.ForeignKey(
|
||||
"documents.Document", on_delete=models.PROTECT
|
||||
)
|
||||
"documents.Document", on_delete=models.PROTECT)
|
||||
date = models.DateTimeField()
|
||||
note = models.TextField(blank=True)
|
||||
|
@@ -5,7 +5,7 @@
|
||||
|
||||
[tox]
|
||||
skipsdist = True
|
||||
envlist = py34, py35, py36, pycodestyle, doc
|
||||
envlist = py34, py35, py36, py37, pycodestyle, doc
|
||||
|
||||
[testenv]
|
||||
commands = pytest
|
||||
|
Reference in New Issue
Block a user