- document index
- api access for thumbnails/downloads
- more api filters

updated
- pipfile

removed
- filename handling
- legacy thumb/download access
- obsolete admin gui settings (per page items, FY, inline view)
This commit is contained in:
Jonas Winkler 2020-10-25 23:03:02 +01:00
parent 9187026c47
commit 052c1680f3
16 changed files with 327 additions and 572 deletions

137
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "ade1227b607ebc7b4088c267af05232275750d8ea212a5c8a4b75f84f22ec849"
"sha256": "e5813d1cc93070f225d9fabff0f19d8552563d5b55c8d048be4cf001ddccbbb0"
},
"pipfile-spec": 6,
"requires": {},
@ -149,38 +149,37 @@
},
"pillow": {
"hashes": [
"sha256:04d984e45a0b9815f4b407e8aadb50f25fbb82a605d89db927376e94c3adf371",
"sha256:06e730451b70471c08b8a0ee7f18e7e1df310dba9c780bbfb730a13102b143db",
"sha256:1f59596af2b3d64a9e43f9d6509b7a51db744d0eecc23297617c604e6823c6ae",
"sha256:233513465a2f25fce537b965621866da3d1f02e15708f371dd4e19f0fb7b7711",
"sha256:2696f1a6402c1a42ed12c5cd8adfb4b381c32d41e35a34b8ee544309ef854172",
"sha256:2ca55a4443b463eec90528ac27be14d226b1c2b972178bc7d4d282ce89e47b6a",
"sha256:30615e9115f976e00a938a28c7152562e8cf8e221ddacf4446dd8b20c0d97333",
"sha256:3a77e7b9f8991b81d7be8e0b2deab05013cf3ebb24ac2b863d2979acb68c73dd",
"sha256:54667c8ab16658cc0b7d824d8706b440d4db8382a3561042758bdfd48ca99298",
"sha256:59304c67d12394815331eda95ec892bf54ad95e0aa7bc1ccd8e0a4a5a25d4bf3",
"sha256:594f2f25b7bcfd9542c41b9df156fb5104f19f5fcefa51b1447f1d9f64c9cc14",
"sha256:5b5dde5dcedc4e6f5a71d7654a3c6e189ced82e97d7896b1ca5a5c5e4e0e916f",
"sha256:6bcea85f93fb2c94a1bcd35704c348a929a7fb24a0ec0cc2b9fcbb0046b87176",
"sha256:718d7f0eb3351052023b33fe0f83fc9e3beeb7cbacbd0ff2b52524e2153e4598",
"sha256:7c4a7ee37027ca716f42726b6f9fc491c13c843c7af559e0767dfab1ae9682d4",
"sha256:87a855b64a9b692604f6339baa4f9913d06838df1b4ccf0cb899dd18f56ec03c",
"sha256:8c006d52365c0a6bb41a07f9c8f9f458ae8170e0af3b8c49bf7089347066b97b",
"sha256:8e29701229705615d3dcfc439c7c46f40f913e57c7fe322b1efc30d3f37d1287",
"sha256:9b5b41737853bc49943864d5980dfb401a09e78ddb471e71291810ccdeadd712",
"sha256:b04569ff215b85ce3e2954979d2d5e0bf84007e43ddcf84b632fc6bc18e07909",
"sha256:b731d45764349313bd956c07bdc1d43803bb0ad2b11354328a074e416c7d84bc",
"sha256:c12e33cb17e2e12049a49b77696ee479791a4e44e541fdc393ae043e1246389f",
"sha256:c41442c3814afeba1f6f16fd70cdf312a2c73c6dee8dc3ac8926bb115713ad1d",
"sha256:c4d743c5c91424965707c9c8edc58b7cb43c127dcaf191fbcd304e2082eef56a",
"sha256:d6766fd28f4f47cf93280a57e3dc6a9d11bdada1a6e9f019b8c62b12bbc86f6a",
"sha256:d904570afcdbec40eb6bdbe24cba8d95c0215a2c0cbbc9c16301045bc8504c1f",
"sha256:e674be2f349ea810e221b0113bd4491f53584ac848d5bcc3b62443cfa11d9c40",
"sha256:e6ac40f1a62a227eb00226eb64c9c82bc878a3ed700b5414d34c9be57be87e87",
"sha256:f5270369c799b4405ed47d45c88c09fbd7942fc9fb9891c0dabf0b8c751b625d"
"sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
"sha256:0a2e8d03787ec7ad71dc18aec9367c946ef8ef50e1e78c71f743bc3a770f9fae",
"sha256:0eeeae397e5a79dc088d8297a4c2c6f901f8fb30db47795113a4a605d0f1e5ce",
"sha256:11c5c6e9b02c9dac08af04f093eb5a2f84857df70a7d4a6a6ad461aca803fb9e",
"sha256:2fb113757a369a6cdb189f8df3226e995acfed0a8919a72416626af1a0a71140",
"sha256:4b0ef2470c4979e345e4e0cc1bbac65fda11d0d7b789dbac035e4c6ce3f98adb",
"sha256:59e903ca800c8cfd1ebe482349ec7c35687b95e98cefae213e271c8c7fffa021",
"sha256:5abd653a23c35d980b332bc0431d39663b1709d64142e3652890df4c9b6970f6",
"sha256:5f9403af9c790cc18411ea398a6950ee2def2a830ad0cfe6dc9122e6d528b302",
"sha256:6b4a8fd632b4ebee28282a9fef4c341835a1aa8671e2770b6f89adc8e8c2703c",
"sha256:6c1aca8231625115104a06e4389fcd9ec88f0c9befbabd80dc206c35561be271",
"sha256:795e91a60f291e75de2e20e6bdd67770f793c8605b553cb6e4387ce0cb302e09",
"sha256:7ba0ba61252ab23052e642abdb17fd08fdcfdbbf3b74c969a30c58ac1ade7cd3",
"sha256:7c9401e68730d6c4245b8e361d3d13e1035cbc94db86b49dc7da8bec235d0015",
"sha256:81f812d8f5e8a09b246515fac141e9d10113229bc33ea073fec11403b016bcf3",
"sha256:895d54c0ddc78a478c80f9c438579ac15f3e27bf442c2a9aa74d41d0e4d12544",
"sha256:8de332053707c80963b589b22f8e0229f1be1f3ca862a932c1bcd48dafb18dd8",
"sha256:92c882b70a40c79de9f5294dc99390671e07fc0b0113d472cbea3fde15db1792",
"sha256:95edb1ed513e68bddc2aee3de66ceaf743590bf16c023fb9977adc4be15bd3f0",
"sha256:b63d4ff734263ae4ce6593798bcfee6dbfb00523c82753a3a03cbc05555a9cc3",
"sha256:bd7bf289e05470b1bc74889d1466d9ad4a56d201f24397557b6f65c24a6844b8",
"sha256:cc3ea6b23954da84dbee8025c616040d9aa5eaf34ea6895a0a762ee9d3e12e11",
"sha256:cc9ec588c6ef3a1325fa032ec14d97b7309db493782ea8c304666fb10c3bd9a7",
"sha256:d3d07c86d4efa1facdf32aa878bd508c0dc4f87c48125cc16b937baa4e5b5e11",
"sha256:d8a96747df78cda35980905bf26e72960cba6d355ace4780d4bdde3b217cdf1e",
"sha256:e38d58d9138ef972fceb7aeec4be02e3f01d383723965bfcef14d174c8ccd039",
"sha256:eb472586374dc66b31e36e14720747595c2b265ae962987261f044e5cce644b5",
"sha256:fbd922f702582cb0d71ef94442bfca57624352622d75e3be7a1e7e9360b07e72"
],
"index": "pypi",
"version": "==8.0.0"
"version": "==8.0.1"
},
"psycopg2": {
"hashes": [
@ -241,35 +240,35 @@
},
"regex": {
"hashes": [
"sha256:02686a2f0b1a4be0facdd0d3ad4dc6c23acaa0f38fb5470d892ae88584ba705c",
"sha256:137da580d1e6302484be3ef41d72cf5c3ad22a076070051b7449c0e13ab2c482",
"sha256:20cdd7e1736f4f61a5161aa30d05ac108ab8efc3133df5eb70fe1e6a23ea1ca6",
"sha256:25991861c6fef1e5fd0a01283cf5658c5e7f7aa644128e85243bc75304e91530",
"sha256:26b85672275d8c7a9d4ff93dbc4954f5146efdb2ecec89ad1de49439984dea14",
"sha256:2f60ba5c33f00ce9be29a140e6f812e39880df8ba9cb92ad333f0016dbc30306",
"sha256:3dd952f3f8dc01b72c0cf05b3631e05c50ac65ddd2afdf26551638e97502107b",
"sha256:578ac6379e65eb8e6a85299b306c966c852712c834dc7eef0ba78d07a828f67b",
"sha256:5d4a3221f37520bb337b64a0632716e61b26c8ae6aaffceeeb7ad69c009c404b",
"sha256:608d6c05452c0e6cc49d4d7407b4767963f19c4d2230fa70b7201732eedc84f2",
"sha256:65b6b018b07e9b3b6a05c2c3bb7710ed66132b4df41926c243887c4f1ff303d5",
"sha256:698f8a5a2815e1663d9895830a063098ae2f8f2655ae4fdc5dfa2b1f52b90087",
"sha256:6c72adb85adecd4522a488a751e465842cdd2a5606b65464b9168bf029a54272",
"sha256:6d4cdb6c20e752426b2e569128488c5046fb1b16b1beadaceea9815c36da0847",
"sha256:6e9f72e0ee49f7d7be395bfa29e9533f0507a882e1e6bf302c0a204c65b742bf",
"sha256:828618f3c3439c5e6ef8621e7c885ca561bbaaba90ddbb6a7dfd9e1ec8341103",
"sha256:85b733a1ef2b2e7001aff0e204a842f50ad699c061856a214e48cfb16ace7d0c",
"sha256:8958befc139ac4e3f16d44ec386c490ea2121ed8322f4956f83dd9cad8e9b922",
"sha256:a51e51eecdac39a50ede4aeed86dbef4776e3b73347d31d6ad0bc9648ba36049",
"sha256:aeac7c9397480450016bc4a840eefbfa8ca68afc1e90648aa6efbfe699e5d3bb",
"sha256:aef23aed9d4017cc74d37f703d57ce254efb4c8a6a01905f40f539220348abf9",
"sha256:af1f5e997dd1ee71fb6eb4a0fb6921bf7a778f4b62f1f7ef0d7445ecce9155d6",
"sha256:b5eeaf4b5ef38fab225429478caf71f44d4a0b44d39a1aa4d4422cda23a9821b",
"sha256:d25f5cca0f3af6d425c9496953445bf5b288bb5b71afc2b8308ad194b714c159",
"sha256:d81be22d5d462b96a2aa5c512f741255ba182995efb0114e5a946fe254148df1",
"sha256:e935a166a5f4c02afe3f7e4ce92ce5a786f75c6caa0c4ce09c922541d74b77e8",
"sha256:ef3a55b16c6450574734db92e0a3aca283290889934a23f7498eaf417e3af9f0"
"sha256:0cb23ed0e327c18fb7eac61ebbb3180ebafed5b9b86ca2e15438201e5903b5dd",
"sha256:1a065e7a6a1b4aa851a0efa1a2579eabc765246b8b3a5fd74000aaa3134b8b4e",
"sha256:1a511470db3aa97432ac8c1bf014fcc6c9fbfd0f4b1313024d342549cf86bcd6",
"sha256:1c447b0d108cddc69036b1b3910fac159f2b51fdeec7f13872e059b7bc932be1",
"sha256:2278453c6a76280b38855a263198961938108ea2333ee145c5168c36b8e2b376",
"sha256:240509721a663836b611fa13ca1843079fc52d0b91ef3f92d9bba8da12e768a0",
"sha256:4e21340c07090ddc8c16deebfd82eb9c9e1ec5e62f57bb86194a2595fd7b46e0",
"sha256:570e916a44a361d4e85f355aacd90e9113319c78ce3c2d098d2ddf9631b34505",
"sha256:59d5c6302d22c16d59611a9fd53556554010db1d47e9df5df37be05007bebe75",
"sha256:6a46eba253cedcbe8a6469f881f014f0a98819d99d341461630885139850e281",
"sha256:6f567df0601e9c7434958143aebea47a9c4b45434ea0ae0286a4ec19e9877169",
"sha256:781906e45ef1d10a0ed9ec8ab83a09b5e0d742de70e627b20d61ccb1b1d3964d",
"sha256:8469377a437dbc31e480993399fd1fd15fe26f382dc04c51c9cb73e42965cc06",
"sha256:8cd0d587aaac74194ad3e68029124c06245acaeddaae14cb45844e5c9bebeea4",
"sha256:97a023f97cddf00831ba04886d1596ef10f59b93df7f855856f037190936e868",
"sha256:a973d5a7a324e2a5230ad7c43f5e1383cac51ef4903bf274936a5634b724b531",
"sha256:af360e62a9790e0a96bc9ac845d87bfa0e4ee0ee68547ae8b5a9c1030517dbef",
"sha256:b706c70070eea03411b1761fff3a2675da28d042a1ab7d0863b3efe1faa125c9",
"sha256:bfd7a9fddd11d116a58b62ee6c502fd24cfe22a4792261f258f886aa41c2a899",
"sha256:c30d8766a055c22e39dd7e1a4f98f6266169f2de05db737efe509c2fb9c8a3c8",
"sha256:c53dc8ee3bb7b7e28ee9feb996a0c999137be6c1d3b02cb6b3c4cba4f9e5ed09",
"sha256:c95d514093b80e5309bdca5dd99e51bcf82c44043b57c34594d9d7556bd04d05",
"sha256:d43cf21df524283daa80ecad551c306b7f52881c8d0fe4e3e76a96b626b6d8d8",
"sha256:d62205f00f461fe8b24ade07499454a3b7adf3def1225e258b994e2215fd15c5",
"sha256:e289a857dca3b35d3615c3a6a438622e20d1bf0abcb82c57d866c8d0be3f44c4",
"sha256:e5f6aa56dda92472e9d6f7b1e6331f4e2d51a67caafff4d4c5121cadac03941e",
"sha256:f4b1c65ee86bfbf7d0c3dfd90592a9e3d6e9ecd36c367c884094c050d4c35d04"
],
"version": "==2020.10.15"
"version": "==2020.10.23"
},
"scikit-learn": {
"hashes": [
@ -349,14 +348,6 @@
],
"version": "==2.1"
},
"whitenoise": {
"hashes": [
"sha256:05ce0be39ad85740a78750c86a93485c40f08ad8c62a6006de0233765996e5c7",
"sha256:05d00198c777028d72d8b0bbd234db605ef6d60e9410125124002518a48e515d"
],
"index": "pypi",
"version": "==5.2.0"
},
"whoosh": {
"hashes": [
"sha256:7ca5633dbfa9e0e0fa400d3151a8a0c4bec53bd2ecedc0a67705b17565c31a83",
@ -624,11 +615,11 @@
},
"pygments": {
"hashes": [
"sha256:307543fe65c0947b126e83dd5a61bd8acbd84abec11f43caebaf5534cbc17998",
"sha256:926c3f319eda178d1bd90851e4317e6d8cdb5e292a3386aac9bd75eca29cf9c7"
"sha256:381985fcc551eb9d37c52088a32914e00517e57f4a21609f48141ba08e193fa0",
"sha256:88a0bbcd659fcb9573703957c6b9cff9fab7295e6e76db54c9d00ae42df32773"
],
"markers": "python_version >= '3.5'",
"version": "==2.7.1"
"version": "==2.7.2"
},
"pyparsing": {
"hashes": [
@ -656,11 +647,11 @@
},
"pytest-django": {
"hashes": [
"sha256:0e91003fdd41ac0322c1978682be2ca180bc564203dd53c698f99242bf513614",
"sha256:5f964ccda1f551e00589ab0679a7c45c36c509a44b5bfb5ad07954e0ae3f4bed"
"sha256:10e384e6b8912ded92db64c58be8139d9ae23fb8361e5fc139d8e4f8fc601bc2",
"sha256:26f02c16d36fd4c8672390deebe3413678d89f30720c16efb8b2a6bf63b9041f"
],
"index": "pypi",
"version": "==4.0.0"
"version": "==4.1.0"
},
"pytest-env": {
"hashes": [

View File

@ -39,6 +39,10 @@ PAPERLESS_CONSUMPTION_DIR=""
#PAPERLESS_STATICDIR=""
# This is where the whoosh document index is stored
#PAPERLESS_INDEX_DIR="/path/to/index"
# Override the MEDIA_URL here. Unless you're hosting Paperless off a subdomain
# like /paperless/, you probably don't need to change this.
#PAPERLESS_MEDIA_URL="/media/"
@ -262,18 +266,6 @@ PAPERLESS_EMAIL_SECRET=""
#PAPERLESS_TIME_ZONE=UTC
# If set, Paperless will show document filters per financial year.
# The dates must be in the format "mm-dd", for example "07-15" for July 15.
#PAPERLESS_FINANCIAL_YEAR_START="mm-dd"
#PAPERLESS_FINANCIAL_YEAR_END="mm-dd"
# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.
#PAPERLESS_LIST_PER_PAGE=100
###############################################################################
#### Third-Party Binaries ####
###############################################################################

View File

@ -1,85 +1,12 @@
from datetime import datetime
from django.conf import settings
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.db import models
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from .models import Correspondent, Document, DocumentType, Log, Tag
class FinancialYearFilter(admin.SimpleListFilter):
title = "Financial Year"
parameter_name = "fy"
_fy_wraps = None
def _fy_start(self, year):
"""Return date of the start of financial year for the given year."""
fy_start = "{}-{}".format(str(year), settings.FY_START)
return datetime.strptime(fy_start, "%Y-%m-%d").date()
def _fy_end(self, year):
"""Return date of the end of financial year for the given year."""
fy_end = "{}-{}".format(str(year), settings.FY_END)
return datetime.strptime(fy_end, "%Y-%m-%d").date()
def _fy_does_wrap(self):
"""Return whether the financial year spans across two years."""
if self._fy_wraps is None:
start = "{}".format(settings.FY_START)
start = datetime.strptime(start, "%m-%d").date()
end = "{}".format(settings.FY_END)
end = datetime.strptime(end, "%m-%d").date()
self._fy_wraps = end < start
return self._fy_wraps
def _determine_fy(self, date):
"""Return a (query, display) financial year tuple of the given date."""
if self._fy_does_wrap():
fy_start = self._fy_start(date.year)
if date.date() >= fy_start:
query = "{}-{}".format(date.year, date.year + 1)
else:
query = "{}-{}".format(date.year - 1, date.year)
# To keep it simple we use the same string for both
# query parameter and the display.
return query, query
else:
query = "{0}-{0}".format(date.year)
display = "{}".format(date.year)
return query, display
def lookups(self, request, model_admin):
if not settings.FY_START or not settings.FY_END:
return None
r = []
for document in Document.objects.all():
r.append(self._determine_fy(document.created))
return sorted(set(r), key=lambda x: x[0], reverse=True)
def queryset(self, request, queryset):
if not self.value() or not settings.FY_START or not settings.FY_END:
return None
start, end = self.value().split("-")
return queryset.filter(created__gte=self._fy_start(start),
created__lte=self._fy_end(end))
class CommonAdmin(admin.ModelAdmin):
list_per_page = settings.PAPERLESS_LIST_PER_PAGE
class CorrespondentAdmin(CommonAdmin):
class CorrespondentAdmin(admin.ModelAdmin):
list_display = (
"name",
@ -90,7 +17,7 @@ class CorrespondentAdmin(CommonAdmin):
readonly_fields = ("slug",)
class TagAdmin(CommonAdmin):
class TagAdmin(admin.ModelAdmin):
list_display = (
"name",
@ -104,7 +31,7 @@ class TagAdmin(CommonAdmin):
readonly_fields = ("slug",)
class DocumentTypeAdmin(CommonAdmin):
class DocumentTypeAdmin(admin.ModelAdmin):
list_display = (
"name",
@ -116,7 +43,7 @@ class DocumentTypeAdmin(CommonAdmin):
readonly_fields = ("slug",)
class DocumentAdmin(CommonAdmin):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type",)
@ -125,8 +52,7 @@ class DocumentAdmin(CommonAdmin):
list_filter = (
"document_type",
"tags",
"correspondent",
FinancialYearFilter
"correspondent"
)
filter_horizontal = ("tags",)
@ -164,7 +90,7 @@ class DocumentAdmin(CommonAdmin):
return format_html("<{} {}/>", kind, attributes)
class LogAdmin(CommonAdmin):
class LogAdmin(admin.ModelAdmin):
list_display = ("created", "message", "level",)
list_filter = ("level", "created",)

View File

@ -16,12 +16,14 @@ class DocumentsConfig(AppConfig):
run_pre_consume_script,
run_post_consume_script,
cleanup_document_deletion,
set_log_entry
set_log_entry,
index_document
)
document_consumption_started.connect(run_pre_consume_script)
document_consumption_finished.connect(classify_document)
document_consumption_finished.connect(index_document)
document_consumption_finished.connect(add_inbox_tags)
document_consumption_finished.connect(set_log_entry)
document_consumption_finished.connect(run_post_consume_script)

View File

@ -239,7 +239,6 @@ class Consumer:
self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
document.set_filename(document.source_filename)
document.save()
self.log("info", "Completed")

View File

@ -5,6 +5,8 @@ from .models import Correspondent, Document, Tag, DocumentType
CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
INT_KWARGS = ["exact"]
DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"]
class CorrespondentFilterSet(FilterSet):
@ -36,7 +38,7 @@ class DocumentTypeFilterSet(FilterSet):
class DocumentFilterSet(FilterSet):
tags_empty = BooleanFilter(
is_tagged = BooleanFilter(
label="Is tagged",
field_name="tags",
lookup_expr="isnull",
@ -50,6 +52,12 @@ class DocumentFilterSet(FilterSet):
"title": CHAR_KWARGS,
"content": CHAR_KWARGS,
"archive_serial_number": INT_KWARGS,
"created": DATE_KWARGS,
"added": DATE_KWARGS,
"modified": DATE_KWARGS,
"correspondent__id": ID_KWARGS,
"correspondent__name": CHAR_KWARGS,
@ -57,6 +65,6 @@ class DocumentFilterSet(FilterSet):
"tags__name": CHAR_KWARGS,
"document_type__id": ID_KWARGS,
"document_type__name": CHAR_KWARGS
"document_type__name": CHAR_KWARGS,
}

104
src/documents/index.py Normal file
View File

@ -0,0 +1,104 @@
from collections import Iterable
from django.db import models
from django.dispatch import receiver
from whoosh.fields import Schema, TEXT, NUMERIC, DATETIME, KEYWORD
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import QueryParser
from whoosh.query import terms
from whoosh.writing import AsyncWriter
from documents.models import Document
from paperless import settings
class JsonFormatter(Formatter):
    """Highlight formatter producing JSON-serialisable structures.

    Instead of a markup string, every fragment is rendered as a list of
    dicts: text between matches becomes ``{'text': ...}`` and each matched
    token becomes ``{'text': ..., 'term': n}``, where ``n`` is a number
    handed out per distinct matched text in order of first appearance.
    """

    def __init__(self):
        # Maps matched token text to the term number assigned to it.
        self.seen = {}

    def format_token(self, text, token, replace=False):
        """Return the dict describing a single matched token."""
        token_text = self._text(get_text(text, token, replace))
        # Unknown texts receive the next free number; known ones keep theirs.
        term_number = self.seen.setdefault(token_text, len(self.seen))
        return {'text': token_text, 'term': term_number}

    def format_fragment(self, fragment, replace=False):
        """Return one fragment as a list of plain-text and token dicts."""
        cursor = fragment.startchar
        source = fragment.text
        pieces = []

        for match in fragment.matches:
            start = match.startchar
            # Ignore tokens without a position or starting before the cursor.
            if start is None or start < cursor:
                continue
            if start > cursor:
                pieces.append({'text': source[cursor:start]})
            pieces.append(self.format_token(source, match, replace))
            cursor = match.endchar

        # Trailing text after the last match, if any.
        if cursor < fragment.endchar:
            pieces.append({'text': source[cursor:fragment.endchar]})
        return pieces

    def format(self, fragments, replace=False):
        """Return a list with one formatted entry per fragment."""
        return [
            self.format_fragment(fragment, replace=replace)
            for fragment in fragments
        ]
def get_schema():
    """Build the whoosh schema of the document index.

    The schema consists of a unique, stored integer ``id`` and the stored
    text fields ``title`` and ``content``.
    """
    id_field = NUMERIC(stored=True, unique=True, numtype=int)
    title_field = TEXT(stored=True)
    content_field = TEXT(stored=True)
    return Schema(id=id_field, title=title_field, content=content_field)
def open_index(recreate=False):
    """Open the document index stored in ``settings.INDEX_DIR``.

    An existing index is opened unless ``recreate`` is true; otherwise a
    new, empty index with the schema from ``get_schema()`` is created in
    that directory.

    :param recreate: when true, always create a fresh index even if one
        already exists.
    :return: the opened or newly created whoosh index.
    """
    import os

    index_dir = settings.INDEX_DIR
    if not recreate and exists_in(index_dir):
        return open_dir(index_dir)

    # The whoosh documentation creates the target directory before calling
    # create_in(); do the same so a first run with a missing index directory
    # does not fail.
    os.makedirs(index_dir, exist_ok=True)
    return create_in(index_dir, get_schema())
def update_document(writer, doc):
    """Write the id, title and content of ``doc`` to the index via ``writer``.

    :param writer: object exposing ``update_document(**fields)``.
    :param doc: object with ``id``, ``title`` and ``content`` attributes.
    """
    fields = {
        "id": doc.id,
        "title": doc.title,
        "content": doc.content,
    }
    writer.update_document(**fields)
@receiver(models.signals.post_save, sender=Document)
def add_document_to_index(sender, instance, **kwargs):
    """Add or refresh ``instance`` in the document index.

    Registered as a ``post_save`` receiver for ``Document``.
    """
    with AsyncWriter(open_index()) as writer:
        update_document(writer, instance)
@receiver(models.signals.post_delete, sender=Document)
def remove_document_from_index(sender, instance, **kwargs):
    """Delete the index entry whose ``id`` term matches ``instance.id``.

    Registered as a ``post_delete`` receiver for ``Document``.
    """
    with AsyncWriter(open_index()) as writer:
        writer.delete_by_term('id', instance.id)
def query_index(ix, querystr):
    """Search the ``content`` field of ``ix`` for ``querystr``.

    The query is parsed with ``FuzzyTerm`` as term class and highlights are
    rendered with ``JsonFormatter``.

    :param ix: an opened index.
    :param querystr: the query string to parse.
    :return: a list with one dict per hit holding the keys ``id``,
        ``highlights``, ``score`` and ``title``.
    """
    with ix.searcher() as searcher:
        parser = QueryParser(
            "content", ix.schema, termclass=terms.FuzzyTerm)
        hits = searcher.search(parser.parse(querystr))
        hits.formatter = JsonFormatter()
        hits.fragmenter.surround = 50

        # The result list is assembled while the searcher is still open.
        found = []
        for hit in hits:
            found.append({
                'id': hit['id'],
                'highlights': hit.highlights("content"),
                'score': hit.score,
                'title': hit['title'],
            })
        return found

View File

@ -0,0 +1,27 @@
from django.core.management import BaseCommand
from whoosh.writing import AsyncWriter
import documents.index as index
from documents.mixins import Renderable
from documents.models import Document
class Command(Renderable, BaseCommand):
    """Management command that rebuilds the document index from scratch."""

    help = "Recreates the document index"

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """Recreate the index and write every ``Document`` into it."""
        self.verbosity = options["verbosity"]

        all_documents = Document.objects.all()
        fresh_index = index.open_index(recreate=True)

        with AsyncWriter(fresh_index) as writer:
            for doc in all_documents:
                index.update_document(writer, doc)

View File

@ -1,37 +0,0 @@
# Generated by Django 2.0.10 on 2019-04-26 18:57
from django.db import migrations, models
def set_filename(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
file_name = "{:07}.{}".format(doc.pk, doc.file_type)
if doc.storage_type == "gpg":
file_name += ".gpg"
# Set filename
doc.filename = file_name
# Save document
doc.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '0022_auto_20181007_1420'),
]
operations = [
migrations.AddField(
model_name='document',
name='filename',
field=models.FilePathField(default=None,
null=True,
editable=False,
help_text='Current filename in storage',
max_length=256),
),
migrations.RunPython(set_filename)
]

View File

@ -168,14 +168,6 @@ class Document(models.Model):
added = models.DateTimeField(
default=timezone.now, editable=False, db_index=True)
filename = models.FilePathField(
max_length=256,
editable=False,
default=None,
null=True,
help_text="Current filename in storage"
)
archive_serial_number = models.IntegerField(
blank=True,
null=True,
@ -197,125 +189,17 @@ class Document(models.Model):
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
def find_renamed_document(self, subdirectory=""):
suffix = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
suffix += ".gpg"
# Go up in the directory hierarchy and try to delete all directories
root = os.path.normpath(Document.filename_to_path(subdirectory))
for filename in os.listdir(root):
if filename.endswith(suffix):
return os.path.join(subdirectory, filename)
fullname = os.path.join(subdirectory, filename)
if os.path.isdir(Document.filename_to_path(fullname)):
return self.find_renamed_document(fullname)
return None
@property
def source_filename(self):
# Initial filename generation (for new documents)
if self.filename is None:
self.filename = self.generate_source_filename()
# Check if document is still available under filename
elif not os.path.isfile(Document.filename_to_path(self.filename)):
recovered_filename = self.find_renamed_document()
# If we have found the file so update the filename
if recovered_filename is not None:
logger = logging.getLogger(__name__)
logger.warning("Filename of document " + str(self.id) +
" has changed and was successfully updated")
self.filename = recovered_filename
# Remove all empty subdirectories from MEDIA_ROOT
Document.delete_all_empty_subdirectories(
Document.filename_to_path(""))
else:
logger = logging.getLogger(__name__)
logger.error("File of document " + str(self.id) + " has " +
"gone and could not be recovered")
return self.filename
@staticmethod
def many_to_dictionary(field):
# Converts ManyToManyField to dictionary by assuming, that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
if delimiter == -1:
delimiter = t.name.find('-')
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter+1:]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_source_filename(self):
# Create filename based on configured format
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
self.many_to_dictionary(self.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
correspondent=slugify(self.correspondent),
title=slugify(self.title),
created=slugify(self.created),
added=slugify(self.added),
tags=tags)
else:
path = ""
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
else:
filename = "%07i.%s" % (self.pk, self.file_type)
# Append .gpg for encrypted files
if self.storage_type == self.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def create_source_directory(self):
new_filename = self.generate_source_filename()
# Determine the full "target" path
dir_new = Document.filename_to_path(os.path.dirname(new_filename))
# Create new path
os.makedirs(dir_new, exist_ok=True)
@property
def source_path(self):
return Document.filename_to_path(self.source_filename)
file_name = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg"
@staticmethod
def filename_to_path(filename):
return os.path.join(
settings.MEDIA_ROOT,
"documents",
"originals",
filename
file_name
)
@property
@ -352,125 +236,6 @@ class Document(models.Model):
def thumbnail_url(self):
return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
def set_filename(self, filename):
if os.path.isfile(Document.filename_to_path(filename)):
self.filename = filename
@staticmethod
def try_delete_empty_directories(directory):
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(Document.filename_to_path(""))
while directory != root:
# Try to delete the current directory
try:
os.rmdir(directory)
except os.error:
# Directory not empty, no need to go further up
return
# Cut off actual directory and go one level up
directory, _ = os.path.split(directory)
directory = os.path.normpath(directory)
@staticmethod
def delete_all_empty_subdirectories(directory):
# Go through all folders and try to delete all directories
root = os.path.normpath(Document.filename_to_path(directory))
for filename in os.listdir(root):
fullname = os.path.join(directory, filename)
if not os.path.isdir(Document.filename_to_path(fullname)):
continue
# Go into subdirectory to see, if there is more to delete
Document.delete_all_empty_subdirectories(
os.path.join(directory, filename))
# Try to delete the directory
try:
os.rmdir(Document.filename_to_path(fullname))
continue
except os.error:
# Directory not empty, no need to go further up
continue
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename(sender, instance, **kwargs):
# Skip if document has not been saved yet
if instance.filename is None:
return
# Check is file exists and update filename otherwise
if not os.path.isfile(Document.filename_to_path(instance.filename)):
instance.filename = instance.source_filename
# Build the new filename
new_filename = instance.generate_source_filename()
# If the filename is the same, then nothing needs to be done
if instance.filename == new_filename:
return
# Determine the full "target" path
path_new = instance.filename_to_path(new_filename)
dir_new = instance.filename_to_path(os.path.dirname(new_filename))
# Create new path
instance.create_source_directory()
# Determine the full "current" path
path_current = instance.filename_to_path(instance.source_filename)
# Move file
try:
os.rename(path_current, path_new)
except PermissionError:
# Do not update filename in object
return
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.error("Renaming of document " + str(instance.id) + " failed " +
"as file " + instance.filename + " was no longer present")
return
# Delete empty directory
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
instance.filename = new_filename
# Save instance
# This will not cause a cascade of post_save signals, as next time
# nothing needs to be renamed
instance.save()
@receiver(models.signals.post_delete, sender=Document)
def delete_files(sender, instance, **kwargs):
if instance.filename is None:
return
# Remove the document
old_file = instance.filename_to_path(instance.filename)
try:
os.remove(old_file)
except FileNotFoundError:
logger = logging.getLogger(__name__)
logger.warning("Deleted document " + str(instance.id) + " but file " +
old_file + " was no longer present")
# And remove the directory (if applicable)
old_dir = os.path.dirname(instance.filename)
old_path = instance.filename_to_path(old_dir)
Document.try_delete_empty_directories(old_path)
class Log(models.Model):

View File

@ -93,8 +93,6 @@ class DocumentSerializer(serializers.ModelSerializer):
"modified",
"added",
"file_name",
"download_url",
"thumbnail_url",
"archive_serial_number"
)

View File

@ -9,6 +9,7 @@ from django.contrib.contenttypes.models import ContentType
from django.utils import timezone
from documents.classifier import DocumentClassifier
from .. import index
from ..models import Document, Tag
@ -16,9 +17,14 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
#TODO: global? really?
classifier = DocumentClassifier()
def index_document(sender, document=None, logging_group=None, **kwargs):
    """Forward ``document`` to ``index.add_document_to_index``.

    ``logging_group`` and any extra keyword arguments are accepted but
    not used.
    """
    index.add_document_to_index(sender=sender, instance=document)
def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier
try:

View File

@ -1,12 +1,13 @@
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.generic import DetailView, FormView, TemplateView
from django.http import HttpResponse
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django.conf import settings
from django.utils import cache
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.views import APIView
from paperless.db import GnuPG
from paperless.mixins import SessionOrBasicAuthMixin
from paperless.views import StandardPagination
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
@ -29,7 +30,7 @@ from .filters import (
DocumentTypeFilterSet
)
from .forms import UploadForm
import documents.index as index
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
CorrespondentSerializer,
@ -41,71 +42,7 @@ from .serialisers import (
class IndexView(TemplateView):
template_name = "documents/index.html"
class FetchView(SessionOrBasicAuthMixin, DetailView):
model = Document
def render_to_response(self, context, **response_kwargs):
"""
Override the default to return the unencrypted image/PDF as raw data.
"""
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
if self.kwargs["kind"] == "thumb":
response = HttpResponse(
self._get_raw_data(self.object.thumbnail_file),
content_type=content_types[Document.TYPE_PNG]
)
cache.patch_cache_control(response, max_age=31536000, private=True)
return response
response = HttpResponse(
self._get_raw_data(self.object.source_file),
content_type=content_types[self.object.file_type]
)
DISPOSITION = (
'inline' if settings.INLINE_DOC or self.kwargs["kind"] == 'preview'
else 'attachment'
)
response["Content-Disposition"] = '{}; filename="{}"'.format(
DISPOSITION, self.object.file_name)
return response
def _get_raw_data(self, file_handle):
if self.object.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
return file_handle
return GnuPG.decrypted(file_handle)
class PushView(SessionOrBasicAuthMixin, FormView):
"""
A crude REST-ish API for creating documents.
"""
form_class = UploadForm
def form_valid(self, form):
form.save()
return HttpResponse("1", status=202)
def form_invalid(self, form):
return HttpResponseBadRequest(str(form.errors))
template_name = "index.html"
class CorrespondentViewSet(ModelViewSet):
@ -155,7 +92,52 @@ class DocumentViewSet(RetrieveModelMixin,
filter_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "created", "modified", "added")
"id", "title", "correspondent__name", "created", "modified", "added", "archive_serial_number")
def file_response(self, pk, disposition):
    """
    Build an HTTP response carrying the source file of a document.

    :param pk: primary key of the document to serve.
    :param disposition: value placed in front of the file name in the
        ``Content-Disposition`` header (the callers in this view set use
        ``"inline"`` and ``"attachment"``).
    :return: an ``HttpResponse`` with the (decrypted, if necessary) file
        content, its MIME type and a ``Content-Disposition`` header.
    :raises Http404: if no document matches ``pk``.
    """
    # Imported locally so this method does not depend on the module's
    # import block. The DRF variant also turns a malformed pk into a 404.
    from rest_framework.generics import get_object_or_404

    #TODO: this should not be necessary here.
    content_types = {
        Document.TYPE_PDF: "application/pdf",
        Document.TYPE_PNG: "image/png",
        Document.TYPE_JPG: "image/jpeg",
        Document.TYPE_GIF: "image/gif",
        Document.TYPE_TIF: "image/tiff",
        Document.TYPE_CSV: "text/csv",
        Document.TYPE_MD: "text/markdown",
        Document.TYPE_TXT: "text/plain"
    }

    # A missing document is a client error (404), not a server error.
    doc = get_object_or_404(Document, id=pk)

    if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
        file_handle = doc.source_file
    else:
        file_handle = GnuPG.decrypted(doc.source_file)

    # Fall back to a generic binary type rather than failing with a KeyError
    # when the document has a file type that is not listed above.
    content_type = content_types.get(
        doc.file_type, "application/octet-stream")

    response = HttpResponse(file_handle, content_type=content_type)
    response["Content-Disposition"] = '{}; filename="{}"'.format(
        disposition, doc.file_name)
    return response
@action(methods=['post'], detail=False)
def post_document(self, request, pk=None):
    """
    Placeholder for the document upload endpoint.

    Always answers with HTTP 501 (Not Implemented): the request is
    understood but the functionality does not exist yet. A 500 would
    wrongly signal an unexpected server failure.
    """
    #TODO: implement document upload
    return Response("not implemented yet", status=501)
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
    """
    Serve the source file of document ``pk`` with an ``inline`` content
    disposition.
    """
    return self.file_response(pk, disposition="inline")
@action(methods=['get'], detail=True)
def thumb(self, request, pk=None):
    """
    Serve the PNG thumbnail of document ``pk``.

    The thumbnail is decrypted first when the document is not stored
    unencrypted. The response may be cached privately for 315360000
    seconds (ten years).

    :raises Http404: if no document matches ``pk``.
    """
    # Imported locally so this method does not depend on the module's
    # import block.
    from django.utils.cache import patch_cache_control
    from rest_framework.generics import get_object_or_404

    doc = get_object_or_404(Document, id=pk)

    # Thumbnails of encrypted documents must be decrypted just like the
    # source file; otherwise the client receives undecodable bytes.
    if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
        thumbnail = doc.thumbnail_file
    else:
        thumbnail = GnuPG.decrypted(doc.thumbnail_file)

    response = HttpResponse(thumbnail, content_type='image/png')

    # ``public=False`` would be rendered literally as the invalid directive
    # "public=False"; ``private=True`` yields the intended "private".
    patch_cache_control(response, private=True, max_age=315360000)
    return response
@action(methods=['get'], detail=True)
def download(self, request, pk=None):
    """
    Serve the source file of document ``pk`` with an ``attachment`` content
    disposition.
    """
    attachment_response = self.file_response(pk, disposition="attachment")
    return attachment_response
class LogViewSet(ReadOnlyModelViewSet):
@ -166,3 +148,17 @@ class LogViewSet(ReadOnlyModelViewSet):
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
ordering_fields = ("time",)
class SearchView(APIView):
    """
    Full-text search endpoint backed by the document index.

    ``GET ?query=<text>`` returns the index hits, each extended with a
    ``document`` key holding the serialized ``Document``. Without a
    ``query`` parameter an empty list is returned.
    """

    # Opened once when the class is defined and shared by every request.
    ix = index.open_index()

    def get(self, request, format=None):
        """
        Run the search given by the ``query`` parameter.

        Hits whose document can no longer be found in the database are left
        out of the result instead of failing the whole request.
        """
        if 'query' not in request.query_params:
            return Response([])

        query = request.query_params['query']
        results = []
        for r in index.query_index(self.ix, query):
            try:
                document = Document.objects.get(id=r['id'])
            except Document.DoesNotExist:
                # The hit refers to a document that is no longer in the
                # database; skip it.
                continue
            r['document'] = DocumentSerializer(document).data
            results.append(r)
        return Response(results)

11
src/paperless/auth.py Normal file
View File

@ -0,0 +1,11 @@
from rest_framework.authentication import TokenAuthentication
class QueryTokenAuthentication(TokenAuthentication):
    """
    Token authentication that takes the token from the ``auth_token`` query
    parameter.

    This authentication method is required to serve documents and thumbnails
    for the front end.
    https://stackoverflow.com/questions/29433416/token-in-query-string-with-django-rest-frameworks-tokenauthentication
    """

    def authenticate(self, request):
        """
        Authenticate with the ``auth_token`` query parameter.

        Returns ``None`` when the parameter is absent or when the request
        carries an ``Authorization`` header; otherwise returns the result of
        ``authenticate_credentials`` for the supplied token.
        """
        if ('auth_token' not in request.query_params
                or 'HTTP_AUTHORIZATION' in request.META):
            return None

        token = request.query_params.get('auth_token')
        return self.authenticate_credentials(token)

View File

@ -91,17 +91,13 @@ INSTALLED_APPS = [
REST_FRAMEWORK = {
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.BasicAuthentication',
'rest_framework.authentication.SessionAuthentication',
'rest_framework.authentication.TokenAuthentication',
'paperless.auth.QueryTokenAuthentication'
]
}
if os.getenv("PAPERLESS_INSTALLED_APPS"):
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'corsheaders.middleware.CorsMiddleware',
'django.middleware.common.CommonMiddleware',
@ -111,8 +107,7 @@ MIDDLEWARE = [
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
# Enable whitenoise compression and caching
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
# X_FRAME_OPTIONS = 'SAMEORIGIN'
# We allow CORS from localhost:8080
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080,http://localhost:4200").split(","))
@ -299,6 +294,8 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
# This is where Paperless will look for PDFs to index
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
INDEX_DIR = os.getenv('PAPERLESS_INDEX_DIR', os.path.join(BASE_DIR, "..", "index"))
# (This setting is ignored on Linux where inotify is used instead of a
# polling loop.)
# The number of seconds that Paperless will wait between checking
@ -323,17 +320,6 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
# Whether to display a selected document inline, or download it as attachment:
INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.
PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
# Specify the default date order (for autodetected dates)
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
@ -342,6 +328,3 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
# Each entry of the JSON list in PAPERLESS_FILENAME_PARSE_TRANSFORMS is an
# object with "pattern" and "repl" keys; it becomes a
# (compiled regex, replacement) pair. An unset variable yields an empty list.
FILENAME_PARSE_TRANSFORMS = [
    (re.compile(entry["pattern"]), entry["repl"])
    for entry in json.loads(
        os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")
    )
]
# Specify the filename format for out files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

View File

@ -1,49 +1,34 @@
from django.conf import settings
from django.conf.urls import include, static, url
from django.contrib import admin
from django.urls import reverse_lazy
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.authtoken import views
from rest_framework.routers import DefaultRouter
from paperless.views import FaviconView
from documents.views import (
CorrespondentViewSet,
DocumentViewSet,
FetchView,
LogViewSet,
PushView,
TagViewSet,
DocumentTypeViewSet
DocumentTypeViewSet,
SearchView,
IndexView
)
router = DefaultRouter()
router.register(r"correspondents", CorrespondentViewSet)
router.register(r"document_types", DocumentTypeViewSet)
router.register(r"documents", DocumentViewSet)
router.register(r"logs", LogViewSet)
router.register(r"tags", TagViewSet)
api_router = DefaultRouter()
api_router.register(r"correspondents", CorrespondentViewSet)
api_router.register(r"document_types", DocumentTypeViewSet)
api_router.register(r"documents", DocumentViewSet)
api_router.register(r"logs", LogViewSet)
api_router.register(r"tags", TagViewSet)
urlpatterns = [
# API
url(
r"^api/auth/",
include(
('rest_framework.urls', 'rest_framework'),
namespace="rest_framework")
),
url(r"^api/", include((router.urls, 'drf'), namespace="drf")),
# File downloads
url(
r"^fetch/(?P<kind>doc|thumb|preview)/(?P<pk>\d+)$",
FetchView.as_view(),
name="fetch"
),
# File uploads
url(r"^push$", csrf_exempt(PushView.as_view()), name="push"),
url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/token/", views.obtain_auth_token), url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
# Favicon
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
@ -51,9 +36,8 @@ urlpatterns = [
# The Django admin
url(r"admin/", admin.site.urls),
# Redirect / to /admin
url(r"^$", RedirectView.as_view(
permanent=True, url=reverse_lazy("admin:index"))),
# Root of the frontend
url(r".*", IndexView.as_view()),
] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)