From 85721f1d441ff9db85818c9b491092b58f3ac09e Mon Sep 17 00:00:00 2001
From: jonaswinkler
Date: Tue, 17 Nov 2020 18:38:52 +0100
Subject: [PATCH 01/52] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 416f0ca06..c1f5b14f8 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
[](https://travis-ci.org/jonaswinkler/paperless-ng)
[](https://paperless-ng.readthedocs.io/en/latest/?badge=latest)
[](https://hub.docker.com/r/jonaswinkler/paperless-ng)
+[](https://coveralls.io/github/jonaswinkler/paperless-ng?branch=master)
# Paperless-ng
From 2c9555015b7b4bcc58d93a7e5bf36ee88d596150 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 11:21:09 +0100
Subject: [PATCH 02/52] make the index dir if it does not exist.
---
src/documents/index.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/documents/index.py b/src/documents/index.py
index d46ccedaf..ad3a50010 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -1,6 +1,8 @@
import logging
+import os
from contextlib import contextmanager
+from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@@ -8,7 +10,6 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter
-from paperless import settings
logger = logging.getLogger(__name__)
@@ -69,6 +70,8 @@ def open_index(recreate=False):
# TODO: this is not thread safe. If 2 instances try to create the index
# at the same time, this fails. This currently prevents parallel
# tests.
+ if not os.path.isdir(settings.INDEX_DIR):
+ os.makedirs(settings.INDEX_DIR, exist_ok=True)
return create_in(settings.INDEX_DIR, get_schema())
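In short, this patch boils down to the following pattern (a minimal sketch reusing the names from the diff above, not the project's verbatim code): make sure the Whoosh index directory exists before opening or creating the index.

import os

from whoosh.index import create_in, exists_in, open_dir


def open_index(index_dir, schema, recreate=False):
    # Create the directory up front; exist_ok=True keeps this harmless if
    # another process created it in the meantime.
    os.makedirs(index_dir, exist_ok=True)

    if exists_in(index_dir) and not recreate:
        return open_dir(index_dir)

    # As the TODO above notes, two processes racing to create the index at
    # the same time can still collide here.
    return create_in(index_dir, schema)


# Usage sketch, with the names from the diff: open_index(settings.INDEX_DIR, get_schema())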
From 1255ecf86e2d83787a70f0d546cedf5fcd132f63 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 11:28:19 +0100
Subject: [PATCH 03/52] update dependencies.
---
Pipfile | 48 ++++++++++++++++++++++++------------------------
Pipfile.lock | 36 ++++++++++++++++++------------------
2 files changed, 42 insertions(+), 42 deletions(-)
diff --git a/Pipfile b/Pipfile
index 66d60845b..ad60e0905 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,43 +9,43 @@ verify_ssl = true
name = "piwheels"
[packages]
-django = "~=3.1"
-pillow = "*"
-dateparser = "~=0.7"
+dateparser = "~=0.7.6"
+django = "~=3.1.3"
django-cors-headers = "*"
-djangorestframework = "~=3.12"
-python-gnupg = "*"
-python-dotenv = "*"
-filemagic = "*"
-pyocr = "~=0.7"
+django-extensions = "*"
+django-filter = "~=2.4.0"
+django-q = "~=1.3.4"
+djangorestframework = "~=3.12.2"
+fuzzywuzzy = "*"
+gunicorn = "*"
+imap-tools = "*"
langdetect = "*"
pdftotext = "*"
-django-filter = "~=2.4"
-python-dateutil = "*"
-psycopg2-binary = "*"
-scikit-learn="~=0.23"
-whoosh="~=2.7"
-gunicorn = "*"
-whitenoise = "~=5.2"
-fuzzywuzzy = "*"
-python-Levenshtein = "*"
-django-extensions = "*"
-watchdog = "*"
pathvalidate = "*"
-django-q = "*"
+pillow = "*"
+pyocr = "~=0.7.2"
+python-gnupg = "*"
+python-dotenv = "*"
+python-dateutil = "*"
+python-Levenshtein = "*"
+python-magic = "*"
+psycopg2-binary = "*"
redis = "*"
-imap-tools = "*"
+scikit-learn="~=0.23.2"
+whitenoise = "~=5.2.0"
+watchdog = "*"
+whoosh="~=2.7.4"
[dev-packages]
coveralls = "*"
factory-boy = "*"
-sphinx = "~=3.3"
-tox = "*"
pycodestyle = "*"
pytest = "*"
pytest-cov = "*"
pytest-django = "*"
-pytest-sugar = "*"
pytest-env = "*"
+pytest-sugar = "*"
pytest-xdist = "*"
+sphinx = "~=3.3"
sphinx_rtd_theme = "*"
+tox = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 15a30e1c0..6ecca3c34 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "abc7e5f5a8d075d4b013ceafd06ca07f57e597f053d670f73449ba210511b114"
+ "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244"
},
"pipfile-spec": 6,
"requires": {},
@@ -105,14 +105,6 @@
"index": "pypi",
"version": "==3.12.2"
},
- "filemagic": {
- "hashes": [
- "sha256:b2fd77411975510e28673220c4b8868ed81b5eb5906339b6f4c233b32122d7d3",
- "sha256:e684359ef40820fe406f0ebc5bf8a78f89717bdb7fed688af68082d991d6dbf3"
- ],
- "index": "pypi",
- "version": "==1.6"
- },
"fuzzywuzzy": {
"hashes": [
"sha256:45016e92264780e58972dca1b3d939ac864b78437422beecebb3095f8efd00e8",
@@ -131,11 +123,11 @@
},
"imap-tools": {
"hashes": [
- "sha256:070929b8ec429c0aad94588a37a2962eed656a119ab61dcf91489f20fe983f5d",
- "sha256:6232cd43748741496446871e889eb137351fc7a7e7f4c7888cd8c0fa28e20cda"
+ "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
+ "sha256:a3ee1827dc4ff185b259b33d0238b091a87d489f63ee59959fcc81716456c602"
],
"index": "pypi",
- "version": "==0.31.0"
+ "version": "==0.32.0"
},
"joblib": {
"hashes": [
@@ -337,6 +329,14 @@
"index": "pypi",
"version": "==0.12.0"
},
+ "python-magic": {
+ "hashes": [
+ "sha256:356efa93c8899047d1eb7d3eb91e871ba2f5b1376edbaf4cc305e3c872207355",
+ "sha256:b757db2a5289ea3f1ced9e60f072965243ea43a2221430048fd8cacab17be0ce"
+ ],
+ "index": "pypi",
+ "version": "==0.4.18"
+ },
"pytz": {
"hashes": [
"sha256:3e6b7dd2d1e0a59084bcee14a17af60c5c562cdc16d828e8eba2e683d3a7e268",
@@ -617,11 +617,11 @@
},
"coveralls": {
"hashes": [
- "sha256:4430b862baabb3cf090d36d84d331966615e4288d8a8c5957e0fd456d0dd8bd6",
- "sha256:b3b60c17b03a0dee61952a91aed6f131e0b2ac8bd5da909389c53137811409e1"
+ "sha256:2301a19500b06649d2ec4f2858f9c69638d7699a4c63027c5d53daba666147cc",
+ "sha256:b990ba1f7bc4288e63340be0433698c1efe8217f78c689d254c2540af3d38617"
],
"index": "pypi",
- "version": "==2.1.2"
+ "version": "==2.2.0"
},
"distlib": {
"hashes": [
@@ -663,11 +663,11 @@
},
"faker": {
"hashes": [
- "sha256:4d038ba51ae5e0a956d79cadd684d856e5750bfd608b61dad1807f8f08b1da49",
- "sha256:f260f0375a44cd1e1a735c9b8c9b914304f607b5eef431d20e098c7c2f5b50a6"
+ "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81",
+ "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473"
],
"markers": "python_version >= '3.5'",
- "version": "==4.16.0"
+ "version": "==4.17.0"
},
"filelock": {
"hashes": [
From 28ea67f252289fc05479d04a0711735f21422ff4 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 11:28:30 +0100
Subject: [PATCH 04/52] removed some empty folders.
---
data/.keep | 0
data/index/.keep | 0
media/documents/.keep | 0
media/documents/originals/.keep | 0
media/documents/thumbnails/.keep | 0
5 files changed, 0 insertions(+), 0 deletions(-)
delete mode 100644 data/.keep
delete mode 100644 data/index/.keep
delete mode 100644 media/documents/.keep
delete mode 100644 media/documents/originals/.keep
delete mode 100644 media/documents/thumbnails/.keep
diff --git a/data/.keep b/data/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/data/index/.keep b/data/index/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/media/documents/.keep b/media/documents/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/media/documents/originals/.keep b/media/documents/originals/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/media/documents/thumbnails/.keep b/media/documents/thumbnails/.keep
deleted file mode 100644
index e69de29bb..000000000
From 8681cad77c412aef956e1f0b5883587213a2fc0a Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 11:29:34 +0100
Subject: [PATCH 05/52] add required packages to travis
---
.travis.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.travis.yml b/.travis.yml
index 10f2a4d73..2db24da87 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ python:
before_install:
- sudo apt-get update -qq
- - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
+ - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript
install:
- pip install --upgrade pipenv
From bd45a804a7dc1e1ec14de45f24134680b7545621 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 13:28:30 +0100
Subject: [PATCH 06/52] docs
---
docs/setup.rst | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/docs/setup.rst b/docs/setup.rst
index 71acfba42..0f5db1ae5 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -10,10 +10,10 @@ Go to the project page on GitHub and download the
`latest release <https://github.com/jonaswinkler/paperless-ng/releases>`_.
There are multiple options available.
-* Download the docker-compose files if you want to pull paperless from
+* Download the dockerfiles archive if you want to pull paperless from
Docker Hub.
-* Download the archive and extract it if you want to build the docker image
+* Download the dist archive and extract it if you want to build the docker image
yourself or want to install paperless without docker.
.. hint::
@@ -22,6 +22,15 @@ There are multiple options available.
is not to pull the entire git repository. Paperless-ng includes artifacts
that need to be compiled, and that's already done for you in the release.
+.. admonition:: Want to try out paperless-ng before migrating?
+
+ The release contains a file ``.env`` which sets the docker-compose project
+ name to "paperless". This is the same project name as before and instructs
+ docker-compose to reuse and upgrade your existing paperless volumes.
+
+ Just change the project name in that file to anything else and docker-compose
+ will create fresh volumes for you!
+
Overview of Paperless-ng
########################
From 41650f20f458482bd855c12288cb22c0385e5bdc Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 13:31:03 +0100
Subject: [PATCH 07/52] mime type handling
---
src/documents/admin.py | 5 +-
src/documents/consumer.py | 24 ++++-----
src/documents/file_handling.py | 4 +-
.../management/commands/document_exporter.py | 4 +-
src/documents/migrations/1003_mime_types.py | 50 +++++++++++++++++++
src/documents/models.py | 28 ++++-------
src/documents/parsers.py | 24 ++++++---
src/documents/serialisers.py | 2 +-
src/documents/tests/test_api.py | 12 ++---
src/documents/tests/test_consumer.py | 16 +++++-
src/documents/tests/test_document_model.py | 3 ++
src/documents/tests/test_file_handling.py | 34 ++++++-------
src/documents/tests/test_matchables.py | 2 +-
src/documents/tests/test_parsers.py | 18 +++++--
src/documents/views.py | 14 +-----
src/paperless_mail/mail.py | 4 +-
src/paperless_tesseract/signals.py | 15 ++----
src/paperless_tesseract/tests/test_signals.py | 36 -------------
src/paperless_text/signals.py | 14 ++----
19 files changed, 163 insertions(+), 146 deletions(-)
create mode 100644 src/documents/migrations/1003_mime_types.py
delete mode 100644 src/paperless_tesseract/tests/test_signals.py
diff --git a/src/documents/admin.py b/src/documents/admin.py
index 209ddff35..5b3975fda 100755
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
- readonly_fields = ("added", "file_type", "storage_type", "filename")
+ readonly_fields = ("added", "mime_type", "storage_type", "filename")
list_display = (
"title",
"created",
@@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
"correspondent",
"tags_",
"archive_serial_number",
- "document_type",
- "filename"
+ "document_type"
)
list_filter = (
"document_type",
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 3cd57796e..b8eb8cfca 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -2,8 +2,8 @@ import datetime
import hashlib
import logging
import os
-import re
+import magic
from django.conf import settings
from django.db import transaction
from django.utils import timezone
@@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class
+from .parsers import ParseError, get_parser_class_for_mime_type
from .signals import (
document_consumption_finished,
document_consumption_started
@@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
- def pre_check_regex(self):
- if not re.match(FileInfo.REGEXES["title"], self.filename):
- raise ConsumerError(
- "Filename {} does not seem to be safe to "
- "consume".format(self.filename))
-
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
- self.pre_check_regex()
self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class.
- parser_class = get_parser_class(self.filename)
+ mime_type = magic.from_file(self.path, mime=True)
+
+ parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else:
- self.log("debug", "Parser: {}".format(parser_class.__name__))
+ self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
# Notify all listeners that we're going to do some work.
@@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
# store the document.
document = self._store(
text=text,
- date=date
+ date=date,
+ mime_type=mime_type
)
# If we get here, it was successful. Proceed with post-consume
@@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
return document
- def _store(self, text, date):
+ def _store(self, text, date, mime_type):
# If someone gave us the original filename, use it instead of doc.
@@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
- file_type=file_info.extension,
+ mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py
index 024003118..06d4d2957 100644
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -91,9 +91,9 @@ def generate_filename(document):
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
- filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
+ filename = "%s-%07i%s" % (path, document.pk, document.file_type)
else:
- filename = "%07i.%s" % (document.pk, document.file_type)
+ filename = "%07i%s" % (document.pk, document.file_type)
# Append .gpg for encrypted files
if document.storage_type == document.STORAGE_TYPE_GPG:
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 971e6a829..441f1c475 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
tags = ",".join([t.slug for t in doc.tags.all()])
if tags:
- return "{} - {} - {} - {}.{}".format(
+ return "{} - {} - {} - {}{}".format(
created, doc.correspondent, doc.title, tags, doc.file_type)
- return "{} - {} - {}.{}".format(
+ return "{} - {} - {}{}".format(
created, doc.correspondent, doc.title, doc.file_type)
diff --git a/src/documents/migrations/1003_mime_types.py b/src/documents/migrations/1003_mime_types.py
new file mode 100644
index 000000000..4c73a4235
--- /dev/null
+++ b/src/documents/migrations/1003_mime_types.py
@@ -0,0 +1,50 @@
+# Generated by Django 3.1.3 on 2020-11-20 11:21
+import os
+
+import magic
+from django.conf import settings
+from django.db import migrations, models
+
+
+def source_path(self):
+ if self.filename:
+ fname = str(self.filename)
+ else:
+ fname = "{:07}.{}".format(self.pk, self.file_type)
+ if self.storage_type == self.STORAGE_TYPE_GPG:
+ fname += ".gpg"
+
+ return os.path.join(
+ settings.ORIGINALS_DIR,
+ fname
+ )
+
+
+def add_mime_types(apps, schema_editor):
+ Document = apps.get_model("documents", "Document")
+ documents = Document.objects.all()
+
+ for d in documents:
+ d.mime_type = magic.from_file(source_path(d), mime=True)
+ d.save()
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('documents', '1002_auto_20201111_1105'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='document',
+ name='mime_type',
+ field=models.CharField(default="-", editable=False, max_length=256),
+ preserve_default=False,
+ ),
+ migrations.RunPython(add_mime_types),
+ migrations.RemoveField(
+ model_name='document',
+ name='file_type',
+ ),
+ ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 4badd2d56..559c395e0 100755
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,6 +1,7 @@
# coding=utf-8
import logging
+import mimetypes
import os
import re
from collections import OrderedDict
@@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
class Document(models.Model):
- # TODO: why do we need an explicit list
- TYPE_PDF = "pdf"
- TYPE_PNG = "png"
- TYPE_JPG = "jpg"
- TYPE_GIF = "gif"
- TYPE_TIF = "tiff"
- TYPE_TXT = "txt"
- TYPE_CSV = "csv"
- TYPE_MD = "md"
- TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
- TYPE_TXT, TYPE_CSV, TYPE_MD)
-
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (
@@ -156,10 +145,9 @@ class Document(models.Model):
"primarily used for searching."
)
- file_type = models.CharField(
- max_length=4,
- editable=False,
- choices=tuple([(t, t.upper()) for t in TYPES])
+ mime_type = models.CharField(
+ max_length=256,
+ editable=False
)
tags = models.ManyToManyField(
@@ -223,7 +211,7 @@ class Document(models.Model):
if self.filename:
fname = str(self.filename)
else:
- fname = "{:07}.{}".format(self.pk, self.file_type)
+ fname = "{:07}{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg"
@@ -238,7 +226,11 @@ class Document(models.Model):
@property
def file_name(self):
- return slugify(str(self)) + "." + self.file_type
+ return slugify(str(self)) + self.file_type
+
+ @property
+ def file_type(self):
+ return mimetypes.guess_extension(str(self.mime_type))
@property
def thumbnail_path(self):
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 496efa188..98f4c5b12 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,6 +6,7 @@ import subprocess
import tempfile
import dateparser
+import magic
from django.conf import settings
from django.utils import timezone
@@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger(__name__)
-def get_parser_class(doc):
- """
- Determine the appropriate parser class based on the file
- """
+def is_mime_type_supported(mime_type):
+ return get_parser_class_for_mime_type(mime_type) is not None
+
+
+def get_parser_class_for_mime_type(mime_type):
options = []
@@ -48,9 +50,9 @@ def get_parser_class(doc):
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
- parser_test = parser_declaration["test"]
+ supported_mime_types = parser_declaration["mime_types"]
- if parser_test(doc):
+ if mime_type in supported_mime_types:
options.append(parser_declaration)
if not options:
@@ -61,6 +63,16 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
+def get_parser_class(path):
+ """
+ Determine the appropriate parser class based on the file
+ """
+
+ mime_type = magic.from_file(path, mime=True)
+
+ return get_parser_class_for_mime_type(mime_type)
+
+
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index e42e26881..cf48e8bd7 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
"document_type_id",
"title",
"content",
- "file_type",
+ "mime_type",
"tags",
"tags_id",
"checksum",
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index a049fb825..b0318d2b3 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
dt = DocumentType.objects.create(name="dt", pk=63)
tag = Tag.objects.create(name="t", pk=85)
- doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
+ doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
doc.tags.add(tag)
@@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
with open(filename, "wb") as f:
f.write(content)
- doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
+ doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
f.write(content_thumbnail)
@@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
def test_document_actions_not_existing_file(self):
- doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
+ doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 404)
@@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
def test_document_filters(self):
- doc1 = Document.objects.create(title="none1", checksum="A")
- doc2 = Document.objects.create(title="none2", checksum="B")
- doc3 = Document.objects.create(title="none3", checksum="C")
+ doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
+ doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
+ doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
tag_2 = Tag.objects.create(name="t2")
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index f61fd5718..a89bd75ae 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.")
+def fake_magic_from_file(file, mime=False):
+
+ if mime:
+ if os.path.splitext(file)[1] == ".pdf":
+ return "application/pdf"
+ else:
+ return "unknown"
+ else:
+ return "A verbose string that describes the contents of the file"
+
+
+@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
def make_dummy_parser(self, path, logging_group):
@@ -462,7 +474,7 @@ class TestConsumer(TestCase):
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
- "test": lambda _: True,
+ "mime_types": ["application/pdf"],
"weight": 0
})]
@@ -592,7 +604,7 @@ class TestConsumer(TestCase):
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
- "test": lambda _: True,
+ "mime_types": ["application/pdf"],
"weight": 0
})]
diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py
index 2da674527..5b27e2643 100644
--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@@ -13,9 +13,12 @@ class TestDocument(TestCase):
title="Title",
content="content",
checksum="checksum",
+ mime_type="application/pdf"
)
+
file_path = document.source_path
thumb_path = document.thumbnail_path
+
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
mock_unlink.assert_any_call(file_path)
diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py
index d44e5056a..5ffd35f61 100644
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -31,7 +31,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -44,7 +44,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -81,7 +81,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -111,10 +111,10 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):
- document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
+ document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.checksum = "BBBBB"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -149,7 +149,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -170,7 +170,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -179,7 +179,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -206,7 +206,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -222,7 +222,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -238,7 +238,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -254,7 +254,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -269,7 +269,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -284,7 +284,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -309,7 +309,7 @@ class TestDate(TestCase):
def test_format_none(self):
document = Document()
document.pk = 1
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -335,7 +335,7 @@ class TestDate(TestCase):
def test_invalid_format(self):
document = Document()
document.pk = 1
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -344,7 +344,7 @@ class TestDate(TestCase):
def test_invalid_format_key(self):
document = Document()
document.pk = 1
- document.file_type = "pdf"
+ document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py
index 93601b9d2..24e285ae7 100644
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
TestCase.setUp(self)
User.objects.create_user(username='test_consumer', password='12345')
self.doc_contains = Document.objects.create(
- content="I contain the keyword.", file_type="pdf")
+ content="I contain the keyword.", mime_type="application/pdf")
def test_tag_applied_any(self):
t1 = Tag.objects.create(
diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py
index 5896f3ba3..e99bb8dc6 100644
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,3 +1,4 @@
+import os
from tempfile import TemporaryDirectory
from unittest import mock
@@ -5,7 +6,18 @@ from django.test import TestCase
from documents.parsers import get_parser_class
+def fake_magic_from_file(file, mime=False):
+ if mime:
+ if os.path.splitext(file)[1] == ".pdf":
+ return "application/pdf"
+ else:
+ return "unknown"
+ else:
+ return "A verbose string that describes the contents of the file"
+
+
+@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
@@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
- (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
+ (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
)
self.assertEqual(
@@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
- (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
- (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
+ (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
+ (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
)
self.assertEqual(
diff --git a/src/documents/views.py b/src/documents/views.py
index f4c5d0797..89d03a4df 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
- # TODO: this should not be necessary here.
- content_types = {
- Document.TYPE_PDF: "application/pdf",
- Document.TYPE_PNG: "image/png",
- Document.TYPE_JPG: "image/jpeg",
- Document.TYPE_GIF: "image/gif",
- Document.TYPE_TIF: "image/tiff",
- Document.TYPE_CSV: "text/csv",
- Document.TYPE_MD: "text/markdown",
- Document.TYPE_TXT: "text/plain"
- }
-
doc = Document.objects.get(id=pk)
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
else:
file_handle = GnuPG.decrypted(doc.source_file)
- response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
+ response = HttpResponse(file_handle, content_type=doc.mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, doc.file_name)
return response
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index b942e420a..1aea65d90 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
from documents.loggers import LoggingMixin
from documents.models import Correspondent
+from documents.parsers import is_mime_type_supported
from paperless_mail.models import MailAccount, MailRule
@@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
title = get_title(message, att, rule)
- # TODO: check with parsers what files types are supported
- if att.content_type == 'application/pdf':
+ if is_mime_type_supported(att.content_type):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py
index 3fc6c2a11..712034038 100644
--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -1,5 +1,3 @@
-import re
-
from .parsers import RasterisedDocumentParser
@@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
- "test": tesseract_consumer_test
+ "mime_types": [
+ "application/pdf",
+ "image/jpeg",
+ "image/png"
+ ]
}
-
-
-MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
-
-
-def tesseract_consumer_test(doc):
- return MATCHING_FILES.match(doc.lower())
diff --git a/src/paperless_tesseract/tests/test_signals.py b/src/paperless_tesseract/tests/test_signals.py
deleted file mode 100644
index 354557732..000000000
--- a/src/paperless_tesseract/tests/test_signals.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from django.test import TestCase
-
-from paperless_tesseract.signals import tesseract_consumer_test
-
-
-class SignalsTestCase(TestCase):
-
- def test_test_handles_various_file_names_true(self):
-
- prefixes = (
- "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
- "A document with a . in it", "Doc with -- in it"
- )
- suffixes = (
- "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
- "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
- "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
- )
-
- for prefix in prefixes:
- for suffix in suffixes:
- name = "{}.{}".format(prefix, suffix)
- self.assertTrue(tesseract_consumer_test(name))
-
- def test_test_handles_various_file_names_false(self):
-
- prefixes = ("doc",)
- suffixes = ("txt", "markdown", "",)
-
- for prefix in prefixes:
- for suffix in suffixes:
- name = "{}.{}".format(prefix, suffix)
- self.assertFalse(tesseract_consumer_test(name))
-
- self.assertFalse(tesseract_consumer_test(""))
- self.assertFalse(tesseract_consumer_test("doc"))
diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py
index 784bfd45d..f9ac9ad23 100644
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,5 +1,3 @@
-import re
-
from .parsers import TextDocumentParser
@@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
- "test": text_consumer_test
+ "mime_types": [
+ "text/plain",
+ "text/comma-separated-values"
+ ]
}
-
-
-MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
-
-
-def text_consumer_test(doc):
- return MATCHING_FILES.match(doc.lower())
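Condensed, the parser selection introduced in this patch works like this (a sketch: the declaration keys "parser", "weight" and "mime_types" are taken from the diff above, while the standalone function signatures and the declarations list are illustrative, not the project's exact API):

import magic


def get_parser_class_for_mime_type(mime_type, declarations):
    """Pick the highest-weight parser that supports the detected mime type."""
    options = [d for d in declarations if mime_type in d["mime_types"]]
    if not options:
        return None
    return max(options, key=lambda d: d["weight"])["parser"]


def get_parser_class(path, declarations):
    # Sniff the type from the file contents instead of trusting the extension.
    mime_type = magic.from_file(path, mime=True)
    return get_parser_class_for_mime_type(mime_type, declarations)


# declarations would look like the signal handlers above, e.g.:
# [{"parser": RasterisedDocumentParser, "weight": 0,
#   "mime_types": ["application/pdf", "image/jpeg", "image/png"]},
#  {"parser": TextDocumentParser, "weight": 10,
#   "mime_types": ["text/plain", "text/comma-separated-values"]}]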
From 3d5b66c2b77f8653758d87d432e6d379f69a5399 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 16:18:59 +0100
Subject: [PATCH 08/52] FileInfo does not care about the extension anymore.
---
src/documents/consumer.py | 2 +-
src/documents/forms.py | 3 +-
src/documents/models.py | 64 ++++----
src/documents/tests/test_consumer.py | 213 +++++++++++----------------
4 files changed, 118 insertions(+), 164 deletions(-)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index b8eb8cfca..175f6710f 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -197,7 +197,7 @@ class Consumer(LoggingMixin):
# If someone gave us the original filename, use it instead of doc.
- file_info = FileInfo.from_path(self.filename)
+ file_info = FileInfo.from_filename(self.filename)
stats = os.stat(self.path)
diff --git a/src/documents/forms.py b/src/documents/forms.py
index 38a95a068..c3efc774f 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -34,8 +34,7 @@ class UploadForm(forms.Form):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
- # TODO: dont just append pdf. This is here for taht weird regex check at the start of the consumer.
- with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
+ with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
f.write(document)
os.utime(f.name, times=(t, t))
diff --git a/src/documents/models.py b/src/documents/models.py
index 559c395e0..6288980c5 100755
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -269,7 +269,7 @@ class Log(models.Model):
def __str__(self):
return self.message
-
+# TODO: why is this in the models file?
class FileInfo:
# This epic regex *almost* worked for our needs, so I'm keeping it here for
@@ -284,53 +284,44 @@ class FileInfo:
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
- # TODO: what is this used for
- formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P.*) - "
r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)"
- r"\.(?P{})$".format(formats),
+ r"(?P[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)"
- r"\.(?P{})$".format(formats),
+ r"(?P[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P.*) - "
- r"(?P.*)"
- r"\.(?P{})$".format(formats),
+ r"(?P.*)$",
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
- r"(?P.*)"
- r"\.(?P{})$".format(formats),
+ r"(?P.*)$",
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P.*) - "
r"(?P.*) - "
- r"(?P[a-z0-9\-,]*)"
- r"\.(?P{})$".format(formats),
+ r"(?P[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P.*) - "
- r"(?P.*)?"
- r"\.(?P{})$".format(formats),
+ r"(?P.*)?$",
flags=re.IGNORECASE
)),
("title", re.compile(
- r"(?P.*)"
- r"\.(?P{})$".format(formats),
+ r"(?P.*)$",
flags=re.IGNORECASE
))
])
@@ -373,15 +364,6 @@ class FileInfo:
)[0])
return tuple(r)
- @classmethod
- def _get_extension(cls, extension):
- r = extension.lower()
- if r == "jpeg":
- return "jpg"
- if r == "tif":
- return "tiff"
- return r
-
@classmethod
def _mangle_property(cls, properties, name):
if name in properties:
@@ -390,18 +372,16 @@ class FileInfo:
)
@classmethod
- def from_path(cls, path):
+ def from_filename(cls, filename):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
- " - - - ."
- " - - ."
- " - ."
- "."
+ " - - - "
+ " - - "
+ " - "
+ ""
"""
- filename = os.path.basename(path)
-
# Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -409,6 +389,23 @@ class FileInfo:
if count:
break
+ # do this after the transforms so that the transforms can do whatever
+ # with the file extension.
+ filename_no_ext = os.path.splitext(filename)[0]
+
+ if filename_no_ext == filename and filename.startswith("."):
+ # This is a very special case where there is no text before the
+ # file type.
+ # TODO: this should be handled better. The ext is not removed
+ # because usually, files like '.pdf' are just hidden files
+ # with the name pdf, but in our case, its more likely that
+ # there's just no name to begin with.
+ filename = ""
+ # This isn't too bad either, since we'll just not match anything
+ # and return an empty title. TODO: actually, this is kinda bad.
+ else:
+ filename = filename_no_ext
+
# Parse filename components.
for regex in cls.REGEXES.values():
m = regex.match(filename)
@@ -418,5 +415,4 @@ class FileInfo:
cls._mangle_property(properties, "correspondent")
cls._mangle_property(properties, "title")
cls._mangle_property(properties, "tags")
- cls._mangle_property(properties, "extension")
return cls(**properties)
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index a89bd75ae..6dab98d02 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -15,57 +15,42 @@ from ..parsers import DocumentParser, ParseError
class TestAttributes(TestCase):
TAGS = ("tag1", "tag2", "tag3")
- EXTENSIONS = (
- "pdf", "png", "jpg", "jpeg", "gif", "tiff", "tif",
- "PDF", "PNG", "JPG", "JPEG", "GIF", "TIFF", "TIF",
- "PdF", "PnG", "JpG", "JPeG", "GiF", "TiFf", "TiF",
- )
- def _test_guess_attributes_from_name(self, path, sender, title, tags):
+ def _test_guess_attributes_from_name(self, filename, sender, title, tags):
+ file_info = FileInfo.from_filename(filename)
- for extension in self.EXTENSIONS:
+ if sender:
+ self.assertEqual(file_info.correspondent.name, sender, filename)
+ else:
+ self.assertIsNone(file_info.correspondent, filename)
- f = path.format(extension)
- file_info = FileInfo.from_path(f)
+ self.assertEqual(file_info.title, title, filename)
- if sender:
- self.assertEqual(file_info.correspondent.name, sender, f)
- else:
- self.assertIsNone(file_info.correspondent, f)
-
- self.assertEqual(file_info.title, title, f)
-
- self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
- if extension.lower() == "jpeg":
- self.assertEqual(file_info.extension, "jpg", f)
- elif extension.lower() == "tif":
- self.assertEqual(file_info.extension, "tiff", f)
- else:
- self.assertEqual(file_info.extension, extension.lower(), f)
+ self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename)
def test_guess_attributes_from_name0(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Title.{}", "Sender", "Title", ())
+ "Sender - Title.pdf", "Sender", "Title", ())
def test_guess_attributes_from_name1(self):
self._test_guess_attributes_from_name(
- "/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ())
+ "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())
def test_guess_attributes_from_name2(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ())
+ "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())
def test_guess_attributes_from_name3(self):
self._test_guess_attributes_from_name(
- "/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ())
+ "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())
def test_guess_attributes_from_name4(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ())
+ "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())
def test_guess_attributes_from_name5(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Title - tag1,tag2,tag3.{}",
+ "Sender - Title - tag1,tag2,tag3.pdf",
"Sender",
"Title",
self.TAGS
@@ -73,7 +58,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name6(self):
self._test_guess_attributes_from_name(
- "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
+ "Spaced Sender - Title - tag1,tag2,tag3.pdf",
"Spaced Sender",
"Title",
self.TAGS
@@ -81,7 +66,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name7(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
+ "Sender - Spaced Title - tag1,tag2,tag3.pdf",
"Sender",
"Spaced Title",
self.TAGS
@@ -89,7 +74,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name8(self):
self._test_guess_attributes_from_name(
- "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
+ "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
"Dashed-Sender",
"Title",
self.TAGS
@@ -97,7 +82,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name9(self):
self._test_guess_attributes_from_name(
- "/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
+ "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
"Sender",
"Dashed-Title",
self.TAGS
@@ -105,7 +90,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name10(self):
self._test_guess_attributes_from_name(
- "/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
+ "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
"Σενδερ",
"Τιτλε",
self.TAGS
@@ -113,7 +98,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name_when_correspondent_empty(self):
self._test_guess_attributes_from_name(
- '/path/to/ - weird empty correspondent but should not break.{}',
+ ' - weird empty correspondent but should not break.pdf',
None,
'weird empty correspondent but should not break',
()
@@ -121,7 +106,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
self._test_guess_attributes_from_name(
- '/path/to/- weird but should not break.{}',
+ '- weird but should not break.pdf',
None,
'- weird but should not break',
()
@@ -129,7 +114,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
self._test_guess_attributes_from_name(
- '/path/to/weird but should not break -.{}',
+ 'weird but should not break -.pdf',
None,
'weird but should not break -',
()
@@ -137,7 +122,7 @@ class TestAttributes(TestCase):
def test_guess_attributes_from_name_when_title_is_empty(self):
self._test_guess_attributes_from_name(
- '/path/to/weird correspondent but should not break - .{}',
+ 'weird correspondent but should not break - .pdf',
'weird correspondent but should not break',
'',
()
@@ -149,11 +134,11 @@ class TestAttributes(TestCase):
:return:
"""
- path = "Title - Correspondent - tAg1,TAG2.pdf"
- self.assertEqual(len(FileInfo.from_path(path).tags), 2)
+ filename = "Title - Correspondent - tAg1,TAG2.pdf"
+ self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
path = "Title - Correspondent - tag1,tag2.pdf"
- self.assertEqual(len(FileInfo.from_path(path).tags), 2)
+ self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
self.assertEqual(Tag.objects.all().count(), 2)
@@ -173,13 +158,12 @@ class TestFieldPermutations(TestCase):
]
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
- valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]
def _test_guessed_attributes(self, filename, created=None,
correspondent=None, title=None,
- extension=None, tags=None):
+ tags=None):
- info = FileInfo.from_path(filename)
+ info = FileInfo.from_filename(filename)
# Created
if created is None:
@@ -207,68 +191,56 @@ class TestFieldPermutations(TestCase):
filename
)
- # Extension
- if extension == 'jpeg':
- extension = 'jpg'
- self.assertEqual(info.extension, extension, filename)
-
def test_just_title(self):
- template = '/path/to/{title}.{extension}'
+ template = '{title}.pdf'
for title in self.valid_titles:
- for extension in self.valid_extensions:
- spec = dict(title=title, extension=extension)
+ spec = dict(title=title)
+ filename = template.format(**spec)
+ self._test_guessed_attributes(filename, **spec)
+
+ def test_title_and_correspondent(self):
+ template = '{correspondent} - {title}.pdf'
+ for correspondent in self.valid_correspondents:
+ for title in self.valid_titles:
+ spec = dict(correspondent=correspondent, title=title)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
- def test_title_and_correspondent(self):
- template = '/path/to/{correspondent} - {title}.{extension}'
- for correspondent in self.valid_correspondents:
- for title in self.valid_titles:
- for extension in self.valid_extensions:
- spec = dict(correspondent=correspondent, title=title,
- extension=extension)
- filename = template.format(**spec)
- self._test_guessed_attributes(filename, **spec)
-
def test_title_and_correspondent_and_tags(self):
- template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
+ template = '{correspondent} - {title} - {tags}.pdf'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for tags in self.valid_tags:
- for extension in self.valid_extensions:
- spec = dict(correspondent=correspondent, title=title,
- tags=tags, extension=extension)
- filename = template.format(**spec)
- self._test_guessed_attributes(filename, **spec)
+ spec = dict(correspondent=correspondent, title=title,
+ tags=tags)
+ filename = template.format(**spec)
+ self._test_guessed_attributes(filename, **spec)
def test_created_and_correspondent_and_title_and_tags(self):
template = (
- "/path/to/{created} - "
+ "{created} - "
"{correspondent} - "
"{title} - "
- "{tags}"
- ".{extension}"
+ "{tags}.pdf"
)
for created in self.valid_dates:
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for tags in self.valid_tags:
- for extension in self.valid_extensions:
- spec = {
- "created": created,
- "correspondent": correspondent,
- "title": title,
- "tags": tags,
- "extension": extension
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
+ spec = {
+ "created": created,
+ "correspondent": correspondent,
+ "title": title,
+ "tags": tags,
+ }
+ self._test_guessed_attributes(
+ template.format(**spec), **spec)
def test_created_and_correspondent_and_title(self):
- template = "/path/to/{created} - {correspondent} - {title}.{extension}"
+ template = "{created} - {correspondent} - {title}.pdf"
for created in self.valid_dates:
for correspondent in self.valid_correspondents:
@@ -279,56 +251,50 @@ class TestFieldPermutations(TestCase):
if title.lower() == title:
continue
- for extension in self.valid_extensions:
- spec = {
- "created": created,
- "correspondent": correspondent,
- "title": title,
- "extension": extension
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
-
- def test_created_and_title(self):
-
- template = "/path/to/{created} - {title}.{extension}"
-
- for created in self.valid_dates:
- for title in self.valid_titles:
- for extension in self.valid_extensions:
spec = {
"created": created,
- "title": title,
- "extension": extension
+ "correspondent": correspondent,
+ "title": title
}
self._test_guessed_attributes(
template.format(**spec), **spec)
+ def test_created_and_title(self):
+
+ template = "{created} - {title}.pdf"
+
+ for created in self.valid_dates:
+ for title in self.valid_titles:
+ spec = {
+ "created": created,
+ "title": title
+ }
+ self._test_guessed_attributes(
+ template.format(**spec), **spec)
+
def test_created_and_title_and_tags(self):
- template = "/path/to/{created} - {title} - {tags}.{extension}"
+ template = "{created} - {title} - {tags}.pdf"
for created in self.valid_dates:
for title in self.valid_titles:
for tags in self.valid_tags:
- for extension in self.valid_extensions:
- spec = {
- "created": created,
- "title": title,
- "tags": tags,
- "extension": extension
- }
- self._test_guessed_attributes(
- template.format(**spec), **spec)
+ spec = {
+ "created": created,
+ "title": title,
+ "tags": tags
+ }
+ self._test_guessed_attributes(
+ template.format(**spec), **spec)
def test_invalid_date_format(self):
- info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
+ info = FileInfo.from_filename("06112017Z - title.pdf")
self.assertEqual(info.title, "title")
self.assertIsNone(info.created)
def test_filename_parse_transforms(self):
- path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
+ filename = "tag1,tag2_20190908_180610_0001.pdf"
all_patt = re.compile("^.*$")
none_patt = re.compile("$a")
exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
@@ -336,50 +302,44 @@ class TestFieldPermutations(TestCase):
repl2 = "\\2Z - " + repl1 # creation date + repl1
# No transformations configured (= default)
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
- self.assertEqual(info.extension, "pdf")
self.assertEqual(info.tags, ())
self.assertIsNone(info.created)
# Pattern doesn't match (filename unaltered)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
- self.assertEqual(info.extension, "pdf")
# Simple transformation (match all)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
- self.assertEqual(info.extension, "gif")
# Multiple transformations configured (first pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(all_patt, "all.gif"),
(all_patt, "anotherall.gif")]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
- self.assertEqual(info.extension, "gif")
# Multiple transformations configured (second pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(none_patt, "none.gif"),
(all_patt, "anotherall.gif")]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "anotherall")
- self.assertEqual(info.extension, "gif")
# Complex transformation without date in replacement string
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "0001")
- self.assertEqual(info.extension, "pdf")
self.assertEqual(len(info.tags), 2)
self.assertEqual(info.tags[0].slug, "tag1")
self.assertEqual(info.tags[1].slug, "tag2")
@@ -392,9 +352,8 @@ class TestFieldPermutations(TestCase):
(exact_patt, repl2), # <-- matches
(exact_patt, repl1),
(all_patt, "all.gif")]):
- info = FileInfo.from_path(path)
+ info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "0001")
- self.assertEqual(info.extension, "pdf")
self.assertEqual(len(info.tags), 2)
self.assertEqual(info.tags[0].slug, "tag1")
self.assertEqual(info.tags[1].slug, "tag2")
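The extension handling that replaces the removed regex suffixes can be summarized as follows (a sketch; strip_extension is a hypothetical helper name, the real logic lives inline in FileInfo.from_filename above):

import os


def strip_extension(filename):
    filename_no_ext = os.path.splitext(filename)[0]
    if filename_no_ext == filename and filename.startswith("."):
        # A name like ".pdf" is treated as having no title at all rather
        # than as a hidden file called "pdf" (see the TODO in the patch).
        return ""
    return filename_no_ext


# The REGEXES are then matched against the extension-less name:
# strip_extension("Sender - Title - tag1,tag2.pdf") -> "Sender - Title - tag1,tag2"
# strip_extension(".pdf")                           -> ""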
From 09acb134b77fd15cf95c6d7416013811314ed8da Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 18:14:42 +0100
Subject: [PATCH 09/52] updated mail: now uses mime type detection
---
src/paperless_mail/mail.py | 22 +++++-
src/paperless_mail/tests/test_mail.py | 96 ++++++++++++++++++---------
2 files changed, 85 insertions(+), 33 deletions(-)
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index 1aea65d90..6db5e9070 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -2,6 +2,7 @@ import os
import tempfile
from datetime import timedelta, date
+import magic
from django.conf import settings
from django.utils.text import slugify
from django_q.tasks import async_task
@@ -248,9 +249,21 @@ class MailAccountHandler(LoggingMixin):
for att in message.attachments:
+ if not att.content_disposition == "attachment":
+ self.log(
+ 'debug',
+ f"Rule {rule.account}.{rule}: "
+ f"Skipping attachment {att.filename} "
+ f"with content disposition inline")
+ continue
+
title = get_title(message, att, rule)
- if is_mime_type_supported(att.content_type):
+ # don't trust the content type of the attachment. Could be
+ # generic application/octet-stream.
+ mime_type = magic.from_buffer(att.payload, mime=True)
+
+ if is_mime_type_supported(mime_type):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
@@ -275,5 +288,12 @@ class MailAccountHandler(LoggingMixin):
)
processed_attachments += 1
+ else:
+ self.log(
+ 'debug',
+ f"Rule {rule.account}.{rule}: "
+ f"Skipping attachment {att.filename} "
+ f"since guessed mime type {mime_type} is not supported "
+ f"by paperless")
return processed_attachments
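Condensed, the attachment filtering above amounts to the following (a sketch; is_mime_type_supported comes from documents.parsers as introduced earlier in this series, and should_consume is a hypothetical helper name):

import magic

from documents.parsers import is_mime_type_supported


def should_consume(attachment):
    # Skip inline parts such as signatures or embedded images.
    if attachment.content_disposition != "attachment":
        return False
    # Don't trust the declared content type; it is often a generic
    # application/octet-stream. Sniff the payload instead.
    mime_type = magic.from_buffer(attachment.payload, mime=True)
    return is_mime_type_supported(mime_type)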
diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py
index a3404b774..17d7119a0 100644
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@@ -99,11 +99,7 @@ def create_message(num_attachments=1, body="", subject="the suject", from_="noon
message.from_ = from_
message.body = body
for i in range(num_attachments):
- attachment = namedtuple('Attachment', [])
- attachment.filename = 'some_file.pdf'
- attachment.content_type = 'application/pdf'
- attachment.payload = b'content of the attachment'
- message.attachments.append(attachment)
+ message.attachments.append(create_attachment(filename=f"file_{i}.pdf"))
message.seen = seen
message.flagged = flagged
@@ -111,6 +107,26 @@ def create_message(num_attachments=1, body="", subject="the suject", from_="noon
return message
+def create_attachment(filename="the_file.pdf", content_disposition="attachment", payload=b"a PDF document"):
+ attachment = namedtuple('Attachment', [])
+ attachment.filename = filename
+ attachment.content_disposition = content_disposition
+ attachment.payload = payload
+ return attachment
+
+
+def fake_magic_from_buffer(buffer, mime=False):
+
+ if mime:
+ if 'PDF' in str(buffer):
+ return 'application/pdf'
+ else:
+ return 'unknown/type'
+ else:
+ return 'Some verbose file description'
+
+
+@mock.patch('paperless_mail.mail.magic.from_buffer', fake_magic_from_buffer)
class TestMail(TestCase):
def setUp(self):
@@ -182,26 +198,7 @@ class TestMail(TestCase):
self.assertEqual(get_title(message, att, rule), "the message title")
def test_handle_message(self):
- message = namedtuple('MailMessage', [])
- message.subject = "the message title"
- message.from_ = "Myself"
-
- att = namedtuple('Attachment', [])
- att.filename = "test1.pdf"
- att.content_type = 'application/pdf'
- att.payload = b"attachment contents"
-
- att2 = namedtuple('Attachment', [])
- att2.filename = "test2.pdf"
- att2.content_type = 'application/pdf'
- att2.payload = b"attachment contents"
-
- att3 = namedtuple('Attachment', [])
- att3.filename = "test3.pdf"
- att3.content_type = 'application/invalid'
- att3.payload = b"attachment contents"
-
- message.attachments = [att, att2, att3]
+ message = create_message(subject="the message title", from_="Myself", num_attachments=2)
account = MailAccount()
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
@@ -215,14 +212,13 @@ class TestMail(TestCase):
args1, kwargs1 = self.async_task.call_args_list[0]
args2, kwargs2 = self.async_task.call_args_list[1]
- self.assertEqual(kwargs1['override_title'], "test1")
- self.assertEqual(kwargs1['override_filename'], "test1.pdf")
+ self.assertEqual(kwargs1['override_title'], "file_0")
+ self.assertEqual(kwargs1['override_filename'], "file_0.pdf")
- self.assertEqual(kwargs2['override_title'], "test2")
- self.assertEqual(kwargs2['override_filename'], "test2.pdf")
+ self.assertEqual(kwargs2['override_title'], "file_1")
+ self.assertEqual(kwargs2['override_filename'], "file_1.pdf")
- @mock.patch("paperless_mail.mail.async_task")
- def test_handle_empty_message(self, m):
+ def test_handle_empty_message(self):
message = namedtuple('MailMessage', [])
message.attachments = []
@@ -230,9 +226,45 @@ class TestMail(TestCase):
result = self.mail_account_handler.handle_message(message, rule)
- self.assertFalse(m.called)
+ self.assertFalse(self.async_task.called)
self.assertEqual(result, 0)
+ def test_handle_unknown_mime_type(self):
+ message = create_message()
+ message.attachments = [
+ create_attachment(filename="f1.pdf"),
+ create_attachment(filename="f2.json", payload=b"{'much': 'payload.', 'so': 'json', 'wow': true}")
+ ]
+
+ account = MailAccount()
+ rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
+
+ result = self.mail_account_handler.handle_message(message, rule)
+
+ self.assertEqual(result, 1)
+ self.assertEqual(self.async_task.call_count, 1)
+
+ args, kwargs = self.async_task.call_args
+ self.assertEqual(kwargs['override_filename'], "f1.pdf")
+
+ def test_handle_disposition(self):
+ message = create_message()
+ message.attachments = [
+ create_attachment(filename="f1.pdf", content_disposition='inline'),
+ create_attachment(filename="f2.pdf", content_disposition='attachment')
+ ]
+
+ account = MailAccount()
+ rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
+
+ result = self.mail_account_handler.handle_message(message, rule)
+
+ self.assertEqual(result, 1)
+ self.assertEqual(self.async_task.call_count, 1)
+
+ args, kwargs = self.async_task.call_args
+ self.assertEqual(kwargs['override_filename'], "f2.pdf")
+
def test_handle_mail_account_mark_read(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
From 321adb5df25161ed5d258d1c5aafef0fb26d4520 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 18:45:37 +0100
Subject: [PATCH 10/52] making the migration reversible
---
src/documents/migrations/1003_mime_types.py | 29 ++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/src/documents/migrations/1003_mime_types.py b/src/documents/migrations/1003_mime_types.py
index 4c73a4235..1038d57b3 100644
--- a/src/documents/migrations/1003_mime_types.py
+++ b/src/documents/migrations/1003_mime_types.py
@@ -1,4 +1,5 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
+import mimetypes
import os
import magic
@@ -29,6 +30,15 @@ def add_mime_types(apps, schema_editor):
d.save()
+def add_file_extensions(apps, schema_editor):
+ Document = apps.get_model("documents", "Document")
+ documents = Document.objects.all()
+
+ for d in documents:
+ d.file_type = os.path.splitext(d.filename)[1].strip('.')
+ d.save()
+
+
class Migration(migrations.Migration):
dependencies = [
@@ -42,7 +52,24 @@ class Migration(migrations.Migration):
field=models.CharField(default="-", editable=False, max_length=256),
preserve_default=False,
),
- migrations.RunPython(add_mime_types),
+ migrations.RunPython(add_mime_types, migrations.RunPython.noop),
+
+ # This operation is here so that we can revert the entire migration:
+ # By allowing this field to be blank and null, we can revert the
+ # remove operation further down and the database won't complain about
+ # NOT NULL violations.
+ migrations.AlterField(
+ model_name='document',
+ name='file_type',
+ field=models.CharField(
+ choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')],
+ editable=False,
+ max_length=4,
+ null=True,
+ blank=True
+ ),
+ ),
+ migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
migrations.RemoveField(
model_name='document',
name='file_type',
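The pattern above pairs each destructive step with an inverse so the whole migration can be unapplied. A rough, self-contained sketch of the RunPython pairing (model names, dependency and function bodies are placeholders, not the actual paperless schema):

    from django.db import migrations

    def add_mime_types(apps, schema_editor):
        # forward step: would populate the new column (placeholder body)
        pass

    def add_file_extensions(apps, schema_editor):
        # reverse step: would restore the column that the forward path removes
        pass

    class Migration(migrations.Migration):
        dependencies = [("documents", "0002_example")]  # hypothetical dependency
        operations = [
            # Forward runs the data migration, reverse is a no-op ...
            migrations.RunPython(add_mime_types, migrations.RunPython.noop),
            # ... and vice versa, so "migrate" can walk backwards cleanly.
            migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
        ]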
From 77559332bc543216d0f1275c496304cab172f8a2 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Fri, 20 Nov 2020 18:45:44 +0100
Subject: [PATCH 11/52] docs
---
docs/faq.rst | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/docs/faq.rst b/docs/faq.rst
index 747ffaf53..6cfa4d36f 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -21,6 +21,17 @@ is
files around manually. This folder is meant to be entirely managed by docker
and paperless.
+**Q:** *What file types does paperless-ng support?*
+
+**A:** Currently, the following files are supported:
+
+* PDF documents, PNG images and JPEG images are processed with OCR.
+* Plain text documents are supported as well and are added verbatim
+ to paperless.
+
+Paperless determines the type of a file by inspecting its content. The
+file extensions do not matter.
+
**Q:** *Will paperless-ng run on Raspberry Pi?*
**A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
From b7fec4d3551cb95b84eb7de5b555f4ae8370c022 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 01:42:55 +0100
Subject: [PATCH 12/52] using mime type checking during upload
---
src/documents/forms.py | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/src/documents/forms.py b/src/documents/forms.py
index c3efc774f..f44090164 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -3,22 +3,35 @@ import tempfile
from datetime import datetime
from time import mktime
+import magic
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError
+from documents.parsers import is_mime_type_supported
+
class UploadForm(forms.Form):
document = forms.FileField()
def clean_document(self):
+ document_name = self.cleaned_data.get("document").name
+
try:
- validate_filename(self.cleaned_data.get("document").name)
+ validate_filename(document_name)
except ValidationError:
raise forms.ValidationError("That filename is suspicious.")
- return self.cleaned_data.get("document")
+
+ document_data = self.cleaned_data.get("document").read()
+
+ mime_type = magic.from_buffer(document_data, mime=True)
+
+ if not is_mime_type_supported(mime_type):
+ raise forms.ValidationError("This mime type is not supported.")
+
+ return document_name, document_data
def save(self):
"""
@@ -27,8 +40,7 @@ class UploadForm(forms.Form):
form do that as well. Think of it as a poor-man's queue server.
"""
- document = self.cleaned_data.get("document").read()
- original_filename = self.cleaned_data.get("document").name
+ original_filename, data = self.cleaned_data.get("document")
t = int(mktime(datetime.now().timetuple()))
@@ -36,7 +48,7 @@ class UploadForm(forms.Form):
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
- f.write(document)
+ f.write(data)
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
From 529cc04fd1712105b11796117d821500da57a44d Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 12:12:19 +0100
Subject: [PATCH 13/52] code cleanup
---
src/documents/models.py | 1 +
src/documents/tests/test_parsers.py | 1 +
2 files changed, 2 insertions(+)
diff --git a/src/documents/models.py b/src/documents/models.py
index 6288980c5..8e0435647 100755
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -269,6 +269,7 @@ class Log(models.Model):
def __str__(self):
return self.message
+
# TODO: why is this in the models file?
class FileInfo:
diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py
index e99bb8dc6..239203186 100644
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -6,6 +6,7 @@ from django.test import TestCase
from documents.parsers import get_parser_class
+
def fake_magic_from_file(file, mime=False):
if mime:
From 5a84cc835a1a23f857c7c38b883b44971173f7e8 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 13:05:17 +0100
Subject: [PATCH 14/52] updated release script
---
docs/changelog.rst | 13 +++++++++++++
scripts/make-release.sh | 9 ++++++---
2 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 0da528b60..86a24df27 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -5,6 +5,19 @@
Changelog
*********
+next
+####
+
+* Paperless now uses mime types and libmagic detection to determine
+ if a file type is supported and which parser to use. Removes all
+ file type checks that were present in MANY different places in
+ paperless.
+
+* Mail consumer now correctly consumes documents even when their
+ content type was not set correctly (e.g. PDF documents with
+ content type ``application/octet-stream``).
+
+
paperless-ng 0.9.1
##################
diff --git a/scripts/make-release.sh b/scripts/make-release.sh
index ef3e5769b..06548748b 100755
--- a/scripts/make-release.sh
+++ b/scripts/make-release.sh
@@ -17,6 +17,7 @@ PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
# output directory
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"
+PAPERLESS_DIST_DOCKERFILES="$PAPERLESS_DIST/paperless-ng-dockerfiles"
if [ -d "$PAPERLESS_DIST" ]
then
@@ -27,6 +28,7 @@ fi
mkdir "$PAPERLESS_DIST"
mkdir "$PAPERLESS_DIST_APP"
mkdir "$PAPERLESS_DIST_APP/docker"
+mkdir "$PAPERLESS_DIST_DOCKERFILES"
# setup dependencies.
@@ -78,9 +80,9 @@ cp "$PAPERLESS_ROOT/docker/local/"* "$PAPERLESS_DIST_APP"
cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST_APP"
# docker files for pulling from docker hub
-cp "$PAPERLESS_ROOT/docker/hub/"* "$PAPERLESS_DIST"
-cp "$PAPERLESS_ROOT/.env" "$PAPERLESS_DIST"
-cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST"
+cp "$PAPERLESS_ROOT/docker/hub/"* "$PAPERLESS_DIST_DOCKERFILES"
+cp "$PAPERLESS_ROOT/.env" "$PAPERLESS_DIST_DOCKERFILES"
+cp "$PAPERLESS_ROOT/docker/docker-compose.env" "$PAPERLESS_DIST_DOCKERFILES"
# auxiliary files required for the docker image
cp "$PAPERLESS_ROOT/docker/docker-entrypoint.sh" "$PAPERLESS_DIST_APP/docker/"
@@ -99,3 +101,4 @@ docker build . -t "jonaswinkler/paperless-ng:$VERSION"
cd "$PAPERLESS_DIST"
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
+tar -cJf "paperless-ng-$VERSION-dockerfiles.tar.xz" paperless-ng-dockerfiles/
From b44f8383e447089628673a7b222e0b6c1a9b5c15 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 14:03:45 +0100
Subject: [PATCH 15/52] code cleanup
---
src/documents/consumer.py | 14 +++--
src/documents/file_handling.py | 4 +-
src/documents/forms.py | 9 ++-
src/documents/index.py | 3 +-
.../management/commands/document_consumer.py | 15 +++--
.../management/commands/document_exporter.py | 4 +-
.../management/commands/document_importer.py | 2 +-
.../management/commands/document_retagger.py | 6 +-
src/documents/matching.py | 18 ++++--
src/documents/parsers.py | 30 +++++++---
src/documents/serialisers.py | 6 +-
src/documents/signals/handlers.py | 60 +++++++++++++------
src/documents/tests/test_checks.py | 8 ---
src/documents/views.py | 41 ++++++++++---
src/paperless_mail/mail.py | 14 +++--
src/paperless_mail/models.py | 12 ++--
src/paperless_mail/tasks.py | 3 +-
src/paperless_tesseract/parsers.py | 60 ++++++++++++-------
18 files changed, 208 insertions(+), 101 deletions(-)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 175f6710f..65febc937 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -104,9 +104,11 @@ class Consumer(LoggingMixin):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
- raise ConsumerError("No parsers abvailable for {}".format(self.filename))
+ raise ConsumerError(f"No parsers available for {self.filename}")
else:
- self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
+ self.log("debug",
+ f"Parser: {parser_class.__name__} "
+ f"based on mime type {mime_type}")
# Notify all listeners that we're going to do some work.
@@ -126,7 +128,7 @@ class Consumer(LoggingMixin):
# Parse the document. This may take some time.
try:
- self.log("debug", "Generating thumbnail for {}...".format(self.filename))
+ self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail()
self.log("debug", "Parsing {}...".format(self.filename))
text = document_parser.get_text()
@@ -244,10 +246,12 @@ class Consumer(LoggingMixin):
document.title = self.override_title
if self.override_correspondent_id:
- document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+ document.correspondent = Correspondent.objects.get(
+ pk=self.override_correspondent_id)
if self.override_document_type_id:
- document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+ document.document_type = DocumentType.objects.get(
+ pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:
diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py
index 06d4d2957..cd47406b6 100644
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -87,7 +87,9 @@ def generate_filename(document):
tags=tags,
)
except (ValueError, KeyError, IndexError):
- logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
+ logging.getLogger(__name__).warning(
+ f"Invalid PAPERLESS_FILENAME_FORMAT: "
+ f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
diff --git a/src/documents/forms.py b/src/documents/forms.py
index f44090164..0471a8312 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -46,9 +46,14 @@ class UploadForm(forms.Form):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
- with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
+ with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
+ dir=settings.SCRATCH_DIR,
+ delete=False) as f:
f.write(data)
os.utime(f.name, times=(t, t))
- async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
+ async_task("documents.tasks.consume_file",
+ f.name,
+ override_filename=original_filename,
+ task_name=os.path.basename(original_filename))
diff --git a/src/documents/index.py b/src/documents/index.py
index ad3a50010..cf312cbcc 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -120,6 +120,7 @@ def query_page(ix, query, page):
def autocomplete(ix, term, limit=10):
with ix.reader() as reader:
terms = []
- for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
+ for (score, t) in reader.most_distinctive_terms(
+ "content", number=limit, prefix=term.lower()):
terms.append(t)
return terms
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 2b8ac7100..70c36a03c 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
def _consume(self, file):
if os.path.isfile(file):
try:
- async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
+ async_task("documents.tasks.consume_file",
+ file,
+ task_name=os.path.basename(file))
except Exception as e:
# Catch all so that the consumer won't crash.
- logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
+ logging.getLogger(__name__).error(
+ "Error while consuming document: {}".format(e))
def on_created(self, event):
self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
# Consume all files as this is not done initially by the watchdog
for entry in os.scandir(directory):
if entry.is_file():
- async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
+ async_task("documents.tasks.consume_file",
+ entry.path,
+ task_name=os.path.basename(entry.path))
# Start the watchdog. Woof!
if settings.CONSUMER_POLLING > 0:
- logging.getLogger(__name__).info('Using polling instead of file'
- 'system notifications.')
+ logging.getLogger(__name__).info(
+ "Using polling instead of file system notifications.")
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
else:
observer = Observer()
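The consumer picks between the inotify-backed observer and a polling observer at runtime; polling is slower but works on network mounts where filesystem notifications never arrive. A minimal standalone sketch of that switch (the handler and directory are placeholders):

    from watchdog.events import FileSystemEventHandler
    from watchdog.observers import Observer
    from watchdog.observers.polling import PollingObserver

    class PrintHandler(FileSystemEventHandler):
        def on_created(self, event):
            print(f"new file: {event.src_path}")

    def start_watcher(directory: str, polling_interval: int = 0):
        # A polling interval > 0 selects the slower, filesystem-agnostic observer.
        if polling_interval > 0:
            observer = PollingObserver(timeout=polling_interval)
        else:
            observer = Observer()
        observer.schedule(PrintHandler(), directory, recursive=False)
        observer.start()
        return observer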
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 441f1c475..f86462119 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):
document = document_map[document_dict["pk"]]
- unique_filename = "{:07}_{}".format(document.pk, document.file_name)
+ unique_filename = f"{document.pk:07}_{document.file_name}"
file_target = os.path.join(self.target, unique_filename)
@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
document_dict[EXPORTER_FILE_NAME] = unique_filename
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
- print("Exporting: {}".format(file_target))
+ print(f"Exporting: {file_target}")
t = int(time.mktime(document.created.timetuple()))
if document.storage_type == Document.STORAGE_TYPE_GPG:
diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py
index da9086144..208a0ef37 100644
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
encrypted.write(GnuPG.encrypted(unencrypted))
else:
- print("Moving {} to {}".format(document_path, document.source_path))
+ print(f"Moving {document_path} to {document.source_path}")
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)
diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py
index e48b8802c..cf014dc6f 100755
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
try:
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
- logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
+ logging.getLogger(__name__).warning(
+ f"Cannot classify documents: {e}.")
classifier = None
for document in documents:
logging.getLogger(__name__).info(
- "Processing document {}".format(document.title)
- )
+ f"Processing document {document.title}")
if options['correspondent']:
set_correspondent(
diff --git a/src/documents/matching.py b/src/documents/matching.py
index e5789ab2e..ae1a9a9cf 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag
def match_correspondents(document_content, classifier):
- correspondents = Correspondent.objects.all()
- predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
+ if classifier:
+ pred_id = classifier.predict_correspondent(document_content)
+ else:
+ pred_id = None
- return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
+ correspondents = Correspondent.objects.all()
+ return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
def match_document_types(document_content, classifier):
- document_types = DocumentType.objects.all()
- predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
+ if classifier:
+ pred_id = classifier.predict_document_type(document_content)
+ else:
+ pred_id = None
- return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
+ document_types = DocumentType.objects.all()
+ return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
def match_tags(document_content, classifier):
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 98f4c5b12..eb8ccf45e 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -73,7 +73,18 @@ def get_parser_class(path):
return get_parser_class_for_mime_type(mime_type)
-def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+def run_convert(input_file,
+ output_file,
+ density=None,
+ scale=None,
+ alpha=None,
+ strip=False,
+ trim=False,
+ type=None,
+ depth=None,
+ extra=None,
+ logging_group=None):
+
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
pnm_out)
- logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+ logger.debug(f"Execute: {' '.join(command_args)}",
+ extra={'group': logging_group})
- if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
- raise ParseError("Unpaper failed at {}".format(command_args))
+ if not subprocess.Popen(command_args,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL).wait() == 0:
+ raise ParseError(f"Unpaper failed at {command_args}")
return pnm_out
@@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin):
super().__init__()
self.logging_group = logging_group
self.document_path = path
- self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+ self.tempdir = tempfile.mkdtemp(
+ prefix="paperless-", dir=settings.SCRATCH_DIR)
def get_thumbnail(self):
"""
@@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin):
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
- args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+ args = (settings.OPTIPNG_BINARY,
+ "-silent", "-o5", in_path, "-out", out_path)
- self.log('debug', 'Execute: ' + " ".join(args))
+ self.log('debug', f"Execute: {' '.join(args)}")
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index cf48e8bd7..e0ad73a23 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
class DocumentSerializer(serializers.ModelSerializer):
- correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
+ correspondent_id = CorrespondentField(
+ allow_null=True, source='correspondent')
tags_id = TagsField(many=True, source='tags')
- document_type_id = DocumentTypeField(allow_null=True, source='document_type')
+ document_type_id = DocumentTypeField(
+ allow_null=True, source='document_type')
class Meta:
model = Document
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index 671cdb104..f83f88783 100755
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
document.tags.add(*inbox_tags)
-def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_correspondent(sender,
+ document=None,
+ logging_group=None,
+ classifier=None,
+ replace=False,
+ use_first=True,
+ **kwargs):
if document.correspondent and not replace:
return
- potential_correspondents = matching.match_correspondents(document.content, classifier)
+ potential_correspondents = matching.match_correspondents(document.content,
+ classifier)
potential_count = len(potential_correspondents)
if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
selected = None
if potential_count > 1:
if use_first:
- message = "Detected {} potential correspondents, so we've opted for {}"
logger(
- message.format(potential_count, selected),
+ f"Detected {potential_count} potential correspondents, "
+ f"so we've opted for {selected}",
logging_group
)
else:
- message = "Detected {} potential correspondents, not assigning any correspondent"
logger(
- message.format(potential_count),
+ f"Detected {potential_count} potential correspondents, "
+ f"not assigning any correspondent",
logging_group
)
return
if selected or replace:
logger(
- 'Assigning correspondent "{}" to "{}" '.format(selected, document),
+ f"Assigning correspondent {selected} to {document}",
logging_group
)
@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
document.save(update_fields=("correspondent",))
-def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_document_type(sender,
+ document=None,
+ logging_group=None,
+ classifier=None,
+ replace=False,
+ use_first=True,
+ **kwargs):
if document.document_type and not replace:
return
- potential_document_type = matching.match_document_types(document.content, classifier)
+ potential_document_type = matching.match_document_types(document.content,
+ classifier)
potential_count = len(potential_document_type)
if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
if potential_count > 1:
if use_first:
- message = "Detected {} potential document types, so we've opted for {}"
logger(
- message.format(potential_count, selected),
+ f"Detected {potential_count} potential document types, "
+ f"so we've opted for {selected}",
logging_group
)
else:
- message = "Detected {} potential document types, not assigning any document type"
logger(
- message.format(potential_count),
+ f"Detected {potential_count} potential document types, "
+ f"not assigning any document type",
logging_group
)
return
if selected or replace:
logger(
- 'Assigning document type "{}" to "{}" '.format(selected, document),
+ f"Assigning document type {selected} to {document}",
logging_group
)
@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
document.save(update_fields=("document_type",))
-def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+def set_tags(sender,
+ document=None,
+ logging_group=None,
+ classifier=None,
+ replace=False,
+ **kwargs):
if replace:
document.tags.clear()
current_tags = set([])
else:
current_tags = set(document.tags.all())
- relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+ matched_tags = matching.match_tags(document.content, classifier)
+
+ relevant_tags = set(matched_tags) - current_tags
if not relevant_tags:
return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
- logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
+ logging.getLogger(__name__).fatal(
+ f"Document {str(instance)}: File {old_path} has gone.")
return
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
- logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
+ logging.getLogger(__name__).warning(
+ f"Document {str(instance)}: Cannot rename file "
+ f"since target path {new_path} already exists.")
return
create_source_path_directory(new_path)
diff --git a/src/documents/tests/test_checks.py b/src/documents/tests/test_checks.py
index d316f94b5..1027c11a0 100644
--- a/src/documents/tests/test_checks.py
+++ b/src/documents/tests/test_checks.py
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
def test_changed_password_check_no_encryption(self):
DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
self.assertEqual(changed_password_check(None), [])
-
- @unittest.skip("I don't know how to test this")
- def test_changed_password_check_gpg_encryption_with_good_password(self):
- pass
-
- @unittest.skip("I don't know how to test this")
- def test_changed_password_check_fail(self):
- pass
diff --git a/src/documents/views.py b/src/documents/views.py
index 89d03a4df..14323e933 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -47,18 +47,30 @@ class IndexView(TemplateView):
class CorrespondentViewSet(ModelViewSet):
model = Correspondent
- queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
+
+ queryset = Correspondent.objects.annotate(
+ document_count=Count('documents'),
+ last_correspondence=Max('documents__created')).order_by('name')
+
serializer_class = CorrespondentSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = CorrespondentFilterSet
- ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
+ ordering_fields = (
+ "name",
+ "matching_algorithm",
+ "match",
+ "document_count",
+ "last_correspondence")
class TagViewSet(ModelViewSet):
model = Tag
- queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
+
+ queryset = Tag.objects.annotate(
+ document_count=Count('documents')).order_by('name')
+
serializer_class = TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):
class DocumentTypeViewSet(ModelViewSet):
model = DocumentType
- queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
+
+ queryset = DocumentType.objects.annotate(
+ document_count=Count('documents')).order_by('name')
+
serializer_class = DocumentTypeSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
- "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+ "id",
+ "title",
+ "correspondent__name",
+ "document_type__name",
+ "created",
+ "modified",
+ "added",
+ "archive_serial_number")
def update(self, request, *args, **kwargs):
- response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+ response = super(DocumentViewSet, self).update(
+ request, *args, **kwargs)
index.add_or_update_document(self.get_object())
return response
@@ -138,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
@cache_control(public=False, max_age=315360000)
def thumb(self, request, pk=None):
try:
- return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
+ return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
+ content_type='image/png')
except FileNotFoundError:
raise Http404("Document thumbnail does not exist")
@@ -230,5 +254,6 @@ class StatisticsView(APIView):
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
- 'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
+ 'documents_inbox': Document.objects.filter(
+ tags__is_inbox_tag=True).distinct().count()
})
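The viewset querysets above pre-compute document_count and last_correspondence with a single annotated query, and ordering_fields exposes those annotations to the ?ordering= parameter. A small usage sketch of the same annotation, assuming a Django shell with the documents app loaded:

    from django.db.models import Count, Max
    from documents.models import Correspondent

    # One query yields name, document count and the date of the latest
    # document per row, so no per-correspondent lookups are needed.
    correspondents = (
        Correspondent.objects
        .annotate(document_count=Count("documents"),
                  last_correspondence=Max("documents__created"))
        .order_by("name"))

    for c in correspondents:
        print(c.name, c.document_count, c.last_correspondence)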
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index 6db5e9070..03f915769 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -174,8 +174,8 @@ class MailAccountHandler(LoggingMixin):
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
- f"Rule {rule.name}: Folder {rule.folder} does not exist "
- f"in account {account.name}")
+ f"Rule {rule.name}: Folder {rule.folder} "
+ f"does not exist in account {account.name}")
criterias = make_criterias(rule)
@@ -185,7 +185,8 @@ class MailAccountHandler(LoggingMixin):
f"{str(AND(**criterias))}")
try:
- messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
+ messages = M.fetch(criteria=AND(**criterias),
+ mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
@@ -226,8 +227,8 @@ class MailAccountHandler(LoggingMixin):
except Exception:
raise MailError(
- f"Rule {rule.name}: Error while processing post-consume "
- f"actions for account {account.name}")
+ f"Rule {rule.name}: Error while processing "
+ f"post-consume actions for account {account.name}")
return total_processed_files
@@ -266,7 +267,8 @@ class MailAccountHandler(LoggingMixin):
if is_mime_type_supported(mime_type):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
- _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
+ _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-",
+ dir=settings.SCRATCH_DIR)
with open(temp_filename, 'wb') as f:
f.write(att.payload)
diff --git a/src/paperless_mail/models.py b/src/paperless_mail/models.py
index e37fbee16..14da202fa 100644
--- a/src/paperless_mail/models.py
+++ b/src/paperless_mail/models.py
@@ -66,10 +66,14 @@ class MailRule(models.Model):
CORRESPONDENT_FROM_CUSTOM = 4
CORRESPONDENT_SELECTOR = (
- (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
- (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
- (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
- (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
+ (CORRESPONDENT_FROM_NOTHING,
+ "Do not assign a correspondent"),
+ (CORRESPONDENT_FROM_EMAIL,
+ "Use mail address"),
+ (CORRESPONDENT_FROM_NAME,
+ "Use name (or mail address if not available)"),
+ (CORRESPONDENT_FROM_CUSTOM,
+ "Use correspondent selected below")
)
name = models.CharField(max_length=256, unique=True)
diff --git a/src/paperless_mail/tasks.py b/src/paperless_mail/tasks.py
index 22d512c1e..e75711dce 100644
--- a/src/paperless_mail/tasks.py
+++ b/src/paperless_mail/tasks.py
@@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
- total_new_documents += MailAccountHandler().handle_mail_account(account)
+ total_new_documents += MailAccountHandler().handle_mail_account(
+ account)
if total_new_documents > 0:
return f"Added {total_new_documents} document(s)."
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 73b2414d5..d0ce01327 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
- self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
+ self.log(
+ 'warning',
+ "Thumbnail generation with ImageMagick failed, falling back "
+ "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
@@ -98,24 +101,38 @@ class RasterisedDocumentParser(DocumentParser):
try:
sample_page_index = int(len(images) / 2)
- self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
- sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
+ self.log(
+ "debug",
+ f"Attempting language detection on page "
+ f"{sample_page_index + 1} of {len(images)}...")
+
+ sample_page_text = self._ocr([images[sample_page_index]],
+ settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed.")
- ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+ ocr_pages = self._complete_ocr_default_language(
+ images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
- self.log("debug", "Detected language: {} (default language)".format(guessed_language))
- ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+ self.log(
+ "debug",
+ f"Detected language: {guessed_language} "
+ f"(default language)")
+ ocr_pages = self._complete_ocr_default_language(
+ images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
- self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
- ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+ self.log(
+ "warning",
+ f"Detected language {guessed_language} is not available "
+ f"on this system.")
+ ocr_pages = self._complete_ocr_default_language(
+ images, sample_page_index, sample_page_text)
else:
- self.log("debug", "Detected language: {}".format(guessed_language))
+ self.log("debug", f"Detected language: {guessed_language}")
ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("debug", "OCR completed.")
@@ -130,7 +147,9 @@ class RasterisedDocumentParser(DocumentParser):
Greyscale images are easier for Tesseract to OCR
"""
- self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
+ self.log(
+ "debug",
+ f"Converting document {self.document_path} into greyscale images")
# Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -148,7 +167,7 @@ class RasterisedDocumentParser(DocumentParser):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
- self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
+ self.log("debug", f"Running unpaper on {len(pnms)} pages...")
# Run unpaper in parallel on converted images
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@@ -161,26 +180,25 @@ class RasterisedDocumentParser(DocumentParser):
guess = langdetect.detect(text)
return guess
except Exception as e:
- self.log('warning', "Language detection failed with: {}".format(e))
+ self.log('warning', f"Language detection failed with: {e}")
return None
def _ocr(self, imgs, lang):
- self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
+ self.log(
+ "debug",
+ f"Performing OCR on {len(imgs)} page(s) with language {lang}")
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r
- def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
- """
- Given a `middle` value and the text that middle page represents, we OCR
- the remainder of the document and return the whole thing.
- """
- # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
- # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
+ def _complete_ocr_default_language(self,
+ images,
+ sample_page_index,
+ sample_page):
images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
- self.log('debug', 'Continuing ocr with default language.')
+ self.log('debug', "Continuing ocr with default language.")
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages
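The parser OCRs one sample page, guesses its language with langdetect, and only re-runs OCR in the guessed language when it differs from the configured default and is installed. A toy sketch of the langdetect step (ISO639 here is a tiny stand-in for the project's full mapping table):

    import langdetect

    # Minimal stand-in for the project's ISO639 mapping (2-letter -> tesseract code).
    ISO639 = {"en": "eng", "de": "deu"}

    def pick_ocr_language(sample_text: str, default: str = "eng") -> str:
        try:
            guess = langdetect.detect(sample_text)  # e.g. "de"
        except Exception:
            return default
        # Fall back to the default language when the guess is unknown.
        return ISO639.get(guess, default)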
From 450fb877f6214202240cd7429c2c94c0ed26562b Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 15:34:00 +0100
Subject: [PATCH 16/52] code cleanup
---
src/documents/classifier.py | 30 +++++++++++-------
src/documents/file_handling.py | 31 +++++++++---------
src/documents/matching.py | 51 ++++++++++++++++++++----------
src/paperless/auth.py | 2 +-
src/paperless_mail/mail.py | 4 +--
src/paperless_tesseract/parsers.py | 2 +-
6 files changed, 71 insertions(+), 49 deletions(-)
diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 1b70dcd6f..6e0d6f946 100755
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
FORMAT_VERSION = 5
def __init__(self):
- # mtime of the model file on disk. used to prevent reloading when nothing has changed.
+ # mtime of the model file on disk. used to prevent reloading when
+ # nothing has changed.
self.classifier_version = 0
- # hash of the training data. used to prevent re-training when the training data has not changed.
+ # hash of the training data. used to prevent re-training when the
+ # training data has not changed.
self.data_hash = None
self.data_vectorizer = None
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
schema_version = pickle.load(f)
if schema_version != self.FORMAT_VERSION:
- raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.")
+ raise IncompatibleClassifierVersionError(
+ "Cannor load classifier, incompatible versions.")
else:
if self.classifier_version > 0:
- logger.info("Classifier updated on disk, reloading classifier models")
+ logger.info("Classifier updated on disk, "
+ "reloading classifier models")
self.data_hash = pickle.load(f)
self.data_vectorizer = pickle.load(f)
self.tags_binarizer = pickle.load(f)
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
# Step 1: Extract and preprocess training data from the database.
logging.getLogger(__name__).debug("Gathering data from database...")
m = hashlib.sha1()
- for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
+ for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): # NOQA: E501
preprocessed_content = preprocess_content(doc.content)
m.update(preprocessed_content.encode('utf-8'))
data.append(preprocessed_content)
y = -1
- if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
- y = doc.document_type.pk
+ dt = doc.document_type
+ if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
+ y = dt.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_document_type.append(y)
y = -1
- if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
- y = doc.correspondent.pk
+ cor = doc.correspondent
+ if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
+ y = cor.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
@@ -145,7 +151,7 @@ class DocumentClassifier(object):
# Step 3: train the classifiers
if num_tags > 0:
logging.getLogger(__name__).debug("Training tags classifier...")
- self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
+ self.tags_classifier = MLPClassifier(tol=0.01)
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
logging.getLogger(__name__).debug(
"Training correspondent classifier..."
)
- self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
+ self.correspondent_classifier = MLPClassifier(tol=0.01)
self.correspondent_classifier.fit(
data_vectorized,
labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
logging.getLogger(__name__).debug(
"Training document type classifier..."
)
- self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
+ self.document_type_classifier = MLPClassifier(tol=0.01)
self.document_type_classifier.fit(
data_vectorized,
labels_document_type
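The comments above describe two guards: the model file's mtime to skip reloading, and a SHA-1 over the preprocessed training data to skip retraining when nothing changed. A stripped-down sketch of the hashing guard (the preprocessing and the stored hash are placeholders):

    import hashlib

    def training_data_hash(contents, labels):
        # Hash every document's text plus its label; identical input data
        # produces an identical digest, so retraining can be skipped.
        m = hashlib.sha1()
        for text, label in zip(contents, labels):
            m.update(text.encode("utf-8"))
            m.update(int(label).to_bytes(4, "little", signed=True))
        return m.hexdigest()

    def needs_retraining(contents, labels, stored_hash):
        return training_data_hash(contents, labels) != stored_hash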
diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py
index cd47406b6..ee7e9b761 100644
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -65,25 +65,24 @@ def many_to_dictionary(field):
return mydictionary
-def generate_filename(document):
- # Create filename based on configured format
+def generate_filename(doc):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
- many_to_dictionary(document.tags))
+ many_to_dictionary(doc.tags))
path = settings.PAPERLESS_FILENAME_FORMAT.format(
- correspondent=slugify(document.correspondent),
- title=slugify(document.title),
- created=slugify(document.created),
- created_year=document.created.year if document.created else "none",
- created_month=document.created.month if document.created else "none",
- created_day=document.created.day if document.created else "none",
- added=slugify(document.added),
- added_year=document.added.year if document.added else "none",
- added_month=document.added.month if document.added else "none",
- added_day=document.added.day if document.added else "none",
+ correspondent=slugify(doc.correspondent),
+ title=slugify(doc.title),
+ created=slugify(doc.created),
+ created_year=doc.created.year if doc.created else "none",
+ created_month=doc.created.month if doc.created else "none",
+ created_day=doc.created.day if doc.created else "none",
+ added=slugify(doc.added),
+ added_year=doc.added.year if doc.added else "none",
+ added_month=doc.added.month if doc.added else "none",
+ added_day=doc.added.day if doc.added else "none",
tags=tags,
)
except (ValueError, KeyError, IndexError):
@@ -93,12 +92,12 @@ def generate_filename(document):
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
- filename = "%s-%07i%s" % (path, document.pk, document.file_type)
+ filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
else:
- filename = "%07i%s" % (document.pk, document.file_type)
+ filename = "%07i%s" % (doc.pk, doc.file_type)
# Append .gpg for encrypted files
- if document.storage_type == document.STORAGE_TYPE_GPG:
+ if doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
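generate_filename fills PAPERLESS_FILENAME_FORMAT with str.format(); the defaultdict means a tag placeholder that matches none of the document's tags slugifies to "none" instead of raising KeyError. A small standalone illustration with a made-up format string:

    from collections import defaultdict
    from django.utils.text import slugify

    fmt = "{correspondent}/{tags[project]}/{title}"  # example format string

    tags = defaultdict(lambda: slugify(None), {"type": "invoice"})
    path = fmt.format(
        correspondent=slugify("ACME Corp"),
        title=slugify("November invoice"),
        tags=tags,
    )
    # "project" is not among the document's tags, so the defaultdict falls
    # back to slugify(None) == "none":
    # -> "acme-corp/none/november-invoice"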
diff --git a/src/documents/matching.py b/src/documents/matching.py
index ae1a9a9cf..212698ad3 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -12,7 +12,10 @@ def match_correspondents(document_content, classifier):
pred_id = None
correspondents = Correspondent.objects.all()
- return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
+
+ return list(filter(
+ lambda o: matches(o, document_content) or o.pk == pred_id,
+ correspondents))
def match_document_types(document_content, classifier):
@@ -22,15 +25,23 @@ def match_document_types(document_content, classifier):
pred_id = None
document_types = DocumentType.objects.all()
- return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
+
+ return list(filter(
+ lambda o: matches(o, document_content) or o.pk == pred_id,
+ document_types))
def match_tags(document_content, classifier):
- objects = Tag.objects.all()
- predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
+ if classifier:
+ predicted_tag_ids = classifier.predict_tags(document_content)
+ else:
+ predicted_tag_ids = []
- matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
- return matched_tags
+ tags = Tag.objects.all()
+
+ return list(filter(
+ lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+ tags))
def matches(matching_model, document_content):
@@ -48,39 +59,45 @@ def matches(matching_model, document_content):
if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
for word in _split_match(matching_model):
search_result = re.search(
- r"\b{}\b".format(word), document_content, **search_kwargs)
+ rf"\b{word}\b", document_content, **search_kwargs)
if not search_result:
return False
return True
- if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+ elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
for word in _split_match(matching_model):
- if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+ if re.search(rf"\b{word}\b", document_content, **search_kwargs):
return True
return False
- if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+ elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
return bool(re.search(
- r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+ rf"\b{matching_model.match}\b",
+ document_content,
+ **search_kwargs
+ ))
- if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+ elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
return bool(re.search(
- re.compile(matching_model.match, **search_kwargs), document_content))
+ re.compile(matching_model.match, **search_kwargs),
+ document_content
+ ))
- if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+ elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', matching_model.match)
text = re.sub(r'[^\w\s]', '', document_content)
if matching_model.is_insensitive:
match = match.lower()
text = text.lower()
- return True if fuzz.partial_ratio(match, text) >= 90 else False
+ return fuzz.partial_ratio(match, text) >= 90
- if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+ elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
# this is done elsewhere.
return False
- raise NotImplementedError("Unsupported matching algorithm")
+ else:
+ raise NotImplementedError("Unsupported matching algorithm")
def _split_match(matching_model):
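The matching algorithms boil down to word-boundary regex checks for MATCH_ALL/ANY/LITERAL and a fuzzywuzzy partial-ratio threshold for MATCH_FUZZY. A compressed sketch of the two building blocks; the 90 threshold mirrors the code above, while re.escape is added here for safety and is not in the original:

    import re
    from fuzzywuzzy import fuzz

    def word_matches(word, content, insensitive=True):
        flags = re.IGNORECASE if insensitive else 0
        # \b keeps "tax" from matching inside "taxi".
        return bool(re.search(rf"\b{re.escape(word)}\b", content, flags))

    def fuzzy_matches(match, content):
        # Strip punctuation, lowercase, then accept anything scoring >= 90.
        match = re.sub(r"[^\w\s]", "", match).lower()
        content = re.sub(r"[^\w\s]", "", content).lower()
        return fuzz.partial_ratio(match, content) >= 90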
diff --git a/src/paperless/auth.py b/src/paperless/auth.py
index ecd697f0e..83279ef36 100644
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
"""
def authenticate(self, request):
- if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
+ if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'): # NOQA: E501
user = User.objects.filter(is_staff=True).first()
print("Auto-Login with user {}".format(user))
return (user, None)
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index 03f915769..9d0397f24 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -283,8 +283,8 @@ class MailAccountHandler(LoggingMixin):
path=temp_filename,
override_filename=att.filename,
override_title=title,
- override_correspondent_id=correspondent.id if correspondent else None,
- override_document_type_id=doc_type.id if doc_type else None,
+ override_correspondent_id=correspondent.id if correspondent else None, # NOQA: E501
+ override_document_type_id=doc_type.id if doc_type else None, # NOQA: E501
override_tag_ids=[tag.id] if tag else None,
task_name=f"Mail: {att.filename}"
)
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index d0ce01327..c9e77486e 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -123,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
- elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
+ elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501
self.log(
"warning",
f"Detected language {guessed_language} is not available "
From db4519a64433262d8df1c9586a9dc01e1708ff9e Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 15:34:30 +0100
Subject: [PATCH 17/52] url patterns cleanup
---
src/paperless/urls.py | 63 ++++++++++++++++++++++++++-----------------
1 file changed, 38 insertions(+), 25 deletions(-)
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 220e6402c..dd5e6a379 100755
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -28,43 +28,56 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [
+ re_path(r"^api/", include([
+ re_path(r"^auth/",
+ include(('rest_framework.urls', 'rest_framework'),
+ namespace="rest_framework")),
- # API
- re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
- re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
- re_path(r"^api/search/", SearchView.as_view(), name="search"),
- re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
- re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
+ re_path(r"^search/autocomplete/",
+ SearchAutoCompleteView.as_view(),
+ name="autocomplete"),
+
+ re_path(r"^search/",
+ SearchView.as_view(),
+ name="search"),
+
+ re_path(r"^statistics/",
+ StatisticsView.as_view(),
+ name="statistics"),
+
+ ] + api_router.urls)),
- # Favicon
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
- # The Django admin
re_path(r"admin/", admin.site.urls),
- # These redirects are here to support clients that use the old FetchView.
- re_path(
- r"^fetch/doc/(?P\d+)$",
- RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
- ),
- re_path(
- r"^fetch/thumb/(?P\d+)$",
- RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
- ),
- re_path(
- r"^fetch/preview/(?P\d+)$",
- RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
- ),
- re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
+ re_path(r"^fetch/", include([
+ re_path(
+ r"^doc/(?P\d+)$",
+ RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
+ ),
+ re_path(
+ r"^thumb/(?P\d+)$",
+ RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
+ ),
+ re_path(
+ r"^preview/(?P\d+)$",
+ RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
+ ),
+ ])),
- # Frontend assets TODO: this is pretty bad.
- path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
+ re_path(r"^push$", csrf_exempt(
+ RedirectView.as_view(url='/api/documents/post_document/'))),
+ # Frontend assets TODO: this is pretty bad, but it works.
+ path('assets/',
+ RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
+
+ # login, logout
path('accounts/', include('django.contrib.auth.urls')),
# Root of the Frontend
re_path(r".*", login_required(IndexView.as_view())),
-
]
# Text in each page's <h1> (and above login form).
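The refactor above leans on Django's ability to pass a plain list of patterns to ``include()`` so that a whole group of routes shares one prefix. A minimal, self-contained sketch of that technique (the view and route names are hypothetical, not paperless's actual URL configuration):

.. code:: python

   # Group several routes under a common /api/ prefix with a nested include().
   from django.http import JsonResponse
   from django.urls import include, re_path

   def ping(request):
       return JsonResponse({"status": "ok"})

   urlpatterns = [
       re_path(r"^api/", include([
           re_path(r"^ping/$", ping, name="ping"),      # resolves to /api/ping/
           re_path(r"^status/$", ping, name="status"),  # resolves to /api/status/
       ])),
   ]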
From 110c5c392cd1c50113b74f9dfa4ae06733a95974 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 16:07:28 +0100
Subject: [PATCH 18/52] added tests to pycodestyle ignore for now. 79
characters really doesn't work there and I don't really care enough.
---
src/setup.cfg | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/setup.cfg b/src/setup.cfg
index c0f80d964..4b0a216f5 100644
--- a/src/setup.cfg
+++ b/src/setup.cfg
@@ -1,6 +1,5 @@
[pycodestyle]
-exclude = migrations, paperless/settings.py, .tox
-ignore = E501
+exclude = migrations, paperless/settings.py, .tox, */tests/*
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
From 3afee66aaa1a06db46aea97c29aae4e4f68b8713 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 20:14:48 +0100
Subject: [PATCH 19/52] updated entrypoint script to wait for postgres
---
docker/docker-entrypoint.sh | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
index c6e0d1cab..dfa7cfc65 100644
--- a/docker/docker-entrypoint.sh
+++ b/docker/docker-entrypoint.sh
@@ -15,8 +15,42 @@ map_uidgid() {
fi
}
+
+wait_for_postgres() {
+ attempt_num=1
+ max_attempts=5
+
+ echo "Waiting for PostgreSQL to start..."
+
+ host="${PAPERLESS_DBHOST}"
+
+ while !
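The function waits for the server at ``PAPERLESS_DBHOST``, retrying for a limited number of attempts before giving up. As an illustration of the same retry-until-reachable idea, a minimal Python sketch (illustrative only, not the shell code from this patch; the connection check and the default port of 5432 are assumptions):

.. code:: python

   # Illustrative only: poll a TCP port until the database accepts connections.
   import socket
   import sys
   import time

   def wait_for_postgres(host, port=5432, max_attempts=5, delay=5):
       for attempt in range(1, max_attempts + 1):
           try:
               with socket.create_connection((host, port), timeout=5):
                   print("PostgreSQL is up.")
                   return
           except OSError:
               print(f"Attempt {attempt} failed, retrying in {delay} seconds...")
               time.sleep(delay)
       sys.exit("Unable to connect to the database.")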
From: Jonas Winkler
Date: Sat, 21 Nov 2020 20:29:30 +0100
Subject: [PATCH 20/52] updated docs
---
docs/index.rst | 3 ++
docs/setup.rst | 96 +++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 86 insertions(+), 13 deletions(-)
diff --git a/docs/index.rst b/docs/index.rst
index 756fee3b1..a9142a682 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -44,6 +44,9 @@ resources in the documentation:
that's fully tested and production ready.
* See :ref:`this note ` about GnuPG encryption in
paperless-ng.
+* Paperless is now integrated with a
+ :ref:`task processing queue <setup-task_processor>` that tells you
+ at a glance when and why something is not working.
* The :ref:`changelog ` contains a detailed list of all changes
in paperless-ng.
diff --git a/docs/setup.rst b/docs/setup.rst
index 0f5db1ae5..5520f5594 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -66,6 +66,8 @@ Paperless consists of the following components:
$ cd /path/to/paperless/src/
$ pipenv run python3 manage.py document_consumer
+ .. _setup-task_processor:
+
* **The task processor:** Paperless relies on `Django Q `_
for doing much of the heavy lifting. This is a task queue that accepts tasks from
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
@@ -86,7 +88,8 @@ Paperless consists of the following components:
a modern multicore system, consumption with full ocr is blazing fast.
The task processor comes with a built-in admin interface that you can use to see whenever any of the
- tasks fail and inspect the errors.
+ tasks fail and inspect the errors (e.g., wrong email credentials or errors
+ while consuming a specific file).
You may start the task processor by executing:
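Django Q, described above, accepts work programmatically and hands it off to the worker cluster for parallel processing. A minimal sketch of that API, assuming the ``django_q`` package inside a configured Django project (illustrative; this is not how paperless enqueues its own consumption tasks):

.. code:: python

   # Minimal sketch: enqueue a function call and fetch its result later.
   from django_q.tasks import async_task, result

   def add(a, b):
       return a + b

   # Queued immediately; a running worker cluster picks it up and executes it.
   task_id = async_task(add, 2, 3)

   # Returns 5 once a worker has processed the task, or None before that.
   print(result(task_id))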
@@ -249,15 +252,21 @@ Migration to paperless-ng is then performed in a few simple steps:
.. caution::
- Make sure you also download the ``.env`` file. This will set the
- project name for docker compose to ``paperless`` and then it will
- automatically reuse your existing paperless volumes.
+ The release includes a ``.env`` file. This will set the
+ project name for docker compose to ``paperless`` so that paperless-ng will
+ automatically reuse your existing paperless volumes. When you start it, it
+ will migrate your existing data. After that, your old paperless installation
+ will be incompatible with the migrated volumes.
-4. Adjust ``docker-compose.yml`` and
+4. Copy the ``docker-compose.sqlite.yml`` file to ``docker-compose.yml``.
+ If you want to migrate to PostgreSQL, do that after you have migrated your existing
+ SQLite database.
+
+5. Adjust ``docker-compose.yml`` and
``docker-compose.env`` to your needs.
- See `docker route`_ for details on which edits are required.
+ See `docker route`_ for details on which edits are advised.
-5. Start paperless-ng.
+6. Start paperless-ng.
.. code:: bash
@@ -273,19 +282,80 @@ Migration to paperless-ng is then performed in a few simple steps:
This will run paperless in the background and automatically start it on system boot.
-6. Paperless installed a permanent redirect to ``admin/`` in your browser. This
+7. Paperless installed a permanent redirect to ``admin/`` in your browser. This
redirect is still in place and prevents access to the new UI. Clear
- everything related to paperless in your browsers data in order to fix
- this issue.
+ your browser's cache in order to fix this.
+
+8. Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
.. _setup-sqlite_to_psql:
-Moving data from sqlite to postgresql
+Moving data from SQLite to PostgreSQL
=====================================
-.. warning::
+Moving your data from SQLite to PostgreSQL is done by executing a series of Django
+management commands as described below.
+
+.. caution::
+
+ Make sure that your SQLite database is migrated to the latest version.
+ Starting paperless will make sure that this is the case. If you try to
+ load data from an old database schema in SQLite into a newer database
+ schema in PostgreSQL, you will run into trouble.
+
+1. Stop paperless, if it is running.
+2. Tell paperless to use PostgreSQL:
+
+ a) With docker, copy the provided ``docker-compose.postgres.yml`` file to
+ ``docker-compose.yml``. Remember to adjust the consumption directory,
+ if necessary.
+ b) Without docker, configure the database in your ``paperless.conf`` file.
+ See :ref:`configuration` for details.
+
+3. Open a shell and initialize the database:
+
+ a) With docker, run the following command to open a shell within the paperless
+ container:
+
+ .. code:: shell-session
+
+ $ cd /path/to/paperless
+ $ docker-compose run --rm webserver /bin/bash
+
+ This will launch the container and initialize the PostgreSQL database.
+
+ b) Without docker, open a shell in your virtual environment, switch to
+ the ``src`` directory and create the database schema:
+
+ .. code:: shell-session
+
+ $ cd /path/to/paperless
+ $ pipenv shell
+ $ cd src
+ $ python3 manage.py migrate
+
+ This will not copy any data yet.
+
+4. Dump your data from SQLite:
+
+ .. code:: shell-session
+
+ $ python3 manage.py dumpdata --database=sqlite --exclude=contenttypes --exclude=auth.Permission > data.json
+
+5. Load your data into PostgreSQL:
+
+ .. code:: shell-session
+
+ $ python3 manage.py loaddata data.json
+
+6. Exit the shell.
+
+ .. code:: shell-session
+
+ $ exit
+
+7. Start paperless.
- TBD.
.. _redis: https://redis.io/
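The ``dumpdata --database=sqlite`` call in step 4 above relies on the settings exposing the old SQLite file under a second database alias next to the default PostgreSQL connection. A sketch of what such a dual-alias ``DATABASES`` setting typically looks like (illustrative; the engine names are Django's, but the hosts, paths and credentials here are assumptions, not paperless's generated settings):

.. code:: python

   # Illustrative dual-alias DATABASES setting for a SQLite-to-PostgreSQL move.
   import os

   DATABASES = {
       "default": {  # the new PostgreSQL database, written to by loaddata
           "ENGINE": "django.db.backends.postgresql",
           "HOST": os.environ.get("PAPERLESS_DBHOST", "localhost"),
           "NAME": "paperless",
           "USER": "paperless",
           "PASSWORD": "paperless",
       },
       "sqlite": {  # the old database, read by dumpdata --database=sqlite
           "ENGINE": "django.db.backends.sqlite3",
           "NAME": os.path.join("/usr/src/paperless/data", "db.sqlite3"),
       },
   }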
From d3482a4aef874ba20f9ab87b2870582afe622d29 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 20:44:35 +0100
Subject: [PATCH 21/52] changelog
---
docs/changelog.rst | 3 +++
1 file changed, 3 insertions(+)
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 86a24df27..4c938ba87 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -17,6 +17,9 @@ next
content type was not set correctly. (i.e. PDF documents with
content type ``application/octet-stream``)
+* Docker entrypoint script awaits the database server if it is
+ configured.
+
paperless-ng 0.9.1
##################
From af3d161f666b98e0640f038a0c37eb3ae876866d Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sat, 21 Nov 2020 23:12:34 +0100
Subject: [PATCH 22/52] updated the admin, ordering for mail rules
---
src/documents/admin.py | 21 ++++++++++++++-----
src/paperless_mail/admin.py | 10 ++++++++-
src/paperless_mail/mail.py | 2 +-
.../migrations/0004_mailrule_order.py | 18 ++++++++++++++++
src/paperless_mail/models.py | 2 ++
5 files changed, 46 insertions(+), 7 deletions(-)
create mode 100644 src/paperless_mail/migrations/0004_mailrule_order.py
diff --git a/src/documents/admin.py b/src/documents/admin.py
index 5b3975fda..8b9f2fce9 100755
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -51,15 +51,16 @@ class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "mime_type", "storage_type", "filename")
+
+ list_display_links = ("title",)
+
list_display = (
- "title",
- "created",
- "added",
"correspondent",
+ "title",
"tags_",
- "archive_serial_number",
- "document_type"
+ "created",
)
+
list_filter = (
"document_type",
"tags",
@@ -117,9 +118,19 @@ class DocumentAdmin(admin.ModelAdmin):
class LogAdmin(admin.ModelAdmin):
+ def has_add_permission(self, request):
+ return False
+
+ def has_change_permission(self, request, obj=None):
+ return False
+
list_display = ("created", "message", "level",)
list_filter = ("level", "created",)
+ ordering = ('-created',)
+
+ list_display_links = ("created", "message")
+
admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
diff --git a/src/paperless_mail/admin.py b/src/paperless_mail/admin.py
index 8d05c2a42..d8560c418 100644
--- a/src/paperless_mail/admin.py
+++ b/src/paperless_mail/admin.py
@@ -11,7 +11,15 @@ class MailRuleAdmin(admin.ModelAdmin):
list_filter = ("account",)
- list_display = ("name", "account", "folder", "action")
+ list_display = ("order", "name", "account", "folder", "action")
+
+ list_editable = ("order", )
+
+ list_display_links = ("name", )
+
+ sortable_by = []
+
+ ordering = ["order"]
admin.site.register(MailAccount, MailAccountAdmin)
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index 9d0397f24..dfdfa09ce 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -161,7 +161,7 @@ class MailAccountHandler(LoggingMixin):
self.log('debug', f"Account {account}: Processing "
f"{account.rules.count()} rule(s)")
- for rule in account.rules.all():
+ for rule in account.rules.order_by('order'):
self.log(
'debug',
f"Account {account}: Processing rule {rule.name}")
diff --git a/src/paperless_mail/migrations/0004_mailrule_order.py b/src/paperless_mail/migrations/0004_mailrule_order.py
new file mode 100644
index 000000000..498f280a1
--- /dev/null
+++ b/src/paperless_mail/migrations/0004_mailrule_order.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2020-11-21 21:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('paperless_mail', '0003_auto_20201118_1940'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='mailrule',
+ name='order',
+ field=models.IntegerField(default=0),
+ ),
+ ]
diff --git a/src/paperless_mail/models.py b/src/paperless_mail/models.py
index 14da202fa..c8ab09479 100644
--- a/src/paperless_mail/models.py
+++ b/src/paperless_mail/models.py
@@ -78,6 +78,8 @@ class MailRule(models.Model):
name = models.CharField(max_length=256, unique=True)
+ order = models.IntegerField(default=0)
+
account = models.ForeignKey(
MailAccount,
related_name="rules",
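The combination introduced in this patch (an integer ``order`` field, ``list_editable`` on that column, and a fixed ``ordering``) is the standard way to make rows reorderable straight from a Django change list. A condensed sketch of the pattern with a hypothetical model (not the actual ``paperless_mail`` code):

.. code:: python

   # Condensed sketch of a change-list-sortable admin, mirroring the patch above.
   from django.contrib import admin
   from django.db import models

   class Rule(models.Model):  # hypothetical stand-in for MailRule
       name = models.CharField(max_length=256, unique=True)
       order = models.IntegerField(default=0)

       class Meta:
           app_label = "example"

   class RuleAdmin(admin.ModelAdmin):
       list_display = ("order", "name")
       list_display_links = ("name",)  # "order" must not be the link, or it cannot be edited inline
       list_editable = ("order",)      # edit positions directly in the change list
       ordering = ["order"]            # rows appear in processing order
       sortable_by = []                # disable column sorting so the manual order stays visible

   admin.site.register(Rule, RuleAdmin)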
From d65a118d8aac6cc30d937b5f1d4843611f4c0999 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sun, 22 Nov 2020 00:35:19 +0100
Subject: [PATCH 23/52] use docker compose for building
---
scripts/make-release.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/make-release.sh b/scripts/make-release.sh
index 06548748b..6860b4ae6 100755
--- a/scripts/make-release.sh
+++ b/scripts/make-release.sh
@@ -94,7 +94,7 @@ cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
cd "$PAPERLESS_DIST_APP"
-docker build . -t "jonaswinkler/paperless-ng:$VERSION"
+docker-compose -f docker-compose.postgres.yml build
# works. package the app!
From 54af13e4b8981e470734a49dc8521251d2cabc30 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sun, 22 Nov 2020 01:39:48 +0100
Subject: [PATCH 24/52] much better mail rule admin
---
docs/changelog.rst | 8 +++++++-
src/paperless_mail/admin.py | 32 ++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 4c938ba87..2af97b33b 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -12,7 +12,7 @@ next
if a file type is supported and which parser to use. Removes all
file type checks that where present in MANY different places in
paperless.
-
+
* Mail consumer now correctly consumes documents even when their
content type was not set correctly. (i.e. PDF documents with
content type ``application/octet-stream``)
@@ -20,6 +20,12 @@ next
* Docker entrypoint script awaits the database server if it is
configured.
+* Basic sorting of mail rules added.
+
+* Disabled editing of logs.
+
+* Much better admin for mail rule editing.
+
paperless-ng 0.9.1
##################
diff --git a/src/paperless_mail/admin.py b/src/paperless_mail/admin.py
index d8560c418..b959171f7 100644
--- a/src/paperless_mail/admin.py
+++ b/src/paperless_mail/admin.py
@@ -9,6 +9,38 @@ class MailAccountAdmin(admin.ModelAdmin):
class MailRuleAdmin(admin.ModelAdmin):
+ radio_fields = {
+ "action": admin.VERTICAL,
+ "assign_title_from": admin.VERTICAL,
+ "assign_correspondent_from": admin.VERTICAL
+ }
+
+ fieldsets = (
+ (None, {
+ 'fields': ('name', 'order', 'account', 'folder')
+ }),
+ ("Filter", {
+ 'description': "Paperless will only process mails that match ALL "
+ "of the filters given below.",
+ 'fields': ('filter_from', 'filter_subject', 'filter_body', 'maximum_age')
+ }),
+ ("Actions", {
+ 'description': "The action applied to the mail. This action is "
+ "only performed when documents were consumed from "
+ "the mail. Mails without attachments will remain "
+ "entirely untouched.",
+ 'fields': ('action', 'action_parameter')
+ }),
+ ("Metadata", {
+ 'description': "Assign metadata to documents consumed from this "
+ "rule automatically. If you do not assign tags, "
+ "types or correspondents here, paperless will "
+ "still process all matching rules that you have "
+ "defined.",
+ "fields": ('assign_title_from', 'assign_tag', 'assign_document_type', 'assign_correspondent_from', 'assign_correspondent')
+ })
+ )
+
list_filter = ("account",)
list_display = ("order", "name", "account", "folder", "action")
From 532d5c1744b835402019a8e5595dc6ff7d3abae2 Mon Sep 17 00:00:00 2001
From: Jonas Winkler
Date: Sun, 22 Nov 2020 11:35:04 +0100
Subject: [PATCH 25/52] a couple styling changes, collapsible menu
---
.../app-frame/app-frame.component.html | 7 ++++---
.../components/app-frame/app-frame.component.ts | 2 ++
.../document-detail.component.html | 16 ++++++++--------
3 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/src-ui/src/app/components/app-frame/app-frame.component.html b/src-ui/src/app/components/app-frame/app-frame.component.html
index 519b69bf0..1232ecf12 100644
--- a/src-ui/src/app/components/app-frame/app-frame.component.html
+++ b/src-ui/src/app/components/app-frame/app-frame.component.html
@@ -1,7 +1,8 @@