From 2e04ba1c049c3a0f633798b0434d9cf60719bfb1 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Thu, 12 Nov 2020 21:09:45 +0100 Subject: [PATCH] code style fixes --- src/documents/admin.py | 13 ++- src/documents/apps.py | 1 - src/documents/classifier.py | 15 ++- src/documents/consumer.py | 2 +- src/documents/forms.py | 6 +- src/documents/index.py | 4 - src/documents/mail.py | 3 +- .../management/commands/document_consumer.py | 3 +- .../commands/document_create_classifier.py | 1 + .../management/commands/document_exporter.py | 9 +- .../management/commands/document_importer.py | 6 +- .../management/commands/document_logs.py | 4 +- .../management/commands/document_renamer.py | 3 +- src/documents/matching.py | 6 +- src/documents/parsers.py | 16 ++-- src/documents/serialisers.py | 1 - src/documents/tasks.py | 1 - src/documents/tests/test_checks.py | 2 +- src/documents/tests/test_file_handling.py | 92 ++++++------------- src/documents/tests/test_importer.py | 3 +- src/documents/tests/test_logger.py | 1 - src/documents/tests/test_mail.py | 3 +- src/documents/tests/test_models.py | 2 +- src/documents/views.py | 18 ++-- src/paperless/checks.py | 9 +- src/paperless/settings.py | 3 + src/paperless/urls.py | 4 +- src/paperless_tesseract/parsers.py | 20 ++-- src/paperless_tesseract/tests/test_date.py | 2 +- src/paperless_text/parsers.py | 4 +- src/setup.cfg | 2 +- 31 files changed, 110 insertions(+), 149 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 51096d860..209ddff35 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -1,5 +1,4 @@ from django.contrib import admin -from django.contrib.auth.models import Group, User from django.utils.html import format_html, format_html_join from django.utils.safestring import mark_safe from whoosh.writing import AsyncWriter @@ -52,8 +51,16 @@ class DocumentAdmin(admin.ModelAdmin): search_fields = ("correspondent__name", "title", "content", "tags__name") readonly_fields = ("added", "file_type", "storage_type", "filename") - list_display = ("title", "created", "added", "correspondent", - "tags_", "archive_serial_number", "document_type", "filename") + list_display = ( + "title", + "created", + "added", + "correspondent", + "tags_", + "archive_serial_number", + "document_type", + "filename" + ) list_filter = ( "document_type", "tags", diff --git a/src/documents/apps.py b/src/documents/apps.py index 6cf815122..2cd7d6c0e 100644 --- a/src/documents/apps.py +++ b/src/documents/apps.py @@ -1,5 +1,4 @@ from django.apps import AppConfig -from django.db.models.signals import post_delete class DocumentsConfig(AppConfig): diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 4ba538162..1b70dcd6f 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -3,7 +3,6 @@ import logging import os import pickle import re -import time from sklearn.feature_extraction.text import CountVectorizer from sklearn.neural_network import MLPClassifier @@ -64,7 +63,7 @@ class DocumentClassifier(object): def save_classifier(self): with open(settings.MODEL_FILE, "wb") as f: - pickle.dump(self.FORMAT_VERSION, f) # Version + pickle.dump(self.FORMAT_VERSION, f) pickle.dump(self.data_hash, f) pickle.dump(self.data_vectorizer, f) @@ -89,16 +88,14 @@ class DocumentClassifier(object): data.append(preprocessed_content) y = -1 - if doc.document_type: - if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO: - y = doc.document_type.pk + if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO: + y = doc.document_type.pk m.update(y.to_bytes(4, 'little', signed=True)) labels_document_type.append(y) y = -1 - if doc.correspondent: - if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO: - y = doc.correspondent.pk + if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO: + y = doc.correspondent.pk m.update(y.to_bytes(4, 'little', signed=True)) labels_correspondent.append(y) @@ -137,7 +134,7 @@ class DocumentClassifier(object): logging.getLogger(__name__).debug("Vectorizing data...") self.data_vectorizer = CountVectorizer( analyzer="word", - ngram_range=(1,2), + ngram_range=(1, 2), min_df=0.01 ) data_vectorized = self.data_vectorizer.fit_transform(data) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 2e8c5493f..6754ebf26 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -155,7 +155,7 @@ class Consumer: self.log("debug", "Saving record to database") created = file_info.created or date or timezone.make_aware( - datetime.datetime.fromtimestamp(stats.st_mtime)) + datetime.datetime.fromtimestamp(stats.st_mtime)) with open(doc, "rb") as f: document = Document.objects.create( diff --git a/src/documents/forms.py b/src/documents/forms.py index e6c7bbf41..a1e42dfea 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -1,5 +1,4 @@ import os - from datetime import datetime from time import mktime @@ -22,7 +21,10 @@ class UploadForm(forms.Form): def get_filename(self, i=None): return os.path.join( settings.CONSUMPTION_DIR, - "{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name + "{}_{}".format( + str(i), + self.cleaned_data.get("document").name + ) if i else self.cleaned_data.get("document").name ) def save(self): diff --git a/src/documents/index.py b/src/documents/index.py index 39471ea51..d46ccedaf 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,8 +1,6 @@ import logging from contextlib import contextmanager -from django.db import models -from django.dispatch import receiver from whoosh import highlight from whoosh.fields import Schema, TEXT, NUMERIC from whoosh.highlight import Formatter, get_text @@ -10,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir from whoosh.qparser import MultifieldParser from whoosh.writing import AsyncWriter -from documents.models import Document from paperless import settings - logger = logging.getLogger(__name__) diff --git a/src/documents/mail.py b/src/documents/mail.py index d54b387b7..780c09872 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -5,12 +5,11 @@ import os import re import time import uuid - from base64 import b64decode from email import policy from email.parser import BytesParser -from dateutil import parser +from dateutil import parser from django.conf import settings from .models import Correspondent diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ea6e033ba..bb317a192 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -3,9 +3,8 @@ import os from django.conf import settings from django.core.management.base import BaseCommand - -from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler +from watchdog.observers import Observer from documents.consumer import Consumer diff --git a/src/documents/management/commands/document_create_classifier.py b/src/documents/management/commands/document_create_classifier.py index 839044700..fbfb7f7e6 100755 --- a/src/documents/management/commands/document_create_classifier.py +++ b/src/documents/management/commands/document_create_classifier.py @@ -1,4 +1,5 @@ from django.core.management.base import BaseCommand + from ...mixins import Renderable from ...tasks import train_classifier diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 43582a619..971e6a829 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -1,16 +1,15 @@ import json import os -import time import shutil +import time -from django.core.management.base import BaseCommand, CommandError from django.core import serializers +from django.core.management.base import BaseCommand, CommandError from documents.models import Document, Correspondent, Tag, DocumentType -from paperless.db import GnuPG - -from ...mixins import Renderable from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME +from paperless.db import GnuPG +from ...mixins import Renderable class Command(Renderable, BaseCommand): diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index ef3eaafc0..da9086144 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -3,17 +3,15 @@ import os import shutil from django.conf import settings -from django.core.management.base import BaseCommand, CommandError from django.core.management import call_command +from django.core.management.base import BaseCommand, CommandError from documents.models import Document +from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME from paperless.db import GnuPG from ...file_handling import generate_filename, create_source_path_directory - from ...mixins import Renderable -from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME - class Command(Renderable, BaseCommand): diff --git a/src/documents/management/commands/document_logs.py b/src/documents/management/commands/document_logs.py index 9a8271e71..06efc3850 100644 --- a/src/documents/management/commands/document_logs.py +++ b/src/documents/management/commands/document_logs.py @@ -8,5 +8,5 @@ class Command(BaseCommand): help = "A quick & dirty way to see what's in the logs" def handle(self, *args, **options): - for l in Log.objects.order_by("pk"): - print(l) + for log in Log.objects.order_by("pk"): + print(log) diff --git a/src/documents/management/commands/document_renamer.py b/src/documents/management/commands/document_renamer.py index d7d77a111..ba9e74de5 100644 --- a/src/documents/management/commands/document_renamer.py +++ b/src/documents/management/commands/document_renamer.py @@ -1,7 +1,6 @@ from django.core.management.base import BaseCommand -from documents.models import Document, Tag - +from documents.models import Document from ...mixins import Renderable diff --git a/src/documents/matching.py b/src/documents/matching.py index 045e2863a..e5789ab2e 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier): correspondents = Correspondent.objects.all() predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None - matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id] - return matched_correspondents + return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id] def match_document_types(document_content, classifier): document_types = DocumentType.objects.all() predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None - matched_document_types = [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id] - return matched_document_types + return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id] def match_tags(document_content, classifier): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 0cbd13987..adc66df57 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -22,11 +22,13 @@ from django.utils import timezone # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits from documents.signals import document_consumer_declaration +# TODO: isnt there a date parsing library for this? + DATE_REGEX = re.compile( - r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + + r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501 + r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501 + r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501 + r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' ) @@ -43,7 +45,7 @@ def get_parser_class(doc): for response in document_consumer_declaration.send(None): parsers.append(response[1]) - #TODO: add a check that checks parser availability. + # TODO: add a check that checks parser availability. options = [] for parser in parsers: @@ -59,7 +61,7 @@ def get_parser_class(doc): options, key=lambda _: _["weight"], reverse=True)[0]["parser"] -def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): +def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT @@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False args += ['-trim'] if trim else [] args += ['-type', str(type)] if type else [] args += ['-depth', str(depth)] if depth else [] - args += [input, output] + args += [input_file, output_file] logger.debug("Execute: " + " ".join(args), extra={'group': logging_group}) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 60cd7b293..e42e26881 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer): class LogSerializer(serializers.ModelSerializer): - class Meta: model = Log fields = ( diff --git a/src/documents/tasks.py b/src/documents/tasks.py index aaf466bd2..9a3a0d7b8 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,7 +1,6 @@ import logging from django.conf import settings -from django_q.tasks import async_task, result from whoosh.writing import AsyncWriter from documents import index diff --git a/src/documents/tests/test_checks.py b/src/documents/tests/test_checks.py index da3a4adf0..d316f94b5 100644 --- a/src/documents/tests/test_checks.py +++ b/src/documents/tests/test_checks.py @@ -2,9 +2,9 @@ import unittest from django.test import TestCase +from .factories import DocumentFactory from ..checks import changed_password_check from ..models import Document -from .factories import DocumentFactory class ChecksTestCase(TestCase): diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index 18fd327b1..4aced80cb 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -1,14 +1,13 @@ import os import shutil -from uuid import uuid4 from pathlib import Path +from uuid import uuid4 +from django.conf import settings from django.test import TestCase, override_settings from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories from ..models import Document, Correspondent -from django.conf import settings - from ..signals.handlers import update_filename_and_move_files @@ -68,24 +67,18 @@ class TestDate(TestCase): # test that creating dirs for the source_path creates the correct directory create_source_path_directory(document.source_path) Path(document.source_path).touch() - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) # Set a correspondent and save the document - document.correspondent = Correspondent.objects.get_or_create( - name="test")[0] + document.correspondent = Correspondent.objects.get_or_create(name="test")[0] document.save() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/test"), True) - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none"), False) - self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + - "/test/test-{:07d}.pdf.gpg".format(document.pk)), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming_missing_permissions(self): document = Document() document.file_type = "pdf" @@ -100,27 +93,22 @@ class TestDate(TestCase): Path(document.source_path).touch() # Test source_path - self.assertEqual(document.source_path, settings.ORIGINALS_DIR + - "/none/none-{:07d}.pdf".format(document.pk)) + self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)) # Make the folder read- and execute-only (no writing and no renaming) os.chmod(settings.ORIGINALS_DIR + "/none", 0o555) # Set a correspondent and save the document - document.correspondent = Correspondent.objects.get_or_create( - name="test")[0] + document.correspondent = Correspondent.objects.get_or_create(name="test")[0] document.save() # Check proper handling of files - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-{:07d}.pdf".format(document.pk)), True) - self.assertEqual(document.filename, - "none/none-{:07d}.pdf".format(document.pk)) + self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming_database_error(self): document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") @@ -155,13 +143,10 @@ class TestDate(TestCase): # Check proper handling of files self.assertTrue(os.path.isfile(document.source_path)) - self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" + - "originals/none/none-{:07d}.pdf".format(document.pk)), True) - self.assertEqual(document.filename, - "none/none-{:07d}.pdf".format(document.pk)) + self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) + self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_document_delete(self): document = Document() document.file_type = "pdf" @@ -179,13 +164,10 @@ class TestDate(TestCase): # Ensure file deletion after delete pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + - "/none/none-{:07d}.pdf".format(pk)), False) - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_document_delete_nofile(self): document = Document() document.file_type = "pdf" @@ -194,8 +176,7 @@ class TestDate(TestCase): document.delete() - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_directory_not_empty(self): document = Document() document.file_type = "pdf" @@ -214,18 +195,14 @@ class TestDate(TestCase): Path(important_file).touch() # Set a correspondent and save the document - document.correspondent = Correspondent.objects.get_or_create( - name="test")[0] + document.correspondent = Correspondent.objects.get_or_create(name="test")[0] document.save() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/test"), True) - self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + - "/documents/originals/none"), True) + self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) + self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) self.assertTrue(os.path.isfile(important_file)) - @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_underscore(self): document = Document() @@ -304,9 +281,7 @@ class TestDate(TestCase): self.assertEqual(generate_filename(document), "none-{:07d}.pdf".format(document.pk)) - - @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" + - "{correspondent}/{correspondent}") + @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}") def test_nested_directory_cleanup(self): document = Document() document.file_type = "pdf" @@ -315,25 +290,19 @@ class TestDate(TestCase): # Ensure that filename is properly generated document.filename = generate_filename(document) - self.assertEqual(document.filename, - "none/none/none-{:07d}.pdf".format(document.pk)) + self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk)) create_source_path_directory(document.source_path) Path(document.source_path).touch() # Check proper handling of files - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none/none"), True) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), True) pk = document.pk document.delete() - self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + - "/none/none/none-{:07d}.pdf".format(pk)), - False) - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none/none"), False) - self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + - "/none"), False) + self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False) + self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False) self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True) @override_settings(PAPERLESS_FILENAME_FORMAT=None) @@ -355,8 +324,7 @@ class TestDate(TestCase): Path(os.path.join(tmp, "notempty", "file")).touch() os.makedirs(os.path.join(tmp, "notempty", "empty")) - delete_empty_directories( - os.path.join(tmp, "notempty", "empty")) + delete_empty_directories(os.path.join(tmp, "notempty", "empty")) self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) self.assertEqual(os.path.isfile( os.path.join(tmp, "notempty", "file")), True) diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py index 0efddbd71..01600df33 100644 --- a/src/documents/tests/test_importer.py +++ b/src/documents/tests/test_importer.py @@ -1,9 +1,8 @@ from django.core.management.base import CommandError from django.test import TestCase -from ..management.commands.document_importer import Command - from documents.settings import EXPORTER_FILE_NAME +from ..management.commands.document_importer import Command class TestImporter(TestCase): diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py index 51a4fad83..6e240ffc9 100644 --- a/src/documents/tests/test_logger.py +++ b/src/documents/tests/test_logger.py @@ -1,6 +1,5 @@ import logging import uuid - from unittest import mock from django.test import TestCase diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py index 3e78cd23b..22226217e 100644 --- a/src/documents/tests/test_mail.py +++ b/src/documents/tests/test_mail.py @@ -1,10 +1,9 @@ import base64 import os -import magic - from hashlib import md5 from unittest import mock +import magic from django.conf import settings from django.test import TestCase diff --git a/src/documents/tests/test_models.py b/src/documents/tests/test_models.py index 606403ec1..37b088c7f 100644 --- a/src/documents/tests/test_models.py +++ b/src/documents/tests/test_models.py @@ -1,7 +1,7 @@ from django.test import TestCase -from ..models import Document, Correspondent from .factories import DocumentFactory, CorrespondentFactory +from ..models import Document, Correspondent class CorrespondentTestCase(TestCase): diff --git a/src/documents/views.py b/src/documents/views.py index 8cc330141..166806d8e 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -4,11 +4,6 @@ from django.views.decorators.cache import cache_control from django.views.generic import TemplateView from django_filters.rest_framework import DjangoFilterBackend from rest_framework.decorators import action -from rest_framework.response import Response -from rest_framework.views import APIView - -from paperless.db import GnuPG -from paperless.views import StandardPagination from rest_framework.filters import OrderingFilter, SearchFilter from rest_framework.mixins import ( DestroyModelMixin, @@ -17,12 +12,17 @@ from rest_framework.mixins import ( UpdateModelMixin ) from rest_framework.permissions import IsAuthenticated +from rest_framework.response import Response +from rest_framework.views import APIView from rest_framework.viewsets import ( GenericViewSet, ModelViewSet, ReadOnlyModelViewSet ) +import documents.index as index +from paperless.db import GnuPG +from paperless.views import StandardPagination from .filters import ( CorrespondentFilterSet, DocumentFilterSet, @@ -30,8 +30,6 @@ from .filters import ( DocumentTypeFilterSet, LogFilterSet ) - -import documents.index as index from .forms import UploadForm from .models import Correspondent, Document, Log, Tag, DocumentType from .serialisers import ( @@ -106,7 +104,7 @@ class DocumentViewSet(RetrieveModelMixin, return super(DocumentViewSet, self).destroy(request, *args, **kwargs) def file_response(self, pk, disposition): - #TODO: this should not be necessary here. + # TODO: this should not be necessary here. content_types = { Document.TYPE_PDF: "application/pdf", Document.TYPE_PNG: "image/png", @@ -114,7 +112,7 @@ class DocumentViewSet(RetrieveModelMixin, Document.TYPE_GIF: "image/gif", Document.TYPE_TIF: "image/tiff", Document.TYPE_CSV: "text/csv", - Document.TYPE_MD: "text/markdown", + Document.TYPE_MD: "text/markdown", Document.TYPE_TXT: "text/plain" } @@ -132,7 +130,7 @@ class DocumentViewSet(RetrieveModelMixin, @action(methods=['post'], detail=False) def post_document(self, request, pk=None): - #TODO: is this a good implementation? + # TODO: is this a good implementation? form = UploadForm(data=request.POST, files=request.FILES) if form.is_valid(): form.save() diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 0b725cfd6..8605d0089 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -11,6 +11,8 @@ writeable_hint = ( "Set the permissions of {} to be writeable by the user running the " "Paperless services" ) + + def path_check(env_var): messages = [] directory = os.getenv(env_var) @@ -27,6 +29,7 @@ def path_check(env_var): )) return messages + @register() def paths_check(app_configs, **kwargs): """ @@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs): """ check_messages = path_check("PAPERLESS_DATA_DIR") + \ - path_check("PAPERLESS_MEDIA_ROOT") + \ - path_check("PAPERLESS_CONSUMPTION_DIR") + \ - path_check("PAPERLESS_STATICDIR") + path_check("PAPERLESS_MEDIA_ROOT") + \ + path_check("PAPERLESS_CONSUMPTION_DIR") + \ + path_check("PAPERLESS_STATICDIR") return check_messages diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 597e6dc15..dda85e039 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -25,6 +25,7 @@ elif os.path.exists("/usr/local/etc/paperless.conf"): # Tesseract process to one thread. os.environ['OMP_THREAD_LIMIT'] = "1" + def __get_boolean(key, default="NO"): """ Return a boolean value based on whatever the user has supplied in the @@ -32,9 +33,11 @@ def __get_boolean(key, default="NO"): """ return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true")) + # NEVER RUN WITH DEBUG IN PRODUCTION. DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") + ############################################################################### # Directories # ############################################################################### diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 6f5bf8ed2..e53057d67 100755 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -6,7 +6,6 @@ from django.views.decorators.csrf import csrf_exempt from django.views.generic import RedirectView from rest_framework.routers import DefaultRouter -from paperless.views import FaviconView from documents.views import ( CorrespondentViewSet, DocumentViewSet, @@ -18,6 +17,7 @@ from documents.views import ( SearchAutoCompleteView, StatisticsView ) +from paperless.views import FaviconView api_router = DefaultRouter() api_router.register(r"correspondents", CorrespondentViewSet) @@ -30,7 +30,7 @@ api_router.register(r"tags", TagViewSet) urlpatterns = [ # API - url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")), + url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")), url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"), url(r"^api/search/", SearchView.as_view(), name="search"), url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"), diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index befc9bcd7..4018e853a 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -5,15 +5,14 @@ import subprocess from multiprocessing.pool import Pool import langdetect +import pdftotext import pyocr -from django.conf import settings from PIL import Image +from django.conf import settings from pyocr import PyocrException -import pdftotext from documents.parsers import DocumentParser, ParseError, run_unpaper, \ run_convert - from .languages import ISO639 @@ -45,8 +44,8 @@ class RasterisedDocumentParser(DocumentParser): alpha="remove", strip=True, trim=True, - input="{}[0]".format(self.document_path), - output=out_path, + input_file="{}[0]".format(self.document_path), + output_file=out_path, logging_group=self.logging_group) except ParseError: # if convert fails, fall back to extracting @@ -66,8 +65,8 @@ class RasterisedDocumentParser(DocumentParser): alpha="remove", strip=True, trim=True, - input=gs_out_path, - output=out_path, + input_file=gs_out_path, + output_file=out_path, logging_group=self.logging_group) return out_path @@ -99,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser): try: sample_page_index = int(len(images) / 2) - self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images))) + self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images))) sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] guessed_language = self._guess_language(sample_page_text) @@ -139,8 +138,8 @@ class RasterisedDocumentParser(DocumentParser): run_convert(density=settings.CONVERT_DENSITY, depth="8", type="grayscale", - input=self.document_path, - output=pnm, + input_file=self.document_path, + output_file=pnm, logging_group=self.logging_group) # Get a list of converted images @@ -189,7 +188,6 @@ class RasterisedDocumentParser(DocumentParser): return [sample_page] - def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index 51317362f..4d5ff07dd 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -5,10 +5,10 @@ from unittest import mock from uuid import uuid4 from dateutil import tz +from django.conf import settings from django.test import TestCase, override_settings from ..parsers import RasterisedDocumentParser -from django.conf import settings class TestDate(TestCase): diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 0db1e230b..015016fb3 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -47,8 +47,8 @@ class TextDocumentParser(DocumentParser): def read_text(): with open(self.document_path, 'r') as src: - lines = [l.strip() for l in src.readlines()] - text = "\n".join([l for l in lines[:n_lines]]) + lines = [line.strip() for line in src.readlines()] + text = "\n".join([line for line in lines[:n_lines]]) return text.replace('"', "'") def create_txlayer(): diff --git a/src/setup.cfg b/src/setup.cfg index b09b8d4bb..33bef4f4e 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -1,6 +1,6 @@ [pycodestyle] exclude = migrations, paperless/settings.py, .tox - +ignore = E501 [tool:pytest] DJANGO_SETTINGS_MODULE=paperless.settings