code style fixes

This commit is contained in:
Jonas Winkler 2020-11-12 21:09:45 +01:00
parent 9c4cf5d7bd
commit 2e04ba1c04
31 changed files with 110 additions and 149 deletions

View File

@ -1,5 +1,4 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
@ -52,8 +51,16 @@ class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type", "filename")
list_display = ("title", "created", "added", "correspondent",
"tags_", "archive_serial_number", "document_type", "filename")
list_display = (
"title",
"created",
"added",
"correspondent",
"tags_",
"archive_serial_number",
"document_type",
"filename"
)
list_filter = (
"document_type",
"tags",

View File

@ -1,5 +1,4 @@
from django.apps import AppConfig
from django.db.models.signals import post_delete
class DocumentsConfig(AppConfig):

View File

@ -3,7 +3,6 @@ import logging
import os
import pickle
import re
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
@ -64,7 +63,7 @@ class DocumentClassifier(object):
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f) # Version
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.data_hash, f)
pickle.dump(self.data_vectorizer, f)
@ -89,16 +88,14 @@ class DocumentClassifier(object):
data.append(preprocessed_content)
y = -1
if doc.document_type:
if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_document_type.append(y)
y = -1
if doc.correspondent:
if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.pk
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
@ -137,7 +134,7 @@ class DocumentClassifier(object):
logging.getLogger(__name__).debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1,2),
ngram_range=(1, 2),
min_df=0.01
)
data_vectorized = self.data_vectorizer.fit_transform(data)

View File

@ -155,7 +155,7 @@ class Consumer:
self.log("debug", "Saving record to database")
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:
document = Document.objects.create(

View File

@ -1,5 +1,4 @@
import os
from datetime import datetime
from time import mktime
@ -22,7 +21,10 @@ class UploadForm(forms.Form):
def get_filename(self, i=None):
return os.path.join(
settings.CONSUMPTION_DIR,
"{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
"{}_{}".format(
str(i),
self.cleaned_data.get("document").name
) if i else self.cleaned_data.get("document").name
)
def save(self):

View File

@ -1,8 +1,6 @@
import logging
from contextlib import contextmanager
from django.db import models
from django.dispatch import receiver
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@ -10,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter
from documents.models import Document
from paperless import settings
logger = logging.getLogger(__name__)

View File

@ -5,12 +5,11 @@ import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from dateutil import parser
from dateutil import parser
from django.conf import settings
from .models import Correspondent

View File

@ -3,9 +3,8 @@ import os
from django.conf import settings
from django.core.management.base import BaseCommand
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from documents.consumer import Consumer

View File

@ -1,4 +1,5 @@
from django.core.management.base import BaseCommand
from ...mixins import Renderable
from ...tasks import train_classifier

View File

@ -1,16 +1,15 @@
import json
import os
import time
import shutil
import time
from django.core.management.base import BaseCommand, CommandError
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document, Correspondent, Tag, DocumentType
from paperless.db import GnuPG
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...mixins import Renderable
class Command(Renderable, BaseCommand):

View File

@ -3,17 +3,15 @@ import os
import shutil
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
class Command(Renderable, BaseCommand):

View File

@ -8,5 +8,5 @@ class Command(BaseCommand):
help = "A quick & dirty way to see what's in the logs"
def handle(self, *args, **options):
for l in Log.objects.order_by("pk"):
print(l)
for log in Log.objects.order_by("pk"):
print(log)

View File

@ -1,7 +1,6 @@
from django.core.management.base import BaseCommand
from documents.models import Document, Tag
from documents.models import Document
from ...mixins import Renderable

View File

@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier):
correspondents = Correspondent.objects.all()
predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
return matched_correspondents
return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
def match_document_types(document_content, classifier):
document_types = DocumentType.objects.all()
predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
matched_document_types = [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
return matched_document_types
return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
def match_tags(document_content, classifier):

View File

@ -22,11 +22,13 @@ from django.utils import timezone
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.signals import document_consumer_declaration
# TODO: isn't there a date parsing library for this?
DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
@ -43,7 +45,7 @@ def get_parser_class(doc):
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
#TODO: add a check that checks parser availability.
# TODO: add a check that checks parser availability.
options = []
for parser in parsers:
@ -59,7 +61,7 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False
args += ['-trim'] if trim else []
args += ['-type', str(type)] if type else []
args += ['-depth', str(depth)] if depth else []
args += [input, output]
args += [input_file, output_file]
logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})

View File

@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer):
class LogSerializer(serializers.ModelSerializer):
class Meta:
model = Log
fields = (

View File

@ -1,7 +1,6 @@
import logging
from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter
from documents import index

View File

@ -2,9 +2,9 @@ import unittest
from django.test import TestCase
from .factories import DocumentFactory
from ..checks import changed_password_check
from ..models import Document
from .factories import DocumentFactory
class ChecksTestCase(TestCase):

View File

@ -1,14 +1,13 @@
import os
import shutil
from uuid import uuid4
from pathlib import Path
from uuid import uuid4
from django.conf import settings
from django.test import TestCase, override_settings
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
from django.conf import settings
from ..signals.handlers import update_filename_and_move_files
@ -68,24 +67,18 @@ class TestDate(TestCase):
# test that creating dirs for the source_path creates the correct directory
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/test"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR +
"/test/test-{:07d}.pdf.gpg".format(document.pk)), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
document.file_type = "pdf"
@ -100,27 +93,22 @@ class TestDate(TestCase):
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.ORIGINALS_DIR +
"/none/none-{:07d}.pdf".format(document.pk))
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk))
# Make the folder read- and execute-only (no writing and no renaming)
os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):
document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
@ -155,13 +143,10 @@ class TestDate(TestCase):
# Check proper handling of files
self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
"originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename,
"none/none-{:07d}.pdf".format(document.pk))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self):
document = Document()
document.file_type = "pdf"
@ -179,13 +164,10 @@ class TestDate(TestCase):
# Ensure file deletion after delete
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR +
"/none/none-{:07d}.pdf".format(pk)), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
document = Document()
document.file_type = "pdf"
@ -194,8 +176,7 @@ class TestDate(TestCase):
document.delete()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
document = Document()
document.file_type = "pdf"
@ -214,18 +195,14 @@ class TestDate(TestCase):
Path(important_file).touch()
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
"/documents/originals/none"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
self.assertTrue(os.path.isfile(important_file))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
@ -304,9 +281,7 @@ class TestDate(TestCase):
self.assertEqual(generate_filename(document),
"none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
"{correspondent}/{correspondent}")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
document.file_type = "pdf"
@ -315,25 +290,19 @@ class TestDate(TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none/none-{:07d}.pdf".format(document.pk))
self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk))
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none/none"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), True)
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR +
"/none/none/none-{:07d}.pdf".format(pk)),
False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR +
"/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
@ -355,8 +324,7 @@ class TestDate(TestCase):
Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(
os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)

View File

@ -1,9 +1,8 @@
from django.core.management.base import CommandError
from django.test import TestCase
from ..management.commands.document_importer import Command
from documents.settings import EXPORTER_FILE_NAME
from ..management.commands.document_importer import Command
class TestImporter(TestCase):

View File

@ -1,6 +1,5 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase

View File

@ -1,10 +1,9 @@
import base64
import os
import magic
from hashlib import md5
from unittest import mock
import magic
from django.conf import settings
from django.test import TestCase

View File

@ -1,7 +1,7 @@
from django.test import TestCase
from ..models import Document, Correspondent
from .factories import DocumentFactory, CorrespondentFactory
from ..models import Document, Correspondent
class CorrespondentTestCase(TestCase):

View File

@ -4,11 +4,6 @@ from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.views import APIView
from paperless.db import GnuPG
from paperless.views import StandardPagination
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
DestroyModelMixin,
@ -17,12 +12,17 @@ from rest_framework.mixins import (
UpdateModelMixin
)
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ReadOnlyModelViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
@ -30,8 +30,6 @@ from .filters import (
DocumentTypeFilterSet,
LogFilterSet
)
import documents.index as index
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
@ -106,7 +104,7 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
#TODO: this should not be necessary here.
# TODO: this should not be necessary here.
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
@ -114,7 +112,7 @@ class DocumentViewSet(RetrieveModelMixin,
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
@ -132,7 +130,7 @@ class DocumentViewSet(RetrieveModelMixin,
@action(methods=['post'], detail=False)
def post_document(self, request, pk=None):
#TODO: is this a good implementation?
# TODO: is this a good implementation?
form = UploadForm(data=request.POST, files=request.FILES)
if form.is_valid():
form.save()

View File

@ -11,6 +11,8 @@ writeable_hint = (
"Set the permissions of {} to be writeable by the user running the "
"Paperless services"
)
def path_check(env_var):
messages = []
directory = os.getenv(env_var)
@ -27,6 +29,7 @@ def path_check(env_var):
))
return messages
@register()
def paths_check(app_configs, **kwargs):
"""
@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs):
"""
check_messages = path_check("PAPERLESS_DATA_DIR") + \
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
return check_messages

View File

@ -25,6 +25,7 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
# Tesseract process to one thread.
os.environ['OMP_THREAD_LIMIT'] = "1"
def __get_boolean(key, default="NO"):
"""
Return a boolean value based on whatever the user has supplied in the
@ -32,9 +33,11 @@ def __get_boolean(key, default="NO"):
"""
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
###############################################################################
# Directories #
###############################################################################

View File

@ -6,7 +6,6 @@ from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter
from paperless.views import FaviconView
from documents.views import (
CorrespondentViewSet,
DocumentViewSet,
@ -18,6 +17,7 @@ from documents.views import (
SearchAutoCompleteView,
StatisticsView
)
from paperless.views import FaviconView
api_router = DefaultRouter()
api_router.register(r"correspondents", CorrespondentViewSet)
@ -30,7 +30,7 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [
# API
url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),

View File

@ -5,15 +5,14 @@ import subprocess
from multiprocessing.pool import Pool
import langdetect
import pdftotext
import pyocr
from django.conf import settings
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
import pdftotext
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
run_convert
from .languages import ISO639
@ -45,8 +44,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input="{}[0]".format(self.document_path),
output=out_path,
input_file="{}[0]".format(self.document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
@ -66,8 +65,8 @@ class RasterisedDocumentParser(DocumentParser):
alpha="remove",
strip=True,
trim=True,
input=gs_out_path,
output=out_path,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
@ -99,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser):
try:
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
@ -139,8 +138,8 @@ class RasterisedDocumentParser(DocumentParser):
run_convert(density=settings.CONVERT_DENSITY,
depth="8",
type="grayscale",
input=self.document_path,
output=pnm,
input_file=self.document_path,
output_file=pnm,
logging_group=self.logging_group)
# Get a list of converted images
@ -189,7 +188,6 @@ class RasterisedDocumentParser(DocumentParser):
return [sample_page]
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(

View File

@ -5,10 +5,10 @@ from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from ..parsers import RasterisedDocumentParser
from django.conf import settings
class TestDate(TestCase):

View File

@ -47,8 +47,8 @@ class TextDocumentParser(DocumentParser):
def read_text():
with open(self.document_path, 'r') as src:
lines = [l.strip() for l in src.readlines()]
text = "\n".join([l for l in lines[:n_lines]])
lines = [line.strip() for line in src.readlines()]
text = "\n".join([line for line in lines[:n_lines]])
return text.replace('"', "'")
def create_txlayer():

View File

@ -1,6 +1,6 @@
[pycodestyle]
exclude = migrations, paperless/settings.py, .tox
ignore = E501
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings