code cleanup

This commit is contained in:
Jonas Winkler
2020-11-21 14:03:45 +01:00
parent dbe90994ca
commit afc3753e58
18 changed files with 208 additions and 101 deletions

View File

@@ -104,9 +104,11 @@ class Consumer(LoggingMixin):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
raise ConsumerError(f"No parsers abvailable for {self.filename}")
else:
self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
self.log("debug",
f"Parser: {parser_class.__name__} "
f"based on mime type {mime_type}")
# Notify all listeners that we're going to do some work.
@@ -126,7 +128,7 @@ class Consumer(LoggingMixin):
# Parse the document. This may take some time.
try:
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail()
self.log("debug", "Parsing {}...".format(self.filename))
text = document_parser.get_text()
@@ -244,10 +246,12 @@ class Consumer(LoggingMixin):
document.title = self.override_title
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
document.correspondent = Correspondent.objects.get(
pk=self.override_correspondent_id)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
document.document_type = DocumentType.objects.get(
pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:

View File

@@ -87,7 +87,9 @@ def generate_filename(document):
tags=tags,
)
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
logging.getLogger(__name__).warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:

View File

@@ -46,9 +46,14 @@ class UploadForm(forms.Form):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
dir=settings.SCRATCH_DIR,
delete=False) as f:
f.write(data)
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
async_task("documents.tasks.consume_file",
f.name,
override_filename=original_filename,
task_name=os.path.basename(original_filename))

View File

@@ -120,6 +120,7 @@ def query_page(ix, query, page):
def autocomplete(ix, term, limit=10):
with ix.reader() as reader:
terms = []
for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
for (score, t) in reader.most_distinctive_terms(
"content", number=limit, prefix=term.lower()):
terms.append(t)
return terms

View File

@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
def _consume(self, file):
if os.path.isfile(file):
try:
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
async_task("documents.tasks.consume_file",
file,
task_name=os.path.basename(file))
except Exception as e:
# Catch all so that the consumer won't crash.
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
logging.getLogger(__name__).error(
"Error while consuming document: {}".format(e))
def on_created(self, event):
self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
# Consume all files as this is not done initially by the watchdog
for entry in os.scandir(directory):
if entry.is_file():
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
async_task("documents.tasks.consume_file",
entry.path,
task_name=os.path.basename(entry.path))
# Start the watchdog. Woof!
if settings.CONSUMER_POLLING > 0:
logging.getLogger(__name__).info('Using polling instead of file'
'system notifications.')
logging.getLogger(__name__).info(
"Using polling instead of file system notifications.")
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
else:
observer = Observer()

View File

@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):
document = document_map[document_dict["pk"]]
unique_filename = "{:07}_{}".format(document.pk, document.file_name)
unique_filename = f"{document.pk:07}_{document.file_name}"
file_target = os.path.join(self.target, unique_filename)
@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
document_dict[EXPORTER_FILE_NAME] = unique_filename
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
print("Exporting: {}".format(file_target))
print(f"Exporting: {file_target}")
t = int(time.mktime(document.created.timetuple()))
if document.storage_type == Document.STORAGE_TYPE_GPG:

View File

@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
encrypted.write(GnuPG.encrypted(unencrypted))
else:
print("Moving {} to {}".format(document_path, document.source_path))
print(f"Moving {document_path} to {document.source_path}")
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)

View File

@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
try:
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
logging.getLogger(__name__).warning(
f"Cannot classify documents: {e}.")
classifier = None
for document in documents:
logging.getLogger(__name__).info(
"Processing document {}".format(document.title)
)
f"Processing document {document.title}")
if options['correspondent']:
set_correspondent(

View File

@@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag
def match_correspondents(document_content, classifier):
correspondents = Correspondent.objects.all()
predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
if classifier:
pred_id = classifier.predict_correspondent(document_content)
else:
pred_id = None
return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
correspondents = Correspondent.objects.all()
return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
def match_document_types(document_content, classifier):
document_types = DocumentType.objects.all()
predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
if classifier:
pred_id = classifier.predict_document_type(document_content)
else:
pred_id = None
return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
document_types = DocumentType.objects.all()
return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
def match_tags(document_content, classifier):

View File

@@ -73,7 +73,18 @@ def get_parser_class(path):
return get_parser_class_for_mime_type(mime_type)
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def run_convert(input_file,
output_file,
density=None,
scale=None,
alpha=None,
strip=False,
trim=False,
type=None,
depth=None,
extra=None,
logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
pnm_out)
logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
logger.debug(f"Execute: {' '.join(command_args)}",
extra={'group': logging_group})
if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
raise ParseError("Unpaper failed at {}".format(command_args))
if not subprocess.Popen(command_args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL).wait() == 0:
raise ParseError(f"Unpaper failed at {command_args}")
return pnm_out
@@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin):
super().__init__()
self.logging_group = logging_group
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.tempdir = tempfile.mkdtemp(
prefix="paperless-", dir=settings.SCRATCH_DIR)
def get_thumbnail(self):
"""
@@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin):
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
args = (settings.OPTIPNG_BINARY,
"-silent", "-o5", in_path, "-out", out_path)
self.log('debug', 'Execute: ' + " ".join(args))
self.log('debug', f"Execute: {' '.join(args)}")
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))

View File

@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
class DocumentSerializer(serializers.ModelSerializer):
correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
correspondent_id = CorrespondentField(
allow_null=True, source='correspondent')
tags_id = TagsField(many=True, source='tags')
document_type_id = DocumentTypeField(allow_null=True, source='document_type')
document_type_id = DocumentTypeField(
allow_null=True, source='document_type')
class Meta:
model = Document

View File

@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
document.tags.add(*inbox_tags)
def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
def set_correspondent(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
**kwargs):
if document.correspondent and not replace:
return
potential_correspondents = matching.match_correspondents(document.content, classifier)
potential_correspondents = matching.match_correspondents(document.content,
classifier)
potential_count = len(potential_correspondents)
if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
selected = None
if potential_count > 1:
if use_first:
message = "Detected {} potential correspondents, so we've opted for {}"
logger(
message.format(potential_count, selected),
f"Detected {potential_count} potential correspondents, "
f"so we've opted for {selected}",
logging_group
)
else:
message = "Detected {} potential correspondents, not assigning any correspondent"
logger(
message.format(potential_count),
f"Detected {potential_count} potential correspondents, "
f"not assigning any correspondent",
logging_group
)
return
if selected or replace:
logger(
'Assigning correspondent "{}" to "{}" '.format(selected, document),
f"Assigning correspondent {selected} to {document}",
logging_group
)
@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
document.save(update_fields=("correspondent",))
def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
def set_document_type(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
**kwargs):
if document.document_type and not replace:
return
potential_document_type = matching.match_document_types(document.content, classifier)
potential_document_type = matching.match_document_types(document.content,
classifier)
potential_count = len(potential_document_type)
if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
if potential_count > 1:
if use_first:
message = "Detected {} potential document types, so we've opted for {}"
logger(
message.format(potential_count, selected),
f"Detected {potential_count} potential document types, "
f"so we've opted for {selected}",
logging_group
)
else:
message = "Detected {} potential document types, not assigning any document type"
logger(
message.format(potential_count),
f"Detected {potential_count} potential document types, "
f"not assigning any document type",
logging_group
)
return
if selected or replace:
logger(
'Assigning document type "{}" to "{}" '.format(selected, document),
f"Assigning document type {selected} to {document}",
logging_group
)
@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
document.save(update_fields=("document_type",))
def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
def set_tags(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
**kwargs):
if replace:
document.tags.clear()
current_tags = set([])
else:
current_tags = set(document.tags.all())
relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
matched_tags = matching.match_tags(document.content, classifier)
relevant_tags = set(matched_tags) - current_tags
if not relevant_tags:
return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
logging.getLogger(__name__).fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
logging.getLogger(__name__).warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return
create_source_path_directory(new_path)

View File

@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
def test_changed_password_check_no_encryption(self):
DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
self.assertEqual(changed_password_check(None), [])
@unittest.skip("I don't know how to test this")
def test_changed_password_check_gpg_encryption_with_good_password(self):
pass
@unittest.skip("I don't know how to test this")
def test_changed_password_check_fail(self):
pass

View File

@@ -47,18 +47,30 @@ class IndexView(TemplateView):
class CorrespondentViewSet(ModelViewSet):
model = Correspondent
queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
queryset = Correspondent.objects.annotate(
document_count=Count('documents'),
last_correspondence=Max('documents__created')).order_by('name')
serializer_class = CorrespondentSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = CorrespondentFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
ordering_fields = (
"name",
"matching_algorithm",
"match",
"document_count",
"last_correspondence")
class TagViewSet(ModelViewSet):
model = Tag
queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by('name')
serializer_class = TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):
class DocumentTypeViewSet(ModelViewSet):
model = DocumentType
queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
queryset = DocumentType.objects.annotate(
document_count=Count('documents')).order_by('name')
serializer_class = DocumentTypeSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
"id",
"title",
"correspondent__name",
"document_type__name",
"created",
"modified",
"added",
"archive_serial_number")
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(request, *args, **kwargs)
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
index.add_or_update_document(self.get_object())
return response
@@ -138,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
@cache_control(public=False, max_age=315360000)
def thumb(self, request, pk=None):
try:
return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
content_type='image/png')
except FileNotFoundError:
raise Http404("Document thumbnail does not exist")
@@ -230,5 +254,6 @@ class StatisticsView(APIView):
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
'documents_inbox': Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
})