	code cleanup
@@ -104,9 +104,11 @@ class Consumer(LoggingMixin):
 
         parser_class = get_parser_class_for_mime_type(mime_type)
         if not parser_class:
-            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
+            raise ConsumerError(f"No parsers abvailable for {self.filename}")
         else:
-            self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
+            self.log("debug",
+                     f"Parser: {parser_class.__name__} "
+                     f"based on mime type {mime_type}")
 
         # Notify all listeners that we're going to do some work.
 
@@ -126,7 +128,7 @@ class Consumer(LoggingMixin):
         # Parse the document. This may take some time.
 
         try:
-            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
             thumbnail = document_parser.get_optimised_thumbnail()
             self.log("debug", "Parsing {}...".format(self.filename))
             text = document_parser.get_text()
@@ -244,10 +246,12 @@ class Consumer(LoggingMixin):
             document.title = self.override_title
 
         if self.override_correspondent_id:
-            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+            document.correspondent = Correspondent.objects.get(
+                pk=self.override_correspondent_id)
 
         if self.override_document_type_id:
-            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+            document.document_type = DocumentType.objects.get(
+                pk=self.override_document_type_id)
 
         if self.override_tag_ids:
             for tag_id in self.override_tag_ids:

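Most hunks in this commit follow the same two moves seen above: `str.format` calls become f-strings, and over-long lines are wrapped by splitting the message into adjacent f-string literals (the pre-existing "abvailable" typo and the remaining `"Parsing {}..."` format call are carried over unchanged). Adjacent string literals concatenate at compile time, so the trailing space inside the first literal is load-bearing. A minimal standalone sketch with made-up values:

    # Adjacent literals are joined at compile time; without the trailing
    # space in the first f-string the words would run together.
    parser_name = "RasterisedDocumentParser"  # hypothetical value
    mime_type = "application/pdf"             # hypothetical value
    message = (f"Parser: {parser_name} "
               f"based on mime type {mime_type}")
    assert message == ("Parser: RasterisedDocumentParser "
                       "based on mime type application/pdf")
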
@@ -87,7 +87,9 @@ def generate_filename(document):
                 tags=tags,
             )
     except (ValueError, KeyError, IndexError):
-        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
+        logging.getLogger(__name__).warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
 
     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:

@@ -46,9 +46,14 @@ class UploadForm(forms.Form):
 
         os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
 
-        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
+        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
+                                         dir=settings.SCRATCH_DIR,
+                                         delete=False) as f:
 
             f.write(data)
             os.utime(f.name, times=(t, t))
 
-            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
+            async_task("documents.tasks.consume_file",
+                       f.name,
+                       override_filename=original_filename,
+                       task_name=os.path.basename(original_filename))

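Two standard-library details carry this upload path: `delete=False` keeps the temporary file on disk after the `with` block so the queued consumer task can still read it, and `os.utime` stamps the upload time onto the file itself. A stripped-down sketch without the Django form machinery (payload and prefix are stand-ins):

    import os
    import tempfile
    import time

    data = b"%PDF-1.4 ..."  # hypothetical upload payload
    t = time.time()

    with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                     delete=False) as f:
        f.write(data)
        # Set atime and mtime so downstream code can recover the upload
        # time from the file itself.
        os.utime(f.name, times=(t, t))

    # The file still exists here because delete=False; the consumer task
    # is expected to clean it up.
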
@@ -120,6 +120,7 @@ def query_page(ix, query, page):
 def autocomplete(ix, term, limit=10):
     with ix.reader() as reader:
         terms = []
-        for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
+        for (score, t) in reader.most_distinctive_terms(
+                "content", number=limit, prefix=term.lower()):
             terms.append(t)
         return terms

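Beyond the wrap, this hunk switches the Whoosh call to keyword arguments, which documents what `limit` and `term.lower()` actually are; Whoosh's `most_distinctive_terms(fieldname, number=5, prefix='')` yields `(score, term)` pairs. A hedged usage sketch, assuming an existing index directory whose schema has a "content" field:

    from whoosh.index import open_dir

    ix = open_dir("indexdir")  # hypothetical index location
    with ix.reader() as reader:
        suggestions = [term for _score, term in
                       reader.most_distinctive_terms("content",
                                                     number=10,
                                                     prefix="inv")]
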
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
     def _consume(self, file):
         if os.path.isfile(file):
             try:
-                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
+                async_task("documents.tasks.consume_file",
+                           file,
+                           task_name=os.path.basename(file))
             except Exception as e:
                 # Catch all so that the consumer won't crash.
-                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
+                logging.getLogger(__name__).error(
+                    "Error while consuming document: {}".format(e))
 
     def on_created(self, event):
         self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
         # Consume all files as this is not done initially by the watchdog
         for entry in os.scandir(directory):
             if entry.is_file():
-                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
+                async_task("documents.tasks.consume_file",
+                           entry.path,
+                           task_name=os.path.basename(entry.path))
 
         # Start the watchdog. Woof!
         if settings.CONSUMER_POLLING > 0:
-            logging.getLogger(__name__).info('Using polling instead of file'
-                                             'system notifications.')
+            logging.getLogger(__name__).info(
+                "Using polling instead of file system notifications.")
             observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
         else:
             observer = Observer()

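The observer selection at the bottom is the interesting bit: watchdog's `PollingObserver` re-scans the directory on a timer (necessary on network mounts, where inotify events never arrive), while the default `Observer` uses native filesystem notifications. A minimal sketch of the same choice; the interval is a made-up stand-in for `CONSUMER_POLLING`:

    from watchdog.observers import Observer
    from watchdog.observers.polling import PollingObserver

    polling_interval = 10  # hypothetical CONSUMER_POLLING value

    if polling_interval > 0:
        # Re-scan the watched directory every polling_interval seconds.
        observer = PollingObserver(timeout=polling_interval)
    else:
        # Use inotify-style native notifications.
        observer = Observer()
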
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):
 
             document = document_map[document_dict["pk"]]
 
-            unique_filename = "{:07}_{}".format(document.pk, document.file_name)
+            unique_filename = f"{document.pk:07}_{document.file_name}"
 
             file_target = os.path.join(self.target, unique_filename)
 
@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
             document_dict[EXPORTER_FILE_NAME] = unique_filename
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
 
-            print("Exporting: {}".format(file_target))
+            print(f"Exporting: {file_target}")
 
             t = int(time.mktime(document.created.timetuple()))
             if document.storage_type == Document.STORAGE_TYPE_GPG:

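A detail worth knowing when doing this kind of conversion: format specifications survive the move into f-strings unchanged, so `"{:07}".format(document.pk)` becomes `f"{document.pk:07}"`. A two-line check with made-up values:

    pk, file_name = 123, "scan.pdf"  # hypothetical values
    assert f"{pk:07}_{file_name}" == "0000123_scan.pdf"
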
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
                         encrypted.write(GnuPG.encrypted(unencrypted))
 
             else:
-                print("Moving {} to {}".format(document_path, document.source_path))
+                print(f"Moving {document_path} to {document.source_path}")
                 shutil.copy(document_path, document.source_path)
                 shutil.copy(thumbnail_path, document.thumbnail_path)
 

@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
         try:
             classifier.reload()
         except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
+            logging.getLogger(__name__).warning(
+                f"Cannot classify documents: {e}.")
             classifier = None
 
         for document in documents:
             logging.getLogger(__name__).info(
-                "Processing document {}".format(document.title)
-            )
+                f"Processing document {document.title}")
 
             if options['correspondent']:
                 set_correspondent(

@@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag
 
 
 def match_correspondents(document_content, classifier):
-    correspondents = Correspondent.objects.all()
-    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_correspondent(document_content)
+    else:
+        pred_id = None
 
-    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
+    correspondents = Correspondent.objects.all()
+    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
 
 
 def match_document_types(document_content, classifier):
-    document_types = DocumentType.objects.all()
-    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_document_type(document_content)
+    else:
+        pred_id = None
 
-    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
+    document_types = DocumentType.objects.all()
+    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
 
 
 def match_tags(document_content, classifier):

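The rewrite here trades a conditional expression for an explicit if/else; the two forms are equivalent, the block form just survives line wrapping better. A self-contained sketch with a stub classifier standing in for the real one:

    class StubClassifier:  # hypothetical stand-in
        def predict_correspondent(self, content):
            return 42

    classifier = StubClassifier()
    content = "some document text"

    # Old style: conditional expression.
    pred_id = (classifier.predict_correspondent(content)
               if classifier else None)

    # New style: explicit branch.
    if classifier:
        pred_id2 = classifier.predict_correspondent(content)
    else:
        pred_id2 = None

    assert pred_id == pred_id2 == 42
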
@@ -73,7 +73,18 @@ def get_parser_class(path):
     return get_parser_class_for_mime_type(mime_type)
 
 
-def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+def run_convert(input_file,
+                output_file,
+                density=None,
+                scale=None,
+                alpha=None,
+                strip=False,
+                trim=False,
+                type=None,
+                depth=None,
+                extra=None,
+                logging_group=None):
+
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                     pnm_out)
 
-    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+    logger.debug(f"Execute: {' '.join(command_args)}",
+                 extra={'group': logging_group})
 
-    if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError("Unpaper failed at {}".format(command_args))
+    if not subprocess.Popen(command_args,
+                            stdout=subprocess.DEVNULL,
+                            stderr=subprocess.DEVNULL).wait() == 0:
+        raise ParseError(f"Unpaper failed at {command_args}")
 
     return pnm_out
 
@@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin):
         super().__init__()
         self.logging_group = logging_group
         self.document_path = path
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = tempfile.mkdtemp(
+            prefix="paperless-", dir=settings.SCRATCH_DIR)
 
     def get_thumbnail(self):
         """
@@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin):
         if settings.OPTIMIZE_THUMBNAILS:
             out_path = os.path.join(self.tempdir, "optipng.png")
 
-            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+            args = (settings.OPTIPNG_BINARY,
+                    "-silent", "-o5", in_path, "-out", out_path)
 
-            self.log('debug', 'Execute: ' + " ".join(args))
+            self.log('debug', f"Execute: {' '.join(args)}")
 
             if not subprocess.Popen(args).wait() == 0:
                 raise ParseError("Optipng failed at {}".format(args))

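Both subprocess call sites keep the slightly awkward `not ... == 0` spelling; `!= 0` would read better, and on modern Python the whole pattern collapses into `subprocess.run(..., check=True)`, which raises on a nonzero exit status. A sketch under that assumption (the arguments are made up):

    import subprocess

    command_args = ("unpaper", "--overwrite", "--quiet",
                    "in.pnm", "out.pnm")  # hypothetical arguments

    # run(check=True) raises CalledProcessError on failure, replacing
    # the manual Popen(...).wait() == 0 test.
    subprocess.run(command_args,
                   stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL,
                   check=True)
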
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
 
 class DocumentSerializer(serializers.ModelSerializer):
 
-    correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
+    correspondent_id = CorrespondentField(
+        allow_null=True, source='correspondent')
     tags_id = TagsField(many=True, source='tags')
-    document_type_id = DocumentTypeField(allow_null=True, source='document_type')
+    document_type_id = DocumentTypeField(
+        allow_null=True, source='document_type')
 
     class Meta:
         model = Document

@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
     document.tags.add(*inbox_tags)
 
 
-def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_correspondent(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.correspondent and not replace:
         return
 
-    potential_correspondents = matching.match_correspondents(document.content, classifier)
+    potential_correspondents = matching.match_correspondents(document.content,
+                                                             classifier)
 
     potential_count = len(potential_correspondents)
     if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
         selected = None
     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential correspondents, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential correspondents, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential correspondents, not assigning any correspondent"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential correspondents, "
+                f"not assigning any correspondent",
                 logging_group
            )
            return
 
     if selected or replace:
         logger(
-            'Assigning correspondent "{}" to "{}" '.format(selected, document),
+            f"Assigning correspondent {selected} to {document}",
             logging_group
         )
 
@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
         document.save(update_fields=("correspondent",))
 
 
-def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_document_type(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.document_type and not replace:
         return
 
-    potential_document_type = matching.match_document_types(document.content, classifier)
+    potential_document_type = matching.match_document_types(document.content,
+                                                            classifier)
 
     potential_count = len(potential_document_type)
     if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
 
     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential document types, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential document types, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential document types, not assigning any document type"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential document types, "
+                f"not assigning any document type",
                 logging_group
             )
             return
 
     if selected or replace:
         logger(
-            'Assigning document type "{}" to "{}" '.format(selected, document),
+            f"Assigning document type {selected} to {document}",
             logging_group
         )
 
@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
         document.save(update_fields=("document_type",))
 
 
-def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+def set_tags(sender,
+             document=None,
+             logging_group=None,
+             classifier=None,
+             replace=False,
+             **kwargs):
     if replace:
         document.tags.clear()
         current_tags = set([])
     else:
         current_tags = set(document.tags.all())
 
-    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+    matched_tags = matching.match_tags(document.content, classifier)
+
+    relevant_tags = set(matched_tags) - current_tags
 
     if not relevant_tags:
         return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):
 
     if not os.path.isfile(old_path):
         # Can't do anything if the old file does not exist anymore.
-        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
+        logging.getLogger(__name__).fatal(
+            f"Document {str(instance)}: File {old_path} has gone.")
         return
 
     if os.path.isfile(new_path):
         # Can't do anything if the new file already exists. Skip updating file.
-        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
+        logging.getLogger(__name__).warning(
+            f"Document {str(instance)}: Cannot rename file "
+            f"since target path {new_path} already exists.")
         return
 
     create_source_path_directory(new_path)

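The `set_tags` change pulls the matcher call out of a dense one-liner; the underlying logic is plain set arithmetic, with only the tags that matched but are not already on the document getting added. Reduced to its core (tag names stand in for Tag objects):

    current_tags = {"inbox"}                     # hypothetical existing tags
    matched_tags = ["inbox", "invoice", "2020"]  # hypothetical matcher output

    relevant_tags = set(matched_tags) - current_tags
    assert relevant_tags == {"invoice", "2020"}
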
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
     def test_changed_password_check_no_encryption(self):
         DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
         self.assertEqual(changed_password_check(None), [])
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_gpg_encryption_with_good_password(self):
-        pass
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_fail(self):
-        pass

@@ -47,18 +47,30 @@ class IndexView(TemplateView):
 
 class CorrespondentViewSet(ModelViewSet):
     model = Correspondent
-    queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
+
+    queryset = Correspondent.objects.annotate(
+        document_count=Count('documents'),
+        last_correspondence=Max('documents__created')).order_by('name')
+
     serializer_class = CorrespondentSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
     filterset_class = CorrespondentFilterSet
-    ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
+    ordering_fields = (
+        "name",
+        "matching_algorithm",
+        "match",
+        "document_count",
+        "last_correspondence")
 
 
 class TagViewSet(ModelViewSet):
     model = Tag
-    queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = Tag.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = TagSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):
 
 class DocumentTypeViewSet(ModelViewSet):
     model = DocumentType
-    queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = DocumentType.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = DocumentTypeSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
     filterset_class = DocumentFilterSet
     search_fields = ("title", "correspondent__name", "content")
     ordering_fields = (
-        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+        "id",
+        "title",
+        "correspondent__name",
+        "document_type__name",
+        "created",
+        "modified",
+        "added",
+        "archive_serial_number")
 
     def update(self, request, *args, **kwargs):
-        response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+        response = super(DocumentViewSet, self).update(
+            request, *args, **kwargs)
         index.add_or_update_document(self.get_object())
         return response
 
@@ -138,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
     @cache_control(public=False, max_age=315360000)
     def thumb(self, request, pk=None):
         try:
-            return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
+            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
+                                content_type='image/png')
         except FileNotFoundError:
             raise Http404("Document thumbnail does not exist")
 
@@ -230,5 +254,6 @@ class StatisticsView(APIView):
     def get(self, request, format=None):
         return Response({
             'documents_total': Document.objects.all().count(),
-            'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
+            'documents_inbox': Document.objects.filter(
+                tags__is_inbox_tag=True).distinct().count()
         })

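The viewset querysets above push the counting into SQL: `annotate` with `Count`/`Max` produces one aggregated row per correspondent instead of a per-object query loop. The shape, assuming the `Correspondent` model with its `documents` reverse relation as in the code above:

    from django.db.models import Count, Max

    # One SQL query; each row carries the computed document_count and
    # the creation date of that correspondent's newest document.
    queryset = (Correspondent.objects
                .annotate(document_count=Count('documents'),
                          last_correspondence=Max('documents__created'))
                .order_by('name'))
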
@@ -174,8 +174,8 @@ class MailAccountHandler(LoggingMixin):
                     M.folder.set(rule.folder)
                 except MailboxFolderSelectError:
                     raise MailError(
-                        f"Rule {rule.name}: Folder {rule.folder} does not exist "
-                        f"in account {account.name}")
+                        f"Rule {rule.name}: Folder {rule.folder} "
+                        f"does not exist in account {account.name}")
 
                 criterias = make_criterias(rule)
 
@@ -185,7 +185,8 @@ class MailAccountHandler(LoggingMixin):
                     f"{str(AND(**criterias))}")
 
                 try:
-                    messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
+                    messages = M.fetch(criteria=AND(**criterias),
+                                       mark_seen=False)
                 except Exception:
                     raise MailError(
                         f"Rule {rule.name}: Error while fetching folder "
@@ -226,8 +227,8 @@ class MailAccountHandler(LoggingMixin):
 
                 except Exception:
                     raise MailError(
-                        f"Rule {rule.name}: Error while processing post-consume "
-                        f"actions for account {account.name}")
+                        f"Rule {rule.name}: Error while processing "
+                        f"post-consume actions for account {account.name}")
 
         return total_processed_files
 
@@ -266,7 +267,8 @@ class MailAccountHandler(LoggingMixin):
             if is_mime_type_supported(mime_type):
 
                 os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
+                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-",
+                                                    dir=settings.SCRATCH_DIR)
                 with open(temp_filename, 'wb') as f:
                     f.write(att.payload)

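One wart the reformatting preserves: `tempfile.mkstemp` returns an already-open OS file descriptor alongside the path, and binding it to `_` leaks that descriptor when the file is then reopened by name. A sketch of a leak-free variant (the payload is a stand-in for `att.payload`):

    import os
    import tempfile

    fd, temp_filename = tempfile.mkstemp(prefix="paperless-mail-")
    # Wrap the descriptor we already own instead of opening the path a
    # second time; fdopen closes fd when the with block exits.
    with os.fdopen(fd, "wb") as f:
        f.write(b"attachment payload")  # hypothetical att.payload
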
@@ -66,10 +66,14 @@ class MailRule(models.Model):
     CORRESPONDENT_FROM_CUSTOM = 4
 
     CORRESPONDENT_SELECTOR = (
-        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
-        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
-        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
-        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
+        (CORRESPONDENT_FROM_NOTHING,
+         "Do not assign a correspondent"),
+        (CORRESPONDENT_FROM_EMAIL,
+         "Use mail address"),
+        (CORRESPONDENT_FROM_NAME,
+         "Use name (or mail address if not available)"),
+        (CORRESPONDENT_FROM_CUSTOM,
+         "Use correspondent selected below")
     )
 
     name = models.CharField(max_length=256, unique=True)

@@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount
 def process_mail_accounts():
     total_new_documents = 0
     for account in MailAccount.objects.all():
-        total_new_documents += MailAccountHandler().handle_mail_account(account)
+        total_new_documents += MailAccountHandler().handle_mail_account(
+            account)
 
     if total_new_documents > 0:
         return f"Added {total_new_documents} document(s)."

@@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser):
         except ParseError:
             # if convert fails, fall back to extracting
             # the first PDF page as a PNG using Ghostscript
-            self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
+            self.log(
+                'warning',
+                "Thumbnail generation with ImageMagick failed, falling back "
+                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
             gs_out_path = os.path.join(self.tempdir, "gs_out.png")
             cmd = [settings.GS_BINARY,
                    "-q",
@@ -98,24 +101,38 @@ class RasterisedDocumentParser(DocumentParser):
         try:
 
             sample_page_index = int(len(images) / 2)
-            self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
-            sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
+            self.log(
+                "debug",
+                f"Attempting language detection on page "
+                f"{sample_page_index + 1} of {len(images)}...")
+
+            sample_page_text = self._ocr([images[sample_page_index]],
+                                         settings.OCR_LANGUAGE)[0]
             guessed_language = self._guess_language(sample_page_text)
 
             if not guessed_language or guessed_language not in ISO639:
                 self.log("warning", "Language detection failed.")
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
-                self.log("debug", "Detected language: {} (default language)".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "debug",
+                    f"Detected language: {guessed_language} "
+                    f"(default language)")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
-                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "warning",
+                    f"Detected language {guessed_language} is not available "
+                    f"on this system.")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             else:
-                self.log("debug", "Detected language: {}".format(guessed_language))
+                self.log("debug", f"Detected language: {guessed_language}")
                 ocr_pages = self._ocr(images, ISO639[guessed_language])
 
             self.log("debug", "OCR completed.")
@@ -130,7 +147,9 @@ class RasterisedDocumentParser(DocumentParser):
         Greyscale images are easier for Tesseract to OCR
         """
 
-        self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
+        self.log(
+            "debug",
+            f"Converting document {self.document_path} into greyscale images")
 
         # Convert PDF to multiple PNMs
         pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -148,7 +167,7 @@ class RasterisedDocumentParser(DocumentParser):
             if f.endswith(".pnm"):
                 pnms.append(os.path.join(self.tempdir, f))
 
-        self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
+        self.log("debug", f"Running unpaper on {len(pnms)} pages...")
 
         # Run unpaper in parallel on converted images
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@@ -161,26 +180,25 @@ class RasterisedDocumentParser(DocumentParser):
             guess = langdetect.detect(text)
             return guess
         except Exception as e:
-            self.log('warning', "Language detection failed with: {}".format(e))
+            self.log('warning', f"Language detection failed with: {e}")
             return None
 
     def _ocr(self, imgs, lang):
-        self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
+        self.log(
+            "debug",
+            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
             r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             return r
 
-    def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
-        """
-        Given a `middle` value and the text that middle page represents, we OCR
-        the remainder of the document and return the whole thing.
-        """
-        # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
-        # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
+    def _complete_ocr_default_language(self,
+                                       images,
+                                       sample_page_index,
+                                       sample_page):
         images_copy = list(images)
         del images_copy[sample_page_index]
        if images_copy:
-            self.log('debug', 'Continuing ocr with default language.')
+            self.log('debug', "Continuing ocr with default language.")
             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
             ocr_pages.insert(sample_page_index, sample_page)
             return ocr_pages

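`_ocr` fans the OCR work out over a thread pool: `itertools.product(imgs, [lang])` pairs every image with the single language, and `pool.map` keeps the results in page order. A runnable sketch with a stub in place of the pyocr-backed `image_to_string`:

    import itertools
    from multiprocessing.pool import ThreadPool

    def image_to_string(args):  # hypothetical stub for the pyocr wrapper
        img, lang = args
        return f"<text of {img} in {lang}>"

    imgs = ["page-0001.pnm", "page-0002.pnm"]
    with ThreadPool(processes=4) as pool:
        # map() preserves input order, so pages come back in sequence.
        pages = pool.map(image_to_string, itertools.product(imgs, ["eng"]))

    assert len(pages) == 2
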