diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 175f6710f..65febc937 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -104,9 +104,11 @@ class Consumer(LoggingMixin): parser_class = get_parser_class_for_mime_type(mime_type) if not parser_class: - raise ConsumerError("No parsers abvailable for {}".format(self.filename)) + raise ConsumerError(f"No parsers available for {self.filename}") else: - self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type)) + self.log("debug", + f"Parser: {parser_class.__name__} " + f"based on mime type {mime_type}") # Notify all listeners that we're going to do some work. @@ -126,7 +128,7 @@ class Consumer(LoggingMixin): # Parse the document. This may take some time. try: - self.log("debug", "Generating thumbnail for {}...".format(self.filename)) + self.log("debug", f"Generating thumbnail for {self.filename}...") thumbnail = document_parser.get_optimised_thumbnail() self.log("debug", "Parsing {}...".format(self.filename)) text = document_parser.get_text() @@ -244,10 +246,12 @@ class Consumer(LoggingMixin): document.title = self.override_title if self.override_correspondent_id: - document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id) + document.correspondent = Correspondent.objects.get( + pk=self.override_correspondent_id) if self.override_document_type_id: - document.document_type = DocumentType.objects.get(pk=self.override_document_type_id) + document.document_type = DocumentType.objects.get( + pk=self.override_document_type_id) if self.override_tag_ids: for tag_id in self.override_tag_ids: diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index 06d4d2957..cd47406b6 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -87,7 +87,9 @@ def generate_filename(document): tags=tags, ) except (ValueError, KeyError, IndexError): - 
logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT)) + logging.getLogger(__name__).warning( + f"Invalid PAPERLESS_FILENAME_FORMAT: " + f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default") # Always append the primary key to guarantee uniqueness of filename if len(path) > 0: diff --git a/src/documents/forms.py b/src/documents/forms.py index f44090164..0471a8312 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -46,9 +46,14 @@ class UploadForm(forms.Form): os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f: + with tempfile.NamedTemporaryFile(prefix="paperless-upload-", + dir=settings.SCRATCH_DIR, + delete=False) as f: f.write(data) os.utime(f.name, times=(t, t)) - async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename)) + async_task("documents.tasks.consume_file", + f.name, + override_filename=original_filename, + task_name=os.path.basename(original_filename)) diff --git a/src/documents/index.py b/src/documents/index.py index ad3a50010..cf312cbcc 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -120,6 +120,7 @@ def query_page(ix, query, page): def autocomplete(ix, term, limit=10): with ix.reader() as reader: terms = [] - for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()): + for (score, t) in reader.most_distinctive_terms( + "content", number=limit, prefix=term.lower()): terms.append(t) return terms diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 2b8ac7100..70c36a03c 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -19,10 +19,13 @@ class 
Handler(FileSystemEventHandler): def _consume(self, file): if os.path.isfile(file): try: - async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file)) + async_task("documents.tasks.consume_file", + file, + task_name=os.path.basename(file)) except Exception as e: # Catch all so that the consumer won't crash. - logging.getLogger(__name__).error("Error while consuming document: {}".format(e)) + logging.getLogger(__name__).error( + "Error while consuming document: {}".format(e)) def on_created(self, event): self._consume(event.src_path) @@ -66,12 +69,14 @@ class Command(BaseCommand): # Consume all files as this is not done initially by the watchdog for entry in os.scandir(directory): if entry.is_file(): - async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path)) + async_task("documents.tasks.consume_file", + entry.path, + task_name=os.path.basename(entry.path)) # Start the watchdog. Woof! if settings.CONSUMER_POLLING > 0: - logging.getLogger(__name__).info('Using polling instead of file' - 'system notifications.') + logging.getLogger(__name__).info( + "Using polling instead of file system notifications.") observer = PollingObserver(timeout=settings.CONSUMER_POLLING) else: observer = Observer() diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 441f1c475..f86462119 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand): document = document_map[document_dict["pk"]] - unique_filename = "{:07}_{}".format(document.pk, document.file_name) + unique_filename = f"{document.pk:07}_{document.file_name}" file_target = os.path.join(self.target, unique_filename) @@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand): document_dict[EXPORTER_FILE_NAME] = unique_filename document_dict[EXPORTER_THUMBNAIL_NAME] = 
thumbnail_name - print("Exporting: {}".format(file_target)) + print(f"Exporting: {file_target}") t = int(time.mktime(document.created.timetuple())) if document.storage_type == Document.STORAGE_TYPE_GPG: diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index da9086144..208a0ef37 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand): encrypted.write(GnuPG.encrypted(unencrypted)) else: - print("Moving {} to {}".format(document_path, document.source_path)) + print(f"Moving {document_path} to {document.source_path}") shutil.copy(document_path, document.source_path) shutil.copy(thumbnail_path, document.thumbnail_path) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index e48b8802c..cf014dc6f 100755 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand): try: classifier.reload() except (FileNotFoundError, IncompatibleClassifierVersionError) as e: - logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e)) + logging.getLogger(__name__).warning( + f"Cannot classify documents: {e}.") classifier = None for document in documents: logging.getLogger(__name__).info( - "Processing document {}".format(document.title) - ) + f"Processing document {document.title}") if options['correspondent']: set_correspondent( diff --git a/src/documents/matching.py b/src/documents/matching.py index e5789ab2e..ae1a9a9cf 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag def match_correspondents(document_content, classifier): - correspondents = 
Correspondent.objects.all() - predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None + if classifier: + pred_id = classifier.predict_correspondent(document_content) + else: + pred_id = None - return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id] + correspondents = Correspondent.objects.all() + return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id] def match_document_types(document_content, classifier): - document_types = DocumentType.objects.all() - predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None + if classifier: + pred_id = classifier.predict_document_type(document_content) + else: + pred_id = None - return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id] + document_types = DocumentType.objects.all() + return [o for o in document_types if matches(o, document_content) or o.pk == pred_id] def match_tags(document_content, classifier): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 98f4c5b12..eb8ccf45e 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -73,7 +73,18 @@ def get_parser_class(path): return get_parser_class_for_mime_type(mime_type) -def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): +def run_convert(input_file, + output_file, + density=None, + scale=None, + alpha=None, + strip=False, + trim=False, + type=None, + depth=None, + extra=None, + logging_group=None): + environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT @@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None): command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, pnm_out) - logger.debug("Execute: " + " 
".join(command_args), extra={'group': logging_group}) + logger.debug(f"Execute: {' '.join(command_args)}", + extra={'group': logging_group}) - if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0: - raise ParseError("Unpaper failed at {}".format(command_args)) + if not subprocess.Popen(command_args, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL).wait() == 0: + raise ParseError(f"Unpaper failed at {command_args}") return pnm_out @@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin): super().__init__() self.logging_group = logging_group self.document_path = path - self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) + self.tempdir = tempfile.mkdtemp( + prefix="paperless-", dir=settings.SCRATCH_DIR) def get_thumbnail(self): """ @@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin): if settings.OPTIMIZE_THUMBNAILS: out_path = os.path.join(self.tempdir, "optipng.png") - args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path) + args = (settings.OPTIPNG_BINARY, + "-silent", "-o5", in_path, "-out", out_path) - self.log('debug', 'Execute: ' + " ".join(args)) + self.log('debug', f"Execute: {' '.join(args)}") if not subprocess.Popen(args).wait() == 0: raise ParseError("Optipng failed at {}".format(args)) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index cf48e8bd7..e0ad73a23 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField): class DocumentSerializer(serializers.ModelSerializer): - correspondent_id = CorrespondentField(allow_null=True, source='correspondent') + correspondent_id = CorrespondentField( + allow_null=True, source='correspondent') tags_id = TagsField(many=True, source='tags') - document_type_id = DocumentTypeField(allow_null=True, source='document_type') + document_type_id = DocumentTypeField( + allow_null=True, 
source='document_type') class Meta: model = Document diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 671cdb104..f83f88783 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs): document.tags.add(*inbox_tags) -def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs): +def set_correspondent(sender, + document=None, + logging_group=None, + classifier=None, + replace=False, + use_first=True, + **kwargs): if document.correspondent and not replace: return - potential_correspondents = matching.match_correspondents(document.content, classifier) + potential_correspondents = matching.match_correspondents(document.content, + classifier) potential_count = len(potential_correspondents) if potential_correspondents: @@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None selected = None if potential_count > 1: if use_first: - message = "Detected {} potential correspondents, so we've opted for {}" logger( - message.format(potential_count, selected), + f"Detected {potential_count} potential correspondents, " + f"so we've opted for {selected}", logging_group ) else: - message = "Detected {} potential correspondents, not assigning any correspondent" logger( - message.format(potential_count), + f"Detected {potential_count} potential correspondents, " + f"not assigning any correspondent", logging_group ) return if selected or replace: logger( - 'Assigning correspondent "{}" to "{}" '.format(selected, document), + f"Assigning correspondent {selected} to {document}", logging_group ) @@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None document.save(update_fields=("correspondent",)) -def set_document_type(sender, document=None, logging_group=None, classifier=None, 
replace=False, use_first=True, **kwargs): +def set_document_type(sender, + document=None, + logging_group=None, + classifier=None, + replace=False, + use_first=True, + **kwargs): if document.document_type and not replace: return - potential_document_type = matching.match_document_types(document.content, classifier) + potential_document_type = matching.match_document_types(document.content, + classifier) potential_count = len(potential_document_type) if potential_document_type: @@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None if potential_count > 1: if use_first: - message = "Detected {} potential document types, so we've opted for {}" logger( - message.format(potential_count, selected), + f"Detected {potential_count} potential document types, " + f"so we've opted for {selected}", logging_group ) else: - message = "Detected {} potential document types, not assigning any document type" logger( - message.format(potential_count), + f"Detected {potential_count} potential document types, " + f"not assigning any document type", logging_group ) return if selected or replace: logger( - 'Assigning document type "{}" to "{}" '.format(selected, document), + f"Assigning document type {selected} to {document}", logging_group ) @@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None document.save(update_fields=("document_type",)) -def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs): +def set_tags(sender, + document=None, + logging_group=None, + classifier=None, + replace=False, + **kwargs): if replace: document.tags.clear() current_tags = set([]) else: current_tags = set(document.tags.all()) - relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags + matched_tags = matching.match_tags(document.content, classifier) + + relevant_tags = set(matched_tags) - current_tags if not relevant_tags: return @@ -180,12 
+201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs): if not os.path.isfile(old_path): # Can't do anything if the old file does not exist anymore. - logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path)) + logging.getLogger(__name__).fatal( + f"Document {str(instance)}: File {old_path} has gone.") return if os.path.isfile(new_path): # Can't do anything if the new file already exists. Skip updating file. - logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path)) + logging.getLogger(__name__).warning( + f"Document {str(instance)}: Cannot rename file " + f"since target path {new_path} already exists.") return create_source_path_directory(new_path) diff --git a/src/documents/tests/test_checks.py b/src/documents/tests/test_checks.py index d316f94b5..1027c11a0 100644 --- a/src/documents/tests/test_checks.py +++ b/src/documents/tests/test_checks.py @@ -15,11 +15,3 @@ class ChecksTestCase(TestCase): def test_changed_password_check_no_encryption(self): DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED) self.assertEqual(changed_password_check(None), []) - - @unittest.skip("I don't know how to test this") - def test_changed_password_check_gpg_encryption_with_good_password(self): - pass - - @unittest.skip("I don't know how to test this") - def test_changed_password_check_fail(self): - pass diff --git a/src/documents/views.py b/src/documents/views.py index 89d03a4df..14323e933 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -47,18 +47,30 @@ class IndexView(TemplateView): class CorrespondentViewSet(ModelViewSet): model = Correspondent - queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name') + + queryset = Correspondent.objects.annotate( + document_count=Count('documents'), + 
last_correspondence=Max('documents__created')).order_by('name') + serializer_class = CorrespondentSerializer pagination_class = StandardPagination permission_classes = (IsAuthenticated,) filter_backends = (DjangoFilterBackend, OrderingFilter) filterset_class = CorrespondentFilterSet - ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence") + ordering_fields = ( + "name", + "matching_algorithm", + "match", + "document_count", + "last_correspondence") class TagViewSet(ModelViewSet): model = Tag - queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name') + + queryset = Tag.objects.annotate( + document_count=Count('documents')).order_by('name') + serializer_class = TagSerializer pagination_class = StandardPagination permission_classes = (IsAuthenticated,) @@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet): class DocumentTypeViewSet(ModelViewSet): model = DocumentType - queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name') + + queryset = DocumentType.objects.annotate( + document_count=Count('documents')).order_by('name') + serializer_class = DocumentTypeSerializer pagination_class = StandardPagination permission_classes = (IsAuthenticated,) @@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin, filterset_class = DocumentFilterSet search_fields = ("title", "correspondent__name", "content") ordering_fields = ( - "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number") + "id", + "title", + "correspondent__name", + "document_type__name", + "created", + "modified", + "added", + "archive_serial_number") def update(self, request, *args, **kwargs): - response = super(DocumentViewSet, self).update(request, *args, **kwargs) + response = super(DocumentViewSet, self).update( + request, *args, **kwargs) index.add_or_update_document(self.get_object()) return response @@ -138,7 +161,8 @@ class 
DocumentViewSet(RetrieveModelMixin, @cache_control(public=False, max_age=315360000) def thumb(self, request, pk=None): try: - return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png') + return HttpResponse(Document.objects.get(id=pk).thumbnail_file, + content_type='image/png') except FileNotFoundError: raise Http404("Document thumbnail does not exist") @@ -230,5 +254,6 @@ class StatisticsView(APIView): def get(self, request, format=None): return Response({ 'documents_total': Document.objects.all().count(), - 'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count() + 'documents_inbox': Document.objects.filter( + tags__is_inbox_tag=True).distinct().count() }) diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 6db5e9070..03f915769 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -174,8 +174,8 @@ class MailAccountHandler(LoggingMixin): M.folder.set(rule.folder) except MailboxFolderSelectError: raise MailError( - f"Rule {rule.name}: Folder {rule.folder} does not exist " - f"in account {account.name}") + f"Rule {rule.name}: Folder {rule.folder} " + f"does not exist in account {account.name}") criterias = make_criterias(rule) @@ -185,7 +185,8 @@ class MailAccountHandler(LoggingMixin): f"{str(AND(**criterias))}") try: - messages = M.fetch(criteria=AND(**criterias), mark_seen=False) + messages = M.fetch(criteria=AND(**criterias), + mark_seen=False) except Exception: raise MailError( f"Rule {rule.name}: Error while fetching folder " @@ -226,8 +227,8 @@ class MailAccountHandler(LoggingMixin): except Exception: raise MailError( - f"Rule {rule.name}: Error while processing post-consume " - f"actions for account {account.name}") + f"Rule {rule.name}: Error while processing " + f"post-consume actions for account {account.name}") return total_processed_files @@ -266,7 +267,8 @@ class MailAccountHandler(LoggingMixin): if is_mime_type_supported(mime_type): 
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) + _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", + dir=settings.SCRATCH_DIR) with open(temp_filename, 'wb') as f: f.write(att.payload) diff --git a/src/paperless_mail/models.py b/src/paperless_mail/models.py index e37fbee16..14da202fa 100644 --- a/src/paperless_mail/models.py +++ b/src/paperless_mail/models.py @@ -66,10 +66,14 @@ class MailRule(models.Model): CORRESPONDENT_FROM_CUSTOM = 4 CORRESPONDENT_SELECTOR = ( - (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"), - (CORRESPONDENT_FROM_EMAIL, "Use mail address"), - (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"), - (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below") + (CORRESPONDENT_FROM_NOTHING, + "Do not assign a correspondent"), + (CORRESPONDENT_FROM_EMAIL, + "Use mail address"), + (CORRESPONDENT_FROM_NAME, + "Use name (or mail address if not available)"), + (CORRESPONDENT_FROM_CUSTOM, + "Use correspondent selected below") ) name = models.CharField(max_length=256, unique=True) diff --git a/src/paperless_mail/tasks.py b/src/paperless_mail/tasks.py index 22d512c1e..e75711dce 100644 --- a/src/paperless_mail/tasks.py +++ b/src/paperless_mail/tasks.py @@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount def process_mail_accounts(): total_new_documents = 0 for account in MailAccount.objects.all(): - total_new_documents += MailAccountHandler().handle_mail_account(account) + total_new_documents += MailAccountHandler().handle_mail_account( + account) if total_new_documents > 0: return f"Added {total_new_documents} document(s)." 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 73b2414d5..d0ce01327 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser): except ParseError: # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript - self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!') + self.log( + 'warning', + "Thumbnail generation with ImageMagick failed, falling back " + "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!") gs_out_path = os.path.join(self.tempdir, "gs_out.png") cmd = [settings.GS_BINARY, "-q", @@ -98,24 +101,38 @@ class RasterisedDocumentParser(DocumentParser): try: sample_page_index = int(len(images) / 2) - self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images))) - sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] + self.log( + "debug", + f"Attempting language detection on page " + f"{sample_page_index + 1} of {len(images)}...") + + sample_page_text = self._ocr([images[sample_page_index]], + settings.OCR_LANGUAGE)[0] guessed_language = self._guess_language(sample_page_text) if not guessed_language or guessed_language not in ISO639: self.log("warning", "Language detection failed.") - ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + ocr_pages = self._complete_ocr_default_language( + images, sample_page_index, sample_page_text) elif ISO639[guessed_language] == settings.OCR_LANGUAGE: - self.log("debug", "Detected language: {} (default language)".format(guessed_language)) - ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + self.log( + "debug", + f"Detected language: {guessed_language} " + f"(default language)") + ocr_pages = 
self._complete_ocr_default_language( + images, sample_page_index, sample_page_text) elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): - self.log("warning", "Detected language {} is not available on this system.".format(guessed_language)) - ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + self.log( + "warning", + f"Detected language {guessed_language} is not available " + f"on this system.") + ocr_pages = self._complete_ocr_default_language( + images, sample_page_index, sample_page_text) else: - self.log("debug", "Detected language: {}".format(guessed_language)) + self.log("debug", f"Detected language: {guessed_language}") ocr_pages = self._ocr(images, ISO639[guessed_language]) self.log("debug", "OCR completed.") @@ -130,7 +147,9 @@ class RasterisedDocumentParser(DocumentParser): Greyscale images are easier for Tesseract to OCR """ - self.log("debug", "Converting document {} into greyscale images...".format(self.document_path)) + self.log( + "debug", + f"Converting document {self.document_path} into greyscale images") # Convert PDF to multiple PNMs pnm = os.path.join(self.tempdir, "convert-%04d.pnm") @@ -148,7 +167,7 @@ class RasterisedDocumentParser(DocumentParser): if f.endswith(".pnm"): pnms.append(os.path.join(self.tempdir, f)) - self.log("debug", "Running unpaper on {} pages...".format(len(pnms))) + self.log("debug", f"Running unpaper on {len(pnms)} pages...") # Run unpaper in parallel on converted images with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: @@ -161,26 +180,25 @@ class RasterisedDocumentParser(DocumentParser): guess = langdetect.detect(text) return guess except Exception as e: - self.log('warning', "Language detection failed with: {}".format(e)) + self.log('warning', f"Language detection failed with: {e}") return None def _ocr(self, imgs, lang): - self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) + 
self.log( + "debug", + f"Performing OCR on {len(imgs)} page(s) with language {lang}") with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: r = pool.map(image_to_string, itertools.product(imgs, [lang])) return r - def _complete_ocr_default_language(self, images, sample_page_index, sample_page): - """ - Given a `middle` value and the text that middle page represents, we OCR - the remainder of the document and return the whole thing. - """ - # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text - # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) + def _complete_ocr_default_language(self, + images, + sample_page_index, + sample_page): images_copy = list(images) del images_copy[sample_page_index] if images_copy: - self.log('debug', 'Continuing ocr with default language.') + self.log('debug', "Continuing ocr with default language.") ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) ocr_pages.insert(sample_page_index, sample_page) return ocr_pages