	code cleanup
@@ -104,9 +104,11 @@ class Consumer(LoggingMixin):
 
         parser_class = get_parser_class_for_mime_type(mime_type)
         if not parser_class:
-            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
+            raise ConsumerError(f"No parsers abvailable for {self.filename}")
         else:
-            self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
+            self.log("debug",
+                     f"Parser: {parser_class.__name__} "
+                     f"based on mime type {mime_type}")
 
         # Notify all listeners that we're going to do some work.
 
@@ -126,7 +128,7 @@ class Consumer(LoggingMixin):
         # Parse the document. This may take some time.
 
         try:
-            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
             thumbnail = document_parser.get_optimised_thumbnail()
             self.log("debug", "Parsing {}...".format(self.filename))
             text = document_parser.get_text()
@@ -244,10 +246,12 @@ class Consumer(LoggingMixin):
             document.title = self.override_title
 
         if self.override_correspondent_id:
-            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+            document.correspondent = Correspondent.objects.get(
+                pk=self.override_correspondent_id)
 
         if self.override_document_type_id:
-            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+            document.document_type = DocumentType.objects.get(
+                pk=self.override_document_type_id)
 
         if self.override_tag_ids:
             for tag_id in self.override_tag_ids:

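Most hunks in this commit follow the same two moves seen above: `str.format` calls become f-strings, and over-long lines are wrapped by splitting the message into adjacent f-string literals (the pre-existing "abvailable" typo and the remaining `"Parsing {}..."` format call are carried over unchanged). Adjacent string literals concatenate at compile time, so the trailing space inside the first literal is load-bearing. A minimal standalone sketch with made-up values:

    # Adjacent literals are joined at compile time; without the trailing
    # space in the first f-string the words would run together.
    parser_name = "RasterisedDocumentParser"  # hypothetical value
    mime_type = "application/pdf"             # hypothetical value
    message = (f"Parser: {parser_name} "
               f"based on mime type {mime_type}")
    assert message == ("Parser: RasterisedDocumentParser "
                       "based on mime type application/pdf")
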
@@ -87,7 +87,9 @@ def generate_filename(document):
                 tags=tags,
             )
     except (ValueError, KeyError, IndexError):
-        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
+        logging.getLogger(__name__).warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
 
     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:

@@ -46,9 +46,14 @@ class UploadForm(forms.Form):
 
         os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
 
-        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
+        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
+                                         dir=settings.SCRATCH_DIR,
+                                         delete=False) as f:
 
             f.write(data)
             os.utime(f.name, times=(t, t))
 
-            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
+            async_task("documents.tasks.consume_file",
+                       f.name,
+                       override_filename=original_filename,
+                       task_name=os.path.basename(original_filename))

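Two standard-library details carry this upload path: `delete=False` keeps the temporary file on disk after the `with` block so the queued consumer task can still read it, and `os.utime` stamps the upload time onto the file itself. A stripped-down sketch without the Django form machinery (payload and prefix are stand-ins):

    import os
    import tempfile
    import time

    data = b"%PDF-1.4 ..."  # hypothetical upload payload
    t = time.time()

    with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                     delete=False) as f:
        f.write(data)
        # Set atime and mtime so downstream code can recover the upload
        # time from the file itself.
        os.utime(f.name, times=(t, t))

    # The file still exists here because delete=False; the consumer task
    # is expected to clean it up.
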
@@ -120,6 +120,7 @@ def query_page(ix, query, page):
 def autocomplete(ix, term, limit=10):
     with ix.reader() as reader:
         terms = []
-        for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
+        for (score, t) in reader.most_distinctive_terms(
+                "content", number=limit, prefix=term.lower()):
             terms.append(t)
         return terms

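Beyond the wrap, this hunk switches the Whoosh call to keyword arguments, which documents what `limit` and `term.lower()` actually are; Whoosh's `most_distinctive_terms(fieldname, number=5, prefix='')` yields `(score, term)` pairs. A hedged usage sketch, assuming an existing index directory whose schema has a "content" field:

    from whoosh.index import open_dir

    ix = open_dir("indexdir")  # hypothetical index location
    with ix.reader() as reader:
        suggestions = [term for _score, term in
                       reader.most_distinctive_terms("content",
                                                     number=10,
                                                     prefix="inv")]
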
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
     def _consume(self, file):
         if os.path.isfile(file):
             try:
-                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
+                async_task("documents.tasks.consume_file",
+                           file,
+                           task_name=os.path.basename(file))
             except Exception as e:
                 # Catch all so that the consumer won't crash.
-                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
+                logging.getLogger(__name__).error(
+                    "Error while consuming document: {}".format(e))
 
     def on_created(self, event):
         self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
         # Consume all files as this is not done initially by the watchdog
         for entry in os.scandir(directory):
             if entry.is_file():
-                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
+                async_task("documents.tasks.consume_file",
+                           entry.path,
+                           task_name=os.path.basename(entry.path))
 
         # Start the watchdog. Woof!
         if settings.CONSUMER_POLLING > 0:
-            logging.getLogger(__name__).info('Using polling instead of file'
-                                             'system notifications.')
+            logging.getLogger(__name__).info(
+                "Using polling instead of file system notifications.")
             observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
         else:
             observer = Observer()

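The observer selection at the bottom is the interesting bit: watchdog's `PollingObserver` re-scans the directory on a timer (necessary on network mounts, where inotify events never arrive), while the default `Observer` uses native filesystem notifications. A minimal sketch of the same choice; the interval is a made-up stand-in for `CONSUMER_POLLING`:

    from watchdog.observers import Observer
    from watchdog.observers.polling import PollingObserver

    polling_interval = 10  # hypothetical CONSUMER_POLLING value

    if polling_interval > 0:
        # Re-scan the watched directory every polling_interval seconds.
        observer = PollingObserver(timeout=polling_interval)
    else:
        # Use inotify-style native notifications.
        observer = Observer()
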
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):
 
             document = document_map[document_dict["pk"]]
 
-            unique_filename = "{:07}_{}".format(document.pk, document.file_name)
+            unique_filename = f"{document.pk:07}_{document.file_name}"
 
             file_target = os.path.join(self.target, unique_filename)
 
@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
             document_dict[EXPORTER_FILE_NAME] = unique_filename
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
 
-            print("Exporting: {}".format(file_target))
+            print(f"Exporting: {file_target}")
 
             t = int(time.mktime(document.created.timetuple()))
             if document.storage_type == Document.STORAGE_TYPE_GPG:

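A detail worth knowing when doing this kind of conversion: format specifications survive the move into f-strings unchanged, so `"{:07}".format(document.pk)` becomes `f"{document.pk:07}"`. A two-line check with made-up values:

    pk, file_name = 123, "scan.pdf"  # hypothetical values
    assert f"{pk:07}_{file_name}" == "0000123_scan.pdf"
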
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
                         encrypted.write(GnuPG.encrypted(unencrypted))
 
             else:
-                print("Moving {} to {}".format(document_path, document.source_path))
+                print(f"Moving {document_path} to {document.source_path}")
                 shutil.copy(document_path, document.source_path)
                 shutil.copy(thumbnail_path, document.thumbnail_path)
 

@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
         try:
             classifier.reload()
         except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
+            logging.getLogger(__name__).warning(
+                f"Cannot classify documents: {e}.")
             classifier = None
 
         for document in documents:
             logging.getLogger(__name__).info(
-                "Processing document {}".format(document.title)
-            )
+                f"Processing document {document.title}")
 
             if options['correspondent']:
                 set_correspondent(

@@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag
 
 
 def match_correspondents(document_content, classifier):
-    correspondents = Correspondent.objects.all()
-    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_correspondent(document_content)
+    else:
+        pred_id = None
 
-    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
+    correspondents = Correspondent.objects.all()
+    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
 
 
 def match_document_types(document_content, classifier):
-    document_types = DocumentType.objects.all()
-    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_document_type(document_content)
+    else:
+        pred_id = None
 
-    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
+    document_types = DocumentType.objects.all()
+    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
 
 
 def match_tags(document_content, classifier):

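The rewrite here trades a conditional expression for an explicit if/else; the two forms are equivalent, the block form just survives line wrapping better. A self-contained sketch with a stub classifier standing in for the real one:

    class StubClassifier:  # hypothetical stand-in
        def predict_correspondent(self, content):
            return 42

    classifier = StubClassifier()
    content = "some document text"

    # Old style: conditional expression.
    pred_id = (classifier.predict_correspondent(content)
               if classifier else None)

    # New style: explicit branch.
    if classifier:
        pred_id2 = classifier.predict_correspondent(content)
    else:
        pred_id2 = None

    assert pred_id == pred_id2 == 42
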
@@ -73,7 +73,18 @@ def get_parser_class(path):
     return get_parser_class_for_mime_type(mime_type)
 
 
-def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+def run_convert(input_file,
+                output_file,
+                density=None,
+                scale=None,
+                alpha=None,
+                strip=False,
+                trim=False,
+                type=None,
+                depth=None,
+                extra=None,
+                logging_group=None):
+
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                     pnm_out)
 
-    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+    logger.debug(f"Execute: {' '.join(command_args)}",
+                 extra={'group': logging_group})
 
-    if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError("Unpaper failed at {}".format(command_args))
+    if not subprocess.Popen(command_args,
+                            stdout=subprocess.DEVNULL,
+                            stderr=subprocess.DEVNULL).wait() == 0:
+        raise ParseError(f"Unpaper failed at {command_args}")
 
     return pnm_out
 
@@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin):
         super().__init__()
         self.logging_group = logging_group
         self.document_path = path
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = tempfile.mkdtemp(
+            prefix="paperless-", dir=settings.SCRATCH_DIR)
 
     def get_thumbnail(self):
         """
@@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin):
         if settings.OPTIMIZE_THUMBNAILS:
             out_path = os.path.join(self.tempdir, "optipng.png")
 
-            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+            args = (settings.OPTIPNG_BINARY,
+                    "-silent", "-o5", in_path, "-out", out_path)
 
-            self.log('debug', 'Execute: ' + " ".join(args))
+            self.log('debug', f"Execute: {' '.join(args)}")
 
             if not subprocess.Popen(args).wait() == 0:
                 raise ParseError("Optipng failed at {}".format(args))

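Both subprocess call sites keep the slightly awkward `not ... == 0` spelling; `!= 0` would read better, and on modern Python the whole pattern collapses into `subprocess.run(..., check=True)`, which raises on a nonzero exit status. A sketch under that assumption (the arguments are made up):

    import subprocess

    command_args = ("unpaper", "--overwrite", "--quiet",
                    "in.pnm", "out.pnm")  # hypothetical arguments

    # run(check=True) raises CalledProcessError on failure, replacing
    # the manual Popen(...).wait() == 0 test.
    subprocess.run(command_args,
                   stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL,
                   check=True)
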
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
 
 class DocumentSerializer(serializers.ModelSerializer):
 
-    correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
+    correspondent_id = CorrespondentField(
+        allow_null=True, source='correspondent')
     tags_id = TagsField(many=True, source='tags')
-    document_type_id = DocumentTypeField(allow_null=True, source='document_type')
+    document_type_id = DocumentTypeField(
+        allow_null=True, source='document_type')
 
     class Meta:
         model = Document

@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
     document.tags.add(*inbox_tags)
 
 
-def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_correspondent(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.correspondent and not replace:
         return
 
-    potential_correspondents = matching.match_correspondents(document.content, classifier)
+    potential_correspondents = matching.match_correspondents(document.content,
+                                                             classifier)
 
     potential_count = len(potential_correspondents)
     if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
         selected = None
     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential correspondents, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential correspondents, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential correspondents, not assigning any correspondent"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential correspondents, "
+                f"not assigning any correspondent",
                 logging_group
            )
            return
 
     if selected or replace:
         logger(
-            'Assigning correspondent "{}" to "{}" '.format(selected, document),
+            f"Assigning correspondent {selected} to {document}",
             logging_group
         )
 
@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
         document.save(update_fields=("correspondent",))
 
 
-def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_document_type(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.document_type and not replace:
         return
 
-    potential_document_type = matching.match_document_types(document.content, classifier)
+    potential_document_type = matching.match_document_types(document.content,
+                                                            classifier)
 
     potential_count = len(potential_document_type)
     if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
 
     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential document types, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential document types, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential document types, not assigning any document type"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential document types, "
+                f"not assigning any document type",
                 logging_group
             )
             return
 
     if selected or replace:
         logger(
-            'Assigning document type "{}" to "{}" '.format(selected, document),
+            f"Assigning document type {selected} to {document}",
             logging_group
         )
 
@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
         document.save(update_fields=("document_type",))
 
 
-def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+def set_tags(sender,
+             document=None,
+             logging_group=None,
+             classifier=None,
+             replace=False,
+             **kwargs):
     if replace:
         document.tags.clear()
         current_tags = set([])
     else:
         current_tags = set(document.tags.all())
 
-    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+    matched_tags = matching.match_tags(document.content, classifier)
+
+    relevant_tags = set(matched_tags) - current_tags
 
     if not relevant_tags:
         return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):
 
     if not os.path.isfile(old_path):
         # Can't do anything if the old file does not exist anymore.
-        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
+        logging.getLogger(__name__).fatal(
+            f"Document {str(instance)}: File {old_path} has gone.")
         return
 
     if os.path.isfile(new_path):
         # Can't do anything if the new file already exists. Skip updating file.
-        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
+        logging.getLogger(__name__).warning(
+            f"Document {str(instance)}: Cannot rename file "
+            f"since target path {new_path} already exists.")
         return
 
     create_source_path_directory(new_path)

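The `set_tags` change pulls the matcher call out of a dense one-liner; the underlying logic is plain set arithmetic, with only the tags that matched but are not already on the document getting added. Reduced to its core (tag names stand in for Tag objects):

    current_tags = {"inbox"}                     # hypothetical existing tags
    matched_tags = ["inbox", "invoice", "2020"]  # hypothetical matcher output

    relevant_tags = set(matched_tags) - current_tags
    assert relevant_tags == {"invoice", "2020"}
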
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
     def test_changed_password_check_no_encryption(self):
         DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
         self.assertEqual(changed_password_check(None), [])
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_gpg_encryption_with_good_password(self):
-        pass
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_fail(self):
-        pass

@@ -47,18 +47,30 @@ class IndexView(TemplateView):
 
 class CorrespondentViewSet(ModelViewSet):
     model = Correspondent
-    queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
+
+    queryset = Correspondent.objects.annotate(
+        document_count=Count('documents'),
+        last_correspondence=Max('documents__created')).order_by('name')
+
     serializer_class = CorrespondentSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
     filterset_class = CorrespondentFilterSet
-    ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
+    ordering_fields = (
+        "name",
+        "matching_algorithm",
+        "match",
+        "document_count",
+        "last_correspondence")
 
 
 class TagViewSet(ModelViewSet):
     model = Tag
-    queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = Tag.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = TagSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):
 
 class DocumentTypeViewSet(ModelViewSet):
     model = DocumentType
-    queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = DocumentType.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = DocumentTypeSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
     filterset_class = DocumentFilterSet
     search_fields = ("title", "correspondent__name", "content")
     ordering_fields = (
-        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+        "id",
+        "title",
+        "correspondent__name",
+        "document_type__name",
+        "created",
+        "modified",
+        "added",
+        "archive_serial_number")
 
     def update(self, request, *args, **kwargs):
-        response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+        response = super(DocumentViewSet, self).update(
+            request, *args, **kwargs)
         index.add_or_update_document(self.get_object())
         return response
 
@@ -138,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
     @cache_control(public=False, max_age=315360000)
     def thumb(self, request, pk=None):
         try:
-            return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
+            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
+                                content_type='image/png')
         except FileNotFoundError:
             raise Http404("Document thumbnail does not exist")
 
@@ -230,5 +254,6 @@ class StatisticsView(APIView):
     def get(self, request, format=None):
         return Response({
             'documents_total': Document.objects.all().count(),
-            'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
+            'documents_inbox': Document.objects.filter(
+                tags__is_inbox_tag=True).distinct().count()
         })

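The viewset querysets above push the counting into SQL: `annotate` with `Count`/`Max` produces one aggregated row per correspondent instead of a per-object query loop. The shape, assuming the `Correspondent` model with its `documents` reverse relation as in the code above:

    from django.db.models import Count, Max

    # One SQL query; each row carries the computed document_count and
    # the creation date of that correspondent's newest document.
    queryset = (Correspondent.objects
                .annotate(document_count=Count('documents'),
                          last_correspondence=Max('documents__created'))
                .order_by('name'))
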
@@ -174,8 +174,8 @@ class MailAccountHandler(LoggingMixin):
                     M.folder.set(rule.folder)
                 except MailboxFolderSelectError:
                     raise MailError(
-                        f"Rule {rule.name}: Folder {rule.folder} does not exist "
-                        f"in account {account.name}")
+                        f"Rule {rule.name}: Folder {rule.folder} "
+                        f"does not exist in account {account.name}")
 
                 criterias = make_criterias(rule)
 
@@ -185,7 +185,8 @@ class MailAccountHandler(LoggingMixin):
                     f"{str(AND(**criterias))}")
 
                 try:
-                    messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
+                    messages = M.fetch(criteria=AND(**criterias),
+                                       mark_seen=False)
                 except Exception:
                     raise MailError(
                         f"Rule {rule.name}: Error while fetching folder "
@@ -226,8 +227,8 @@ class MailAccountHandler(LoggingMixin):
 
                 except Exception:
                     raise MailError(
-                        f"Rule {rule.name}: Error while processing post-consume "
-                        f"actions for account {account.name}")
+                        f"Rule {rule.name}: Error while processing "
+                        f"post-consume actions for account {account.name}")
 
         return total_processed_files
 
@@ -266,7 +267,8 @@ class MailAccountHandler(LoggingMixin):
             if is_mime_type_supported(mime_type):
 
                 os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
+                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-",
+                                                    dir=settings.SCRATCH_DIR)
                 with open(temp_filename, 'wb') as f:
                     f.write(att.payload)

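One wart the reformatting preserves: `tempfile.mkstemp` returns an already-open OS file descriptor alongside the path, and binding it to `_` leaks that descriptor when the file is then reopened by name. A sketch of a leak-free variant (the payload is a stand-in for `att.payload`):

    import os
    import tempfile

    fd, temp_filename = tempfile.mkstemp(prefix="paperless-mail-")
    # Wrap the descriptor we already own instead of opening the path a
    # second time; fdopen closes fd when the with block exits.
    with os.fdopen(fd, "wb") as f:
        f.write(b"attachment payload")  # hypothetical att.payload
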
@@ -66,10 +66,14 @@ class MailRule(models.Model):
     CORRESPONDENT_FROM_CUSTOM = 4
 
     CORRESPONDENT_SELECTOR = (
-        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
-        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
-        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
-        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
+        (CORRESPONDENT_FROM_NOTHING,
+         "Do not assign a correspondent"),
+        (CORRESPONDENT_FROM_EMAIL,
+         "Use mail address"),
+        (CORRESPONDENT_FROM_NAME,
+         "Use name (or mail address if not available)"),
+        (CORRESPONDENT_FROM_CUSTOM,
+         "Use correspondent selected below")
     )
 
     name = models.CharField(max_length=256, unique=True)

@@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount
 def process_mail_accounts():
     total_new_documents = 0
     for account in MailAccount.objects.all():
-        total_new_documents += MailAccountHandler().handle_mail_account(account)
+        total_new_documents += MailAccountHandler().handle_mail_account(
+            account)
 
     if total_new_documents > 0:
         return f"Added {total_new_documents} document(s)."

@@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser):
         except ParseError:
             # if convert fails, fall back to extracting
             # the first PDF page as a PNG using Ghostscript
-            self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
+            self.log(
+                'warning',
+                "Thumbnail generation with ImageMagick failed, falling back "
+                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
             gs_out_path = os.path.join(self.tempdir, "gs_out.png")
             cmd = [settings.GS_BINARY,
                    "-q",
@@ -98,24 +101,38 @@ class RasterisedDocumentParser(DocumentParser):
         try:
 
             sample_page_index = int(len(images) / 2)
-            self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
-            sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
+            self.log(
+                "debug",
+                f"Attempting language detection on page "
+                f"{sample_page_index + 1} of {len(images)}...")
+
+            sample_page_text = self._ocr([images[sample_page_index]],
+                                         settings.OCR_LANGUAGE)[0]
             guessed_language = self._guess_language(sample_page_text)
 
             if not guessed_language or guessed_language not in ISO639:
                 self.log("warning", "Language detection failed.")
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
-                self.log("debug", "Detected language: {} (default language)".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "debug",
+                    f"Detected language: {guessed_language} "
+                    f"(default language)")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
-                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "warning",
+                    f"Detected language {guessed_language} is not available "
+                    f"on this system.")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)
 
             else:
-                self.log("debug", "Detected language: {}".format(guessed_language))
+                self.log("debug", f"Detected language: {guessed_language}")
                 ocr_pages = self._ocr(images, ISO639[guessed_language])
 
             self.log("debug", "OCR completed.")
@@ -130,7 +147,9 @@ class RasterisedDocumentParser(DocumentParser):
         Greyscale images are easier for Tesseract to OCR
         """
 
-        self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
+        self.log(
+            "debug",
+            f"Converting document {self.document_path} into greyscale images")
 
         # Convert PDF to multiple PNMs
         pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -148,7 +167,7 @@ class RasterisedDocumentParser(DocumentParser):
             if f.endswith(".pnm"):
                 pnms.append(os.path.join(self.tempdir, f))
 
-        self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
+        self.log("debug", f"Running unpaper on {len(pnms)} pages...")
 
         # Run unpaper in parallel on converted images
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@@ -161,26 +180,25 @@ class RasterisedDocumentParser(DocumentParser):
             guess = langdetect.detect(text)
             return guess
         except Exception as e:
-            self.log('warning', "Language detection failed with: {}".format(e))
+            self.log('warning', f"Language detection failed with: {e}")
             return None
 
     def _ocr(self, imgs, lang):
-        self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
+        self.log(
+            "debug",
+            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
             r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             return r
 
-    def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
-        """
-        Given a `middle` value and the text that middle page represents, we OCR
-        the remainder of the document and return the whole thing.
-        """
-        # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
-        # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
+    def _complete_ocr_default_language(self,
+                                       images,
+                                       sample_page_index,
+                                       sample_page):
         images_copy = list(images)
         del images_copy[sample_page_index]
        if images_copy:
-            self.log('debug', 'Continuing ocr with default language.')
+            self.log('debug', "Continuing ocr with default language.")
             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
             ocr_pages.insert(sample_page_index, sample_page)
             return ocr_pages

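`_ocr` fans the OCR work out over a thread pool: `itertools.product(imgs, [lang])` pairs every image with the single language, and `pool.map` keeps the results in page order. A runnable sketch with a stub in place of the pyocr-backed `image_to_string`:

    import itertools
    from multiprocessing.pool import ThreadPool

    def image_to_string(args):  # hypothetical stub for the pyocr wrapper
        img, lang = args
        return f"<text of {img} in {lang}>"

    imgs = ["page-0001.pnm", "page-0002.pnm"]
    with ThreadPool(processes=4) as pool:
        # map() preserves input order, so pages come back in sequence.
        pages = pool.map(image_to_string, itertools.product(imgs, ["eng"]))

    assert len(pages) == 2
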