Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-04-02 13:45:10 -05:00
code cleanup

This commit is contained in:
parent 5a84cc835a
commit b44f8383e4
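Two mechanical cleanups run through the whole diff: str.format() calls are converted to f-strings, and statements that overflow PEP 8's 79-character line limit are wrapped, usually one argument per line. As a minimal before/after sketch of the pattern (logger and fmt are illustrative names, not from the repository):

    # Before: positional .format() on an over-long line
    logger.warning("Invalid format: {}, falling back to default".format(fmt))

    # After: an f-string, split across adjacent literals that Python
    # concatenates into a single string at compile time
    logger.warning(
        f"Invalid format: "
        f"{fmt}, falling back to default")

Nearly every hunk below is an instance of one or both patterns, with no change in behaviour; the few exceptions are noted where they occur.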
@@ -104,9 +104,11 @@ class Consumer(LoggingMixin):

         parser_class = get_parser_class_for_mime_type(mime_type)
         if not parser_class:
-            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
+            raise ConsumerError(f"No parsers abvailable for {self.filename}")
         else:
-            self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
+            self.log("debug",
+                     f"Parser: {parser_class.__name__} "
+                     f"based on mime type {mime_type}")

         # Notify all listeners that we're going to do some work.
@@ -126,7 +128,7 @@ class Consumer(LoggingMixin):
         # Parse the document. This may take some time.

         try:
-            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
             thumbnail = document_parser.get_optimised_thumbnail()
             self.log("debug", "Parsing {}...".format(self.filename))
             text = document_parser.get_text()
@@ -244,10 +246,12 @@ class Consumer(LoggingMixin):
             document.title = self.override_title

         if self.override_correspondent_id:
-            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+            document.correspondent = Correspondent.objects.get(
+                pk=self.override_correspondent_id)

         if self.override_document_type_id:
-            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+            document.document_type = DocumentType.objects.get(
+                pk=self.override_document_type_id)

         if self.override_tag_ids:
             for tag_id in self.override_tag_ids:
@@ -87,7 +87,9 @@ def generate_filename(document):
             tags=tags,
         )
     except (ValueError, KeyError, IndexError):
-        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
+        logging.getLogger(__name__).warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:
@@ -46,9 +46,14 @@ class UploadForm(forms.Form):

         os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

-        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", dir=settings.SCRATCH_DIR, delete=False) as f:
+        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
+                                         dir=settings.SCRATCH_DIR,
+                                         delete=False) as f:
             f.write(data)
             os.utime(f.name, times=(t, t))

-            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
+            async_task("documents.tasks.consume_file",
+                       f.name,
+                       override_filename=original_filename,
+                       task_name=os.path.basename(original_filename))
@@ -120,6 +120,7 @@ def query_page(ix, query, page):
 def autocomplete(ix, term, limit=10):
     with ix.reader() as reader:
         terms = []
-        for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
+        for (score, t) in reader.most_distinctive_terms(
+                "content", number=limit, prefix=term.lower()):
             terms.append(t)
         return terms
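Besides the wrap, the new call names its arguments; the keyword names number= and prefix= are taken from the rewritten line above. A small sketch of why that helps (the literal values are illustrative):

    # Positionally, the call site doesn't say which value is which:
    reader.most_distinctive_terms("content", 10, "invoi")

    # With keywords, the result cap and the typed prefix are explicit:
    reader.most_distinctive_terms("content", number=10, prefix="invoi")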
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
     def _consume(self, file):
         if os.path.isfile(file):
             try:
-                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
+                async_task("documents.tasks.consume_file",
+                           file,
+                           task_name=os.path.basename(file))
             except Exception as e:
                 # Catch all so that the consumer won't crash.
-                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
+                logging.getLogger(__name__).error(
+                    "Error while consuming document: {}".format(e))

     def on_created(self, event):
         self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
         # Consume all files as this is not done initially by the watchdog
         for entry in os.scandir(directory):
             if entry.is_file():
-                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
+                async_task("documents.tasks.consume_file",
+                           entry.path,
+                           task_name=os.path.basename(entry.path))

         # Start the watchdog. Woof!
         if settings.CONSUMER_POLLING > 0:
-            logging.getLogger(__name__).info('Using polling instead of file'
-                                             'system notifications.')
+            logging.getLogger(__name__).info(
+                "Using polling instead of file system notifications.")
             observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
         else:
             observer = Observer()
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):

             document = document_map[document_dict["pk"]]

-            unique_filename = "{:07}_{}".format(document.pk, document.file_name)
+            unique_filename = f"{document.pk:07}_{document.file_name}"

             file_target = os.path.join(self.target, unique_filename)
@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
             document_dict[EXPORTER_FILE_NAME] = unique_filename
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

-            print("Exporting: {}".format(file_target))
+            print(f"Exporting: {file_target}")

             t = int(time.mktime(document.created.timetuple()))
             if document.storage_type == Document.STORAGE_TYPE_GPG:
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
                     encrypted.write(GnuPG.encrypted(unencrypted))

             else:
-                print("Moving {} to {}".format(document_path, document.source_path))
+                print(f"Moving {document_path} to {document.source_path}")
                 shutil.copy(document_path, document.source_path)
                 shutil.copy(thumbnail_path, document.thumbnail_path)
@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
         try:
             classifier.reload()
         except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
+            logging.getLogger(__name__).warning(
+                f"Cannot classify documents: {e}.")
             classifier = None

         for document in documents:
             logging.getLogger(__name__).info(
-                "Processing document {}".format(document.title)
-            )
+                f"Processing document {document.title}")

             if options['correspondent']:
                 set_correspondent(
@@ -6,17 +6,23 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag


 def match_correspondents(document_content, classifier):
-    correspondents = Correspondent.objects.all()
-    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_correspondent(document_content)
+    else:
+        pred_id = None

-    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
+    correspondents = Correspondent.objects.all()
+    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]


 def match_document_types(document_content, classifier):
-    document_types = DocumentType.objects.all()
-    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_document_type(document_content)
+    else:
+        pred_id = None

-    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
+    document_types = DocumentType.objects.all()
+    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]


 def match_tags(document_content, classifier):
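Beyond line length, this hunk also trades a conditional expression buried at the end of a long line for an explicit if/else; the two forms are equivalent. A minimal sketch (classifier and content stand in for the real arguments):

    # Old form: the "if classifier else None" tail is easy to overlook
    pred_id = classifier.predict_correspondent(content) if classifier else None

    # New form, as preferred by the commit
    if classifier:
        pred_id = classifier.predict_correspondent(content)
    else:
        pred_id = None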
@@ -73,7 +73,18 @@ def get_parser_class(path):
     return get_parser_class_for_mime_type(mime_type)


-def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+def run_convert(input_file,
+                output_file,
+                density=None,
+                scale=None,
+                alpha=None,
+                strip=False,
+                trim=False,
+                type=None,
+                depth=None,
+                extra=None,
+                logging_group=None):
+
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
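With one parameter per line, the signature stays inside the 79-column limit and call sites read naturally with keywords. A hedged usage sketch (the argument values are illustrative, not taken from a real call site):

    run_convert(input_file="/tmp/in.pdf[0]",
                output_file="/tmp/out.png",
                density=300,
                alpha="remove",
                strip=True,
                logging_group=None)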
@@ -102,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                     pnm_out)

-    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+    logger.debug(f"Execute: {' '.join(command_args)}",
+                 extra={'group': logging_group})

-    if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError("Unpaper failed at {}".format(command_args))
+    if not subprocess.Popen(command_args,
+                            stdout=subprocess.DEVNULL,
+                            stderr=subprocess.DEVNULL).wait() == 0:
+        raise ParseError(f"Unpaper failed at {command_args}")

     return pnm_out
@@ -124,7 +138,8 @@ class DocumentParser(LoggingMixin):
         super().__init__()
         self.logging_group = logging_group
         self.document_path = path
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = tempfile.mkdtemp(
+            prefix="paperless-", dir=settings.SCRATCH_DIR)

     def get_thumbnail(self):
         """
@@ -137,9 +152,10 @@ class DocumentParser(LoggingMixin):
         if settings.OPTIMIZE_THUMBNAILS:
             out_path = os.path.join(self.tempdir, "optipng.png")

-            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+            args = (settings.OPTIPNG_BINARY,
+                    "-silent", "-o5", in_path, "-out", out_path)

-            self.log('debug', 'Execute: ' + " ".join(args))
+            self.log('debug', f"Execute: {' '.join(args)}")

             if not subprocess.Popen(args).wait() == 0:
                 raise ParseError("Optipng failed at {}".format(args))
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):

 class DocumentSerializer(serializers.ModelSerializer):

-    correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
+    correspondent_id = CorrespondentField(
+        allow_null=True, source='correspondent')
     tags_id = TagsField(many=True, source='tags')
-    document_type_id = DocumentTypeField(allow_null=True, source='document_type')
+    document_type_id = DocumentTypeField(
+        allow_null=True, source='document_type')

     class Meta:
         model = Document
@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
     document.tags.add(*inbox_tags)


-def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_correspondent(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.correspondent and not replace:
         return

-    potential_correspondents = matching.match_correspondents(document.content, classifier)
+    potential_correspondents = matching.match_correspondents(document.content,
+                                                             classifier)

     potential_count = len(potential_correspondents)
     if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
         selected = None
     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential correspondents, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential correspondents, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential correspondents, not assigning any correspondent"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential correspondents, "
+                f"not assigning any correspondent",
                 logging_group
             )
             return

     if selected or replace:
         logger(
-            'Assigning correspondent "{}" to "{}" '.format(selected, document),
+            f"Assigning correspondent {selected} to {document}",
             logging_group
         )
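The wrapped log messages rely on Python's implicit concatenation of adjacent string literals, so the trailing space inside the first fragment matters. A small sketch of the mechanism:

    count, selected = 3, "ACME"
    message = (f"Detected {count} potential correspondents, "
               f"so we've opted for {selected}")
    # -> "Detected 3 potential correspondents, so we've opted for ACME"
    # Dropping the trailing space would run the fragments together:
    # "...correspondents,so we've opted..."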
@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
     document.save(update_fields=("correspondent",))


-def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_document_type(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
     if document.document_type and not replace:
         return

-    potential_document_type = matching.match_document_types(document.content, classifier)
+    potential_document_type = matching.match_document_types(document.content,
+                                                            classifier)

     potential_count = len(potential_document_type)
     if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None

     if potential_count > 1:
         if use_first:
-            message = "Detected {} potential document types, so we've opted for {}"
             logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential document types, "
+                f"so we've opted for {selected}",
                 logging_group
             )
         else:
-            message = "Detected {} potential document types, not assigning any document type"
             logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential document types, "
+                f"not assigning any document type",
                 logging_group
             )
             return

     if selected or replace:
         logger(
-            'Assigning document type "{}" to "{}" '.format(selected, document),
+            f"Assigning document type {selected} to {document}",
             logging_group
         )
@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
     document.save(update_fields=("document_type",))


-def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+def set_tags(sender,
+             document=None,
+             logging_group=None,
+             classifier=None,
+             replace=False,
+             **kwargs):
     if replace:
         document.tags.clear()
         current_tags = set([])
     else:
         current_tags = set(document.tags.all())

-    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+    matched_tags = matching.match_tags(document.content, classifier)
+
+    relevant_tags = set(matched_tags) - current_tags

     if not relevant_tags:
         return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):

     if not os.path.isfile(old_path):
         # Can't do anything if the old file does not exist anymore.
-        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
+        logging.getLogger(__name__).fatal(
+            f"Document {str(instance)}: File {old_path} has gone.")
         return

     if os.path.isfile(new_path):
         # Can't do anything if the new file already exists. Skip updating file.
-        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
+        logging.getLogger(__name__).warning(
+            f"Document {str(instance)}: Cannot rename file "
+            f"since target path {new_path} already exists.")
         return

     create_source_path_directory(new_path)
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
     def test_changed_password_check_no_encryption(self):
         DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
         self.assertEqual(changed_password_check(None), [])
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_gpg_encryption_with_good_password(self):
-        pass
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_fail(self):
-        pass
@@ -47,18 +47,30 @@ class IndexView(TemplateView):

 class CorrespondentViewSet(ModelViewSet):
     model = Correspondent
-    queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
+
+    queryset = Correspondent.objects.annotate(
+        document_count=Count('documents'),
+        last_correspondence=Max('documents__created')).order_by('name')
+
     serializer_class = CorrespondentSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
     filterset_class = CorrespondentFilterSet
-    ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
+    ordering_fields = (
+        "name",
+        "matching_algorithm",
+        "match",
+        "document_count",
+        "last_correspondence")


 class TagViewSet(ModelViewSet):
     model = Tag
-    queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = Tag.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = TagSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):

 class DocumentTypeViewSet(ModelViewSet):
     model = DocumentType
-    queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = DocumentType.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
     serializer_class = DocumentTypeSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
     filterset_class = DocumentFilterSet
     search_fields = ("title", "correspondent__name", "content")
     ordering_fields = (
-        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+        "id",
+        "title",
+        "correspondent__name",
+        "document_type__name",
+        "created",
+        "modified",
+        "added",
+        "archive_serial_number")

     def update(self, request, *args, **kwargs):
-        response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+        response = super(DocumentViewSet, self).update(
+            request, *args, **kwargs)
         index.add_or_update_document(self.get_object())
         return response
@@ -138,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
     @cache_control(public=False, max_age=315360000)
     def thumb(self, request, pk=None):
         try:
-            return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
+            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
+                                content_type='image/png')
         except FileNotFoundError:
             raise Http404("Document thumbnail does not exist")
@@ -230,5 +254,6 @@ class StatisticsView(APIView):
     def get(self, request, format=None):
         return Response({
             'documents_total': Document.objects.all().count(),
-            'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
+            'documents_inbox': Document.objects.filter(
+                tags__is_inbox_tag=True).distinct().count()
         })
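A note on the call being wrapped here: the filter crosses the many-to-many tags relation, which yields one row per matching tag, so .distinct() keeps a document with several inbox tags from being counted more than once. A minimal sketch of the difference:

    # May double-count: one row per (document, inbox tag) pair
    Document.objects.filter(tags__is_inbox_tag=True).count()

    # Counts each document once
    Document.objects.filter(tags__is_inbox_tag=True).distinct().count()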
@@ -174,8 +174,8 @@ class MailAccountHandler(LoggingMixin):
             M.folder.set(rule.folder)
         except MailboxFolderSelectError:
             raise MailError(
-                f"Rule {rule.name}: Folder {rule.folder} does not exist "
-                f"in account {account.name}")
+                f"Rule {rule.name}: Folder {rule.folder} "
+                f"does not exist in account {account.name}")

         criterias = make_criterias(rule)
@@ -185,7 +185,8 @@ class MailAccountHandler(LoggingMixin):
                 f"{str(AND(**criterias))}")

         try:
-            messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
+            messages = M.fetch(criteria=AND(**criterias),
+                               mark_seen=False)
         except Exception:
             raise MailError(
                 f"Rule {rule.name}: Error while fetching folder "
@@ -226,8 +227,8 @@ class MailAccountHandler(LoggingMixin):

         except Exception:
             raise MailError(
-                f"Rule {rule.name}: Error while processing post-consume "
-                f"actions for account {account.name}")
+                f"Rule {rule.name}: Error while processing "
+                f"post-consume actions for account {account.name}")

         return total_processed_files
@@ -266,7 +267,8 @@ class MailAccountHandler(LoggingMixin):
         if is_mime_type_supported(mime_type):

             os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-            _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
+            _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-",
+                                                dir=settings.SCRATCH_DIR)
             with open(temp_filename, 'wb') as f:
                 f.write(att.payload)
@@ -66,10 +66,14 @@ class MailRule(models.Model):
     CORRESPONDENT_FROM_CUSTOM = 4

     CORRESPONDENT_SELECTOR = (
-        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
-        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
-        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
-        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
+        (CORRESPONDENT_FROM_NOTHING,
+         "Do not assign a correspondent"),
+        (CORRESPONDENT_FROM_EMAIL,
+         "Use mail address"),
+        (CORRESPONDENT_FROM_NAME,
+         "Use name (or mail address if not available)"),
+        (CORRESPONDENT_FROM_CUSTOM,
+         "Use correspondent selected below")
     )

     name = models.CharField(max_length=256, unique=True)
@@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount
 def process_mail_accounts():
     total_new_documents = 0
     for account in MailAccount.objects.all():
-        total_new_documents += MailAccountHandler().handle_mail_account(account)
+        total_new_documents += MailAccountHandler().handle_mail_account(
+            account)

     if total_new_documents > 0:
         return f"Added {total_new_documents} document(s)."
@@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser):
         except ParseError:
             # if convert fails, fall back to extracting
             # the first PDF page as a PNG using Ghostscript
-            self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
+            self.log(
+                'warning',
+                "Thumbnail generation with ImageMagick failed, falling back "
+                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
             gs_out_path = os.path.join(self.tempdir, "gs_out.png")
             cmd = [settings.GS_BINARY,
                    "-q",
@@ -98,24 +101,38 @@ class RasterisedDocumentParser(DocumentParser):
         try:

             sample_page_index = int(len(images) / 2)
-            self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
-            sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
+            self.log(
+                "debug",
+                f"Attempting language detection on page "
+                f"{sample_page_index + 1} of {len(images)}...")
+
+            sample_page_text = self._ocr([images[sample_page_index]],
+                                         settings.OCR_LANGUAGE)[0]
             guessed_language = self._guess_language(sample_page_text)

             if not guessed_language or guessed_language not in ISO639:
                 self.log("warning", "Language detection failed.")
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

             elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
-                self.log("debug", "Detected language: {} (default language)".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "debug",
+                    f"Detected language: {guessed_language} "
+                    f"(default language)")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
-                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "warning",
+                    f"Detected language {guessed_language} is not available "
+                    f"on this system.")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

             else:
-                self.log("debug", "Detected language: {}".format(guessed_language))
+                self.log("debug", f"Detected language: {guessed_language}")
                 ocr_pages = self._ocr(images, ISO639[guessed_language])

             self.log("debug", "OCR completed.")
@@ -130,7 +147,9 @@ class RasterisedDocumentParser(DocumentParser):
         Greyscale images are easier for Tesseract to OCR
         """

-        self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
+        self.log(
+            "debug",
+            f"Converting document {self.document_path} into greyscale images")

         # Convert PDF to multiple PNMs
         pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -148,7 +167,7 @@ class RasterisedDocumentParser(DocumentParser):
             if f.endswith(".pnm"):
                 pnms.append(os.path.join(self.tempdir, f))

-        self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
+        self.log("debug", f"Running unpaper on {len(pnms)} pages...")

         # Run unpaper in parallel on converted images
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@@ -161,26 +180,25 @@ class RasterisedDocumentParser(DocumentParser):
             guess = langdetect.detect(text)
             return guess
         except Exception as e:
-            self.log('warning', "Language detection failed with: {}".format(e))
+            self.log('warning', f"Language detection failed with: {e}")
             return None

     def _ocr(self, imgs, lang):
-        self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
+        self.log(
+            "debug",
+            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
             r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             return r

-    def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
-        """
-        Given a `middle` value and the text that middle page represents, we OCR
-        the remainder of the document and return the whole thing.
-        """
-        # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
-        # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
+    def _complete_ocr_default_language(self,
+                                       images,
+                                       sample_page_index,
+                                       sample_page):
         images_copy = list(images)
         del images_copy[sample_page_index]
         if images_copy:
-            self.log('debug', 'Continuing ocr with default language.')
+            self.log('debug', "Continuing ocr with default language.")
             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
             ocr_pages.insert(sample_page_index, sample_page)
             return ocr_pages