mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'dev' into feature-server-side-saved-views
This commit is contained in:
@@ -8,6 +8,12 @@ from django.conf import settings
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
|
||||
class defaultdictNoStr(defaultdict):
    """A ``defaultdict`` that refuses to be rendered as a string.

    Item access behaves exactly like a regular ``defaultdict``; only the
    ``str()`` conversion is overridden so that it always raises
    ``ValueError``.
    """

    def __str__(self):
        # Converting the whole mapping to text is never allowed; callers
        # are expected to index into it instead.
        message = "Don't use {tags} directly."
        raise ValueError(message)
|
||||
|
||||
|
||||
def create_source_path_directory(source_path):
    """Create the directory that will contain ``source_path``.

    Missing intermediate directories are created as well, and an already
    existing directory is not an error. A ``source_path`` without any
    directory component (a bare file name) needs no directory, so nothing
    is created in that case.

    :param source_path: path of the file whose parent directory is needed.
    """
    parent = os.path.dirname(source_path)
    # os.makedirs("") raises FileNotFoundError, so skip the call when the
    # path has no directory part.
    if parent:
        os.makedirs(parent, exist_ok=True)
|
||||
|
||||
@@ -90,8 +96,8 @@ def generate_filename(doc, counter=0):
|
||||
|
||||
try:
|
||||
if settings.PAPERLESS_FILENAME_FORMAT is not None:
|
||||
tags = defaultdict(lambda: slugify(None),
|
||||
many_to_dictionary(doc.tags))
|
||||
tags = defaultdictNoStr(lambda: slugify(None),
|
||||
many_to_dictionary(doc.tags))
|
||||
|
||||
if doc.correspondent:
|
||||
correspondent = pathvalidate.sanitize_filename(
|
||||
@@ -114,14 +120,18 @@ def generate_filename(doc, counter=0):
|
||||
document_type=document_type,
|
||||
created=datetime.date.isoformat(doc.created),
|
||||
created_year=doc.created.year if doc.created else "none",
|
||||
created_month=doc.created.month if doc.created else "none",
|
||||
created_day=doc.created.day if doc.created else "none",
|
||||
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
|
||||
created_day=f"{doc.created.day:02}" if doc.created else "none",
|
||||
added=datetime.date.isoformat(doc.added),
|
||||
added_year=doc.added.year if doc.added else "none",
|
||||
added_month=doc.added.month if doc.added else "none",
|
||||
added_day=doc.added.day if doc.added else "none",
|
||||
added_month=f"{doc.added.month:02}" if doc.added else "none",
|
||||
added_day=f"{doc.added.day:02}" if doc.added else "none",
|
||||
tags=tags,
|
||||
)
|
||||
tag_list=",".join([tag.name for tag in doc.tags.all()])
|
||||
).strip()
|
||||
|
||||
path = path.strip(os.sep)
|
||||
|
||||
except (ValueError, KeyError, IndexError):
|
||||
logging.getLogger(__name__).warning(
|
||||
f"Invalid PAPERLESS_FILENAME_FORMAT: "
|
||||
|
@@ -2,6 +2,7 @@ import logging
|
||||
|
||||
import tqdm
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models.signals import post_save
|
||||
|
||||
from documents.models import Document
|
||||
from ...mixins import Renderable
|
||||
@@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand):
|
||||
logging.getLogger().handlers[0].level = logging.ERROR
|
||||
|
||||
for document in tqdm.tqdm(Document.objects.all()):
|
||||
# Saving the document again will generate a new filename and rename
|
||||
document.save()
|
||||
post_save.send(Document, instance=document)
|
||||
|
@@ -13,7 +13,7 @@ from django.test import TestCase, override_settings
|
||||
from .utils import DirectoriesMixin
|
||||
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
|
||||
generate_unique_filename
|
||||
from ..models import Document, Correspondent
|
||||
from ..models import Document, Correspondent, Tag
|
||||
|
||||
|
||||
class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
@@ -267,6 +267,57 @@ class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags}")
def test_tags_without_args(self):
    """A bare ``{tags}`` format must yield the zero-padded pk file name."""
    doc = Document()
    doc.mime_type = "application/pdf"
    doc.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
    doc.save()

    expected_name = f"{doc.pk:07}.pdf"
    self.assertEqual(generate_filename(doc), expected_name)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title} {tag_list}")
def test_tag_list(self):
    """``{tag_list}`` expands to the comma-joined names of the tags."""
    tagged = Document.objects.create(title="doc1", mime_type="application/pdf")
    # Tags are deliberately created out of alphabetical order.
    for tag_name in ("tag2", "tag1"):
        tagged.tags.create(name=tag_name)

    self.assertEqual(generate_filename(tagged), "doc1 tag1,tag2.pdf")

    # A document without any tags: only the title is expected in the name.
    untagged = Document.objects.create(
        title="doc2", checksum="B", mime_type="application/pdf"
    )

    self.assertEqual(generate_filename(untagged), "doc2.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="//etc/something/{title}")
def test_filename_relative(self):
    """A format with leading slashes must still resolve below ORIGINALS_DIR."""
    doc = Document.objects.create(title="doc1", mime_type="application/pdf")
    doc.filename = generate_filename(doc)
    doc.save()

    expected_path = os.path.join(
        settings.ORIGINALS_DIR, "etc", "something", "doc1.pdf"
    )
    self.assertEqual(doc.source_path, expected_path)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}-{created_month}-{created_day}")
def test_created_year_month_day(self):
    """Created month and day are rendered with two digits."""
    single_digit_date = datetime.datetime(2020, 3, 6, 1, 1, 1)
    doc = Document.objects.create(
        title="doc1", mime_type="application/pdf", created=single_digit_date
    )

    self.assertEqual(generate_filename(doc), "2020-03-06.pdf")

    # Two-digit month and day must come through unchanged.
    doc.created = datetime.datetime(2020, 11, 16, 1, 1, 1)

    self.assertEqual(generate_filename(doc), "2020-11-16.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{added_year}-{added_month}-{added_day}")
def test_added_year_month_day(self):
    """Added month and day are rendered with two digits; the year is not padded."""
    early_date = datetime.datetime(232, 1, 9, 1, 1, 1)
    doc = Document.objects.create(
        title="doc1", mime_type="application/pdf", added=early_date
    )

    self.assertEqual(generate_filename(doc), "232-01-09.pdf")

    # Two-digit month and day must come through unchanged.
    doc.added = datetime.datetime(2020, 11, 16, 1, 1, 1)

    self.assertEqual(generate_filename(doc), "2020-11-16.pdf")
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
|
||||
def test_nested_directory_cleanup(self):
|
||||
document = Document()
|
||||
|
@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image):
    """Estimate the DPI of an image by treating its width as an A4 page width.

    The pixel width of the image is divided by the width of an A4 sheet
    (210 mm) expressed in inches.

    :param image: the image file, handed unchanged to ``Image.open``.
    :return: the estimated DPI truncated to an ``int``, or ``None`` when
        the estimate could not be computed (a warning is logged then).
    """
    # 210 mm = 21 cm, and there are 2.54 cm to the inch.
    a4_width_inches = 21 / 2.54

    try:
        with Image.open(image) as im:
            # Only the width takes part in the estimate.
            width = im.size[0]
            dpi = int(width / a4_width_inches)
            self.log(
                'debug',
                f"Estimated DPI {dpi} based on image width {width}"
            )
            return dpi

    except Exception as e:
        # Best effort: any failure is reported and signalled to the
        # caller as "no estimate" rather than propagated.
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {e}")
        return None
|
||||
|
||||
def parse(self, document_path, mime_type):
|
||||
mode = settings.OCR_MODE
|
||||
|
||||
@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
if self.is_image(mime_type):
|
||||
dpi = self.get_dpi(document_path)
|
||||
a4_dpi = self.calculate_a4_dpi(document_path)
|
||||
if dpi:
|
||||
self.log(
|
||||
"debug",
|
||||
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
ocr_args['image_dpi'] = dpi
|
||||
elif settings.OCR_IMAGE_DPI:
|
||||
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
||||
elif a4_dpi:
|
||||
ocr_args['image_dpi'] = a4_dpi
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {document_path}, "
|
||||
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
with open(pdf_file, "rb") as f:
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
|
@@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertRaises(ParseError, f)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
|
||||
def test_image_calc_a4_dpi(self, m):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def test_image_no_dpi_fail(self):
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
args, kwargs = m.call_args
|
||||
|
||||
self.assertEqual(kwargs['image_dpi'], 62)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||
def test_image_dpi_fail(self, m):
|
||||
m.return_value = None
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def f():
|
||||
|
Reference in New Issue
Block a user