Merge branch 'dev' into feature-server-side-saved-views

This commit is contained in:
jonaswinkler
2020-12-14 11:49:03 +01:00
51 changed files with 1112 additions and 321 deletions

View File

@@ -8,6 +8,12 @@ from django.conf import settings
from django.template.defaultfilters import slugify
class defaultdictNoStr(defaultdict):
    """A ``defaultdict`` that refuses to be converted to a string.

    Item lookups behave exactly like ``defaultdict``; only ``str()`` (and
    therefore an un-indexed ``{tags}`` placeholder passed through
    ``str.format``) raises ``ValueError``.
    """

    def __str__(self):
        message = "Don't use {tags} directly."
        raise ValueError(message)
def create_source_path_directory(source_path):
    """Create the directory that contains ``source_path`` if it is missing.

    Intermediate directories are created as needed; an already existing
    directory is not an error.
    """
    parent_dir = os.path.dirname(source_path)
    os.makedirs(parent_dir, exist_ok=True)
@@ -90,8 +96,8 @@ def generate_filename(doc, counter=0):
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdict(lambda: slugify(None),
many_to_dictionary(doc.tags))
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
@@ -114,14 +120,18 @@ def generate_filename(doc, counter=0):
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=doc.created.month if doc.created else "none",
created_day=doc.created.day if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=doc.added.month if doc.added else "none",
added_day=doc.added.day if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
)
tag_list=",".join([tag.name for tag in doc.tags.all()])
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "

View File

@@ -2,6 +2,7 @@ import logging
import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.models import Document
from ...mixins import Renderable
@@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(Document.objects.all()):
# Saving the document again will generate a new filename and rename
document.save()
post_save.send(Document, instance=document)

View File

@@ -13,7 +13,7 @@ from django.test import TestCase, override_settings
from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
generate_unique_filename
from ..models import Document, Correspondent
from ..models import Document, Correspondent, Tag
class TestFileHandling(DirectoriesMixin, TestCase):
@@ -267,6 +267,57 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(document),
"none.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags}")
def test_tags_without_args(self):
    """A bare ``{tags}`` placeholder yields the zero-padded primary-key name."""
    doc = Document()
    doc.mime_type = "application/pdf"
    doc.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
    doc.save()

    expected = f"{doc.pk:07}.pdf"
    self.assertEqual(generate_filename(doc), expected)
@override_settings(PAPERLESS_FILENAME_FORMAT="{title} {tag_list}")
def test_tag_list(self):
    """``{tag_list}`` renders the tag names comma-joined; no tags leaves just the title."""
    tagged = Document.objects.create(title="doc1", mime_type="application/pdf")
    # Created in reverse order on purpose: the expected name lists tag1 first.
    for tag_name in ("tag2", "tag1"):
        tagged.tags.create(name=tag_name)
    self.assertEqual(generate_filename(tagged), "doc1 tag1,tag2.pdf")

    # With no tags the placeholder is empty and no trailing space is expected.
    untagged = Document.objects.create(title="doc2", checksum="B", mime_type="application/pdf")
    self.assertEqual(generate_filename(untagged), "doc2.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="//etc/something/{title}")
def test_filename_relative(self):
    """Leading separators in the format must still resolve inside ORIGINALS_DIR."""
    doc = Document.objects.create(title="doc1", mime_type="application/pdf")
    doc.filename = generate_filename(doc)
    doc.save()

    expected_path = os.path.join(settings.ORIGINALS_DIR, "etc", "something", "doc1.pdf")
    self.assertEqual(doc.source_path, expected_path)
@override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}-{created_month}-{created_day}")
def test_created_year_month_day(self):
    """Created month and day are rendered zero-padded to two digits."""
    doc = Document.objects.create(
        title="doc1",
        mime_type="application/pdf",
        created=datetime.datetime(2020, 3, 6, 1, 1, 1),
    )
    # Single-digit month and day must be padded.
    self.assertEqual(generate_filename(doc), "2020-03-06.pdf")

    # Two-digit month and day are rendered unchanged.
    doc.created = datetime.datetime(2020, 11, 16, 1, 1, 1)
    self.assertEqual(generate_filename(doc), "2020-11-16.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{added_year}-{added_month}-{added_day}")
def test_added_year_month_day(self):
    """Added month and day are zero-padded; the year is rendered as-is."""
    doc = Document.objects.create(
        title="doc1",
        mime_type="application/pdf",
        added=datetime.datetime(232, 1, 9, 1, 1, 1),
    )
    # A three-digit year is not padded, while month and day are.
    self.assertEqual(generate_filename(doc), "232-01-09.pdf")

    doc.added = datetime.datetime(2020, 11, 16, 1, 1, 1)
    self.assertEqual(generate_filename(doc), "2020-11-16.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()

View File

@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
    """Estimate a DPI for ``image`` by assuming its width spans an A4 page.

    Returns the estimate as an ``int``. If the image cannot be opened or
    measured, a warning is logged and ``None`` is returned instead.
    """
    # A4 is 210mm (21cm) wide; dividing by 2.54 converts that to inches.
    a4_width_inches = 21 / 2.54
    try:
        with Image.open(image) as img:
            width, _height = img.size
            dpi = int(width / a4_width_inches)
            message = f"Estimated DPI {dpi} based on image width {width}"
            self.log('debug', message)
            return dpi
    except Exception as e:
        # Best effort: any failure simply means no estimate is available.
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {e}")
        return None
def parse(self, document_path, mime_type):
mode = settings.OCR_MODE
@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
if self.is_image(mime_type):
dpi = self.get_dpi(document_path)
a4_dpi = self.calculate_a4_dpi(document_path)
if dpi:
self.log(
"debug",
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
ocr_args['image_dpi'] = dpi
elif settings.OCR_IMAGE_DPI:
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
elif a4_dpi:
ocr_args['image_dpi'] = a4_dpi
else:
raise ParseError(
f"Cannot produce archive PDF for image {document_path}, "
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file):
if not os.path.isfile(pdf_file):
return None
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)

View File

@@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f)
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
parser = RasterisedDocumentParser(None)
def test_image_no_dpi_fail(self):
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):
m.return_value = None
parser = RasterisedDocumentParser(None)
def f():