Merge branch 'dev' into feature-bulk-edit

This commit is contained in:
jonaswinkler
2020-12-13 00:52:36 +01:00
14 changed files with 152 additions and 51 deletions

View File

@@ -2,6 +2,7 @@ import logging
import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.models import Document
from ...mixins import Renderable
@@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(Document.objects.all()):
# Saving the document again will generate a new filename and rename
document.save()
post_save.send(Document, instance=document)

View File

@@ -7,6 +7,7 @@ from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from filelock import FileLock
@@ -121,11 +122,14 @@ def set_tags(sender,
classifier=None,
replace=False,
**kwargs):
if replace:
document.tags.clear()
current_tags = set([])
else:
current_tags = set(document.tags.all())
Document.tags.through.objects.filter(document=document).exclude(
Q(tag__is_inbox_tag=True)).exclude(
Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO)
).delete()
current_tags = set(document.tags.all())
matched_tags = matching.match_tags(document.content, classifier)

View File

@@ -14,6 +14,12 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")
self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)
self.correspondent_first = Correspondent.objects.create(
name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
@@ -38,7 +44,7 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_first.tags.count(), 1)
self.assertEqual(d_second.tags.count(), 1)
self.assertEqual(d_unrelated.tags.count(), 0)
self.assertEqual(d_unrelated.tags.count(), 2)
self.assertEqual(d_first.tags.first(), self.tag_first)
self.assertEqual(d_second.tags.first(), self.tag_second)
@@ -56,3 +62,17 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_first.correspondent, self.correspondent_first)
self.assertEqual(d_second.correspondent, self.correspondent_second)
def test_overwrite_preserve_inbox(self):
    """Run the retagger with ``--tags --overwrite`` and check the tag sets.

    ``tag_second`` is attached to ``d1`` before the command runs. Afterwards
    the tag object itself must still be retrievable, the first two documents
    must carry exactly ``tag_first`` / ``tag_second`` respectively, and the
    third document returned by ``get_updated_docs()`` must carry exactly the
    inbox tag and the tag without a match pattern.
    """
    self.d1.tags.add(self.tag_second)

    call_command('document_retagger', '--tags', '--overwrite')

    first_doc, second_doc, unrelated_doc = self.get_updated_docs()

    # Tag.objects.get() raises if the row is gone, so this guards against
    # the tag itself having been deleted.
    self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))

    # (document, tag ids it is expected to carry), checked in order.
    expectations = (
        (first_doc, [self.tag_first.id]),
        (second_doc, [self.tag_second.id]),
        (unrelated_doc, [self.tag_inbox.id, self.tag_no_match.id]),
    )
    for doc, expected_ids in expectations:
        self.assertCountEqual(
            [assigned.id for assigned in doc.tags.all()],
            expected_ids,
        )

View File

@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
    """Estimate a DPI for ``image`` by treating its width as one A4 page width.

    The pixel width is divided by the A4 width (210 mm) expressed in inches
    and truncated to an int. The estimate is logged at debug level and
    returned. If anything raises along the way, a warning is logged and
    ``None`` is returned instead.
    """
    # A4 is 21 cm wide; 2.54 cm per inch.
    a4_width_inches = 21 / 2.54
    try:
        with Image.open(image) as im:
            # Only the width feeds the estimate; the height is ignored.
            width, _ = im.size
            dpi = int(width / a4_width_inches)
            self.log(
                'debug',
                f"Estimated DPI {dpi} based on image width {width}"
            )
            return dpi
    except Exception as e:
        # Best effort: the caller receives None when no estimate is possible.
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {e}")
        return None
def parse(self, document_path, mime_type):
mode = settings.OCR_MODE
@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
if self.is_image(mime_type):
dpi = self.get_dpi(document_path)
a4_dpi = self.calculate_a4_dpi(document_path)
if dpi:
self.log(
"debug",
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
ocr_args['image_dpi'] = dpi
elif settings.OCR_IMAGE_DPI:
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
elif a4_dpi:
ocr_args['image_dpi'] = a4_dpi
else:
raise ParseError(
f"Cannot produce archive PDF for image {document_path}, "
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file):
if not os.path.isfile(pdf_file):
return None
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)

View File

@@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f)
# Parses the sample image "simple-no-dpi.png" with ocrmypdf.ocr patched out and
# asserts that the single call to the mock received image_dpi=62.
# NOTE(review): presumably 62 is the A4-width-based DPI estimate for that
# sample file — confirm against the image dimensions.
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
parser = RasterisedDocumentParser(None)
# NOTE(review): the `def` line below has no body of its own in this view; it
# looks like the removed signature of the previous test left behind by the
# flattened diff (no +/- markers) — verify against the real file.
def test_image_no_dpi_fail(self):
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
# `m` is the patched ocrmypdf.ocr: it must have been called exactly once.
m.assert_called_once()
# Inspect the keyword arguments of that call.
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):
m.return_value = None
parser = RasterisedDocumentParser(None)
def f():