mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'dev' into feature-bulk-edit
This commit is contained in:
@@ -2,6 +2,7 @@ import logging
|
||||
|
||||
import tqdm
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models.signals import post_save
|
||||
|
||||
from documents.models import Document
|
||||
from ...mixins import Renderable
|
||||
@@ -24,5 +25,4 @@ class Command(Renderable, BaseCommand):
|
||||
logging.getLogger().handlers[0].level = logging.ERROR
|
||||
|
||||
for document in tqdm.tqdm(Document.objects.all()):
|
||||
# Saving the document again will generate a new filename and rename
|
||||
document.save()
|
||||
post_save.send(Document, instance=document)
|
||||
|
@@ -7,6 +7,7 @@ from django.contrib.admin.models import ADDITION, LogEntry
|
||||
from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.db import models, DatabaseError
|
||||
from django.db.models import Q
|
||||
from django.dispatch import receiver
|
||||
from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
@@ -121,11 +122,14 @@ def set_tags(sender,
|
||||
classifier=None,
|
||||
replace=False,
|
||||
**kwargs):
|
||||
|
||||
if replace:
|
||||
document.tags.clear()
|
||||
current_tags = set([])
|
||||
else:
|
||||
current_tags = set(document.tags.all())
|
||||
Document.tags.through.objects.filter(document=document).exclude(
|
||||
Q(tag__is_inbox_tag=True)).exclude(
|
||||
Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO)
|
||||
).delete()
|
||||
|
||||
current_tags = set(document.tags.all())
|
||||
|
||||
matched_tags = matching.match_tags(document.content, classifier)
|
||||
|
||||
|
@@ -14,6 +14,12 @@ class TestRetagger(DirectoriesMixin, TestCase):
|
||||
|
||||
self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
|
||||
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
|
||||
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
|
||||
self.tag_no_match = Tag.objects.create(name="test2")
|
||||
|
||||
self.d3.tags.add(self.tag_inbox)
|
||||
self.d3.tags.add(self.tag_no_match)
|
||||
|
||||
|
||||
self.correspondent_first = Correspondent.objects.create(
|
||||
name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
|
||||
@@ -38,7 +44,7 @@ class TestRetagger(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertEqual(d_first.tags.count(), 1)
|
||||
self.assertEqual(d_second.tags.count(), 1)
|
||||
self.assertEqual(d_unrelated.tags.count(), 0)
|
||||
self.assertEqual(d_unrelated.tags.count(), 2)
|
||||
|
||||
self.assertEqual(d_first.tags.first(), self.tag_first)
|
||||
self.assertEqual(d_second.tags.first(), self.tag_second)
|
||||
@@ -56,3 +62,17 @@ class TestRetagger(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertEqual(d_first.correspondent, self.correspondent_first)
|
||||
self.assertEqual(d_second.correspondent, self.correspondent_second)
|
||||
|
||||
def test_overwrite_preserve_inbox(self):
|
||||
self.d1.tags.add(self.tag_second)
|
||||
|
||||
call_command('document_retagger', '--tags', '--overwrite')
|
||||
|
||||
d_first, d_second, d_unrelated = self.get_updated_docs()
|
||||
|
||||
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
|
||||
|
||||
self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
|
||||
self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
|
||||
self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])
|
||||
|
||||
|
@@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image):
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, height = im.size
|
||||
# divide image width by A4 width (210mm) in inches.
|
||||
dpi = int(width / (21 / 2.54))
|
||||
self.log(
|
||||
'debug',
|
||||
f"Estimated DPI {dpi} based on image width {width}"
|
||||
)
|
||||
return dpi
|
||||
|
||||
except Exception as e:
|
||||
self.log(
|
||||
'warning',
|
||||
f"Error while calculating DPI for image {image}: {e}")
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type):
|
||||
mode = settings.OCR_MODE
|
||||
|
||||
@@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
if self.is_image(mime_type):
|
||||
dpi = self.get_dpi(document_path)
|
||||
a4_dpi = self.calculate_a4_dpi(document_path)
|
||||
if dpi:
|
||||
self.log(
|
||||
"debug",
|
||||
@@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
ocr_args['image_dpi'] = dpi
|
||||
elif settings.OCR_IMAGE_DPI:
|
||||
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
||||
elif a4_dpi:
|
||||
ocr_args['image_dpi'] = a4_dpi
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {document_path}, "
|
||||
@@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
with open(pdf_file, "rb") as f:
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
|
@@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertRaises(ParseError, f)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
|
||||
def test_image_calc_a4_dpi(self, m):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def test_image_no_dpi_fail(self):
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
args, kwargs = m.call_args
|
||||
|
||||
self.assertEqual(kwargs['image_dpi'], 62)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||
def test_image_dpi_fail(self, m):
|
||||
m.return_value = None
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def f():
|
||||
|
Reference in New Issue
Block a user