mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fixes bauerj/paperless_app#23 and most other scanner apps out there.
This commit is contained in:
parent
bf9051e44d
commit
a0631413d6
@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
f"Error while getting DPI from image {image}: {e}")
|
f"Error while getting DPI from image {image}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def calculate_a4_dpi(self, image):
    """Estimate an image's DPI by assuming it depicts a full-width A4 page.

    The pixel width of ``image`` is divided by the width of an A4 sheet
    (210 mm, converted to inches), and the result is truncated to an int.

    :param image: path or file object accepted by ``Image.open``.
    :return: the estimated DPI as an ``int``, or ``None`` if anything
        goes wrong while opening or measuring the image (a warning is
        logged in that case).
    """
    try:
        with Image.open(image) as img:
            pixel_width, _ = img.size
            # A4 is 21 cm wide; 2.54 cm per inch.
            a4_width_inches = 21 / 2.54
            estimated = int(pixel_width / a4_width_inches)
            self.log(
                'debug',
                f"Estimated DPI {estimated} based on image width {pixel_width}"
            )
            return estimated
    except Exception as err:
        # Best effort only: the caller treats None as "no estimate".
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {err}")
        return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type):
|
||||||
mode = settings.OCR_MODE
|
mode = settings.OCR_MODE
|
||||||
|
|
||||||
@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
if self.is_image(mime_type):
|
if self.is_image(mime_type):
|
||||||
dpi = self.get_dpi(document_path)
|
dpi = self.get_dpi(document_path)
|
||||||
|
a4_dpi = self.calculate_a4_dpi(document_path)
|
||||||
if dpi:
|
if dpi:
|
||||||
self.log(
|
self.log(
|
||||||
"debug",
|
"debug",
|
||||||
@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
ocr_args['image_dpi'] = dpi
|
ocr_args['image_dpi'] = dpi
|
||||||
elif settings.OCR_IMAGE_DPI:
|
elif settings.OCR_IMAGE_DPI:
|
||||||
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
||||||
|
elif a4_dpi:
|
||||||
|
ocr_args['image_dpi'] = a4_dpi
|
||||||
else:
|
else:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Cannot produce archive PDF for image {document_path}, "
|
f"Cannot produce archive PDF for image {document_path}, "
|
||||||
@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
|
|||||||
|
|
||||||
def get_text_from_pdf(pdf_file):
|
def get_text_from_pdf(pdf_file):
|
||||||
|
|
||||||
|
if not os.path.isfile(pdf_file):
|
||||||
|
return None
|
||||||
|
|
||||||
with open(pdf_file, "rb") as f:
|
with open(pdf_file, "rb") as f:
|
||||||
try:
|
try:
|
||||||
pdf = pdftotext.PDF(f)
|
pdf = pdftotext.PDF(f)
|
||||||
|
@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
self.assertRaises(ParseError, f)
|
self.assertRaises(ParseError, f)
|
||||||
|
|
||||||
|
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
|
||||||
|
def test_image_calc_a4_dpi(self, m):
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
|
||||||
def test_image_no_dpi_fail(self):
|
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
|
||||||
|
|
||||||
|
m.assert_called_once()
|
||||||
|
|
||||||
|
args, kwargs = m.call_args
|
||||||
|
|
||||||
|
self.assertEqual(kwargs['image_dpi'], 62)
|
||||||
|
|
||||||
|
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||||
|
def test_image_dpi_fail(self, m):
|
||||||
|
m.return_value = None
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
|
|
||||||
def f():
|
def f():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user