mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fixes bauerj/paperless_app#23; this should also fix most other scanner apps out there.
This commit is contained in:
parent
bf9051e44d
commit
a0631413d6
@ -110,6 +110,24 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image):
    """Estimate an image's DPI by assuming it depicts a full-width A4 page.

    The estimate is the pixel width of the image divided by the width of
    an A4 sheet (210 mm) expressed in inches, truncated to an integer.

    :param image: path (or file object) passed straight to ``Image.open``.
    :return: the estimated DPI as an ``int``, or ``None`` if the image
        could not be opened or measured (a warning is logged in that case).
    """
    # A4 is 21 cm wide; 2.54 cm per inch.
    a4_width_inches = 21 / 2.54

    try:
        with Image.open(image) as im:
            # Only the width is needed; the height plays no part in the
            # estimate.
            width = im.size[0]

        dpi = int(width / a4_width_inches)
        self.log(
            'debug',
            f"Estimated DPI {dpi} based on image width {width}"
        )
        return dpi

    except Exception as e:
        # Deliberately best-effort: any failure here only means that no
        # estimate is available, so report it and let the caller decide.
        self.log(
            'warning',
            f"Error while calculating DPI for image {image}: {e}")
        return None
|
||||
|
||||
def parse(self, document_path, mime_type):
|
||||
mode = settings.OCR_MODE
|
||||
|
||||
@ -162,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
if self.is_image(mime_type):
|
||||
dpi = self.get_dpi(document_path)
|
||||
a4_dpi = self.calculate_a4_dpi(document_path)
|
||||
if dpi:
|
||||
self.log(
|
||||
"debug",
|
||||
@ -170,6 +189,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
ocr_args['image_dpi'] = dpi
|
||||
elif settings.OCR_IMAGE_DPI:
|
||||
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
||||
elif a4_dpi:
|
||||
ocr_args['image_dpi'] = a4_dpi
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {document_path}, "
|
||||
@ -241,6 +262,9 @@ def strip_excess_whitespace(text):
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
with open(pdf_file, "rb") as f:
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
|
@ -164,8 +164,21 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertRaises(ParseError, f)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
|
||||
def test_image_calc_a4_dpi(self, m):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def test_image_no_dpi_fail(self):
|
||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
args, kwargs = m.call_args
|
||||
|
||||
self.assertEqual(kwargs['image_dpi'], 62)
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||
def test_image_dpi_fail(self, m):
|
||||
m.return_value = None
|
||||
parser = RasterisedDocumentParser(None)
|
||||
|
||||
def f():
|
||||
|
Loading…
x
Reference in New Issue
Block a user