new setting: PAPERLESS_OCR_PAGES

This commit is contained in:
Jonas Winkler
2020-11-22 12:54:08 +01:00
parent ea089de3b3
commit fec9e54049
6 changed files with 54 additions and 5 deletions

View File

@@ -147,18 +147,25 @@ class RasterisedDocumentParser(DocumentParser):
 Greyscale images are easier for Tesseract to OCR
 """
+# Convert PDF to multiple PNMs
+input_file = self.document_path
+if settings.OCR_PAGES == 1:
+input_file += "[0]"
+elif settings.OCR_PAGES > 1:
+input_file += f"[0-{settings.OCR_PAGES - 1}]"
 self.log(
 "debug",
-f"Converting document {self.document_path} into greyscale images")
+f"Converting document {input_file} into greyscale images")
-# Convert PDF to multiple PNMs
-pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
+output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
 run_convert(density=settings.CONVERT_DENSITY,
 depth="8",
 type="grayscale",
-input_file=self.document_path,
-output_file=pnm,
+input_file=input_file,
+output_file=output_files,
 logging_group=self.logging_group)
 # Get a list of converted images