From cac63494f04b8ab6278db2f4ce862e11efde2712 Mon Sep 17 00:00:00 2001 From: Joshua Taillon Date: Wed, 5 Sep 2018 15:18:35 -0400 Subject: [PATCH] change tesseract parser to only convert first page to save (potentially) massive amounts of work --- src/paperless_tesseract/parsers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..4216ec230 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser): self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - self.document_path, os.path.join(self.tempdir, "convert-%04d.png") + "{}[0]".format(self.document_path), + os.path.join(self.tempdir, "convert.png") ) - return os.path.join(self.tempdir, "convert-0000.png") + return os.path.join(self.tempdir, "convert.png") def _is_ocred(self):