Change the Tesseract parser to convert only the first page, saving (potentially) massive amounts of work

This commit is contained in:
Joshua Taillon 2018-09-05 15:18:35 -04:00
parent 939a67bd4b
commit cac63494f0

View File

@@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser):
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
"{}[0]".format(self.document_path),
os.path.join(self.tempdir, "convert.png")
)
return os.path.join(self.tempdir, "convert-0000.png")
return os.path.join(self.tempdir, "convert.png")
def _is_ocred(self):