mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Don't use the sidecar file when redoing OCR: it contains only the newly recognized text, not the whole text
This commit is contained in:
parent
54f20b381e
commit
b897d6de2e
@ -95,7 +95,13 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_text(self, sidecar_file, pdf_file):
|
def extract_text(self, sidecar_file, pdf_file):
|
||||||
if sidecar_file and os.path.isfile(sidecar_file):
|
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||||
|
# the whole text, so do not utilize it in that case
|
||||||
|
if (
|
||||||
|
sidecar_file is not None
|
||||||
|
and os.path.isfile(sidecar_file)
|
||||||
|
and settings.OCR_MODE != "redo"
|
||||||
|
):
|
||||||
with open(sidecar_file) as f:
|
with open(sidecar_file) as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
@ -142,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
"input_file": input_file,
|
"input_file": input_file,
|
||||||
"output_file": output_file,
|
"output_file": output_file,
|
||||||
# need to use threads, since this will be run in daemonized
|
# need to use threads, since this will be run in daemonized
|
||||||
# processes by django-q.
|
# processes via the task library.
|
||||||
"use_threads": True,
|
"use_threads": True,
|
||||||
"jobs": settings.THREADS_PER_WORKER,
|
"jobs": settings.THREADS_PER_WORKER,
|
||||||
"language": settings.OCR_LANGUAGE,
|
"language": settings.OCR_LANGUAGE,
|
||||||
@ -165,9 +171,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if settings.OCR_MODE == "redo":
|
if settings.OCR_MODE == "redo":
|
||||||
ocrmypdf_args["clean"] = True
|
ocrmypdf_args["clean"] = True
|
||||||
else:
|
else:
|
||||||
|
# --clean-final is not compatible with --redo-ocr
|
||||||
ocrmypdf_args["clean_final"] = True
|
ocrmypdf_args["clean_final"] = True
|
||||||
|
|
||||||
if settings.OCR_DESKEW and not settings.OCR_MODE == "redo":
|
if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
|
||||||
|
# --deskew is not compatible with --redo-ocr
|
||||||
ocrmypdf_args["deskew"] = True
|
ocrmypdf_args["deskew"] = True
|
||||||
|
|
||||||
if settings.OCR_ROTATE_PAGES:
|
if settings.OCR_ROTATE_PAGES:
|
||||||
@ -263,7 +271,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
# Either no text was in the original or there should be an archive
|
# Either no text was in the original or there should be an archive
|
||||||
# file created, so OCR the file and create an archive with any
|
# file created, so OCR the file and create an archive with any
|
||||||
# test located via OCR
|
# text located via OCR
|
||||||
|
|
||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
|
Loading…
x
Reference in New Issue
Block a user