mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-19 10:19:27 -05:00
If extracting text from a fallback file (i.e., forced), allow the text to be used
This commit is contained in:
parent
28b26eb4c7
commit
26c7fad005
@ -1,6 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
@ -99,7 +101,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log("warning", f"Error while calculating DPI for image {image}: {e}")
|
self.log("warning", f"Error while calculating DPI for image {image}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_text(self, sidecar_file, pdf_file):
|
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
|
||||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||||
# the whole text, so do not utilize it in that case
|
# the whole text, so do not utilize it in that case
|
||||||
if (
|
if (
|
||||||
@ -139,11 +141,15 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
self.log("debug", f"Detected language {lang}")
|
self.log("debug", f"Detected language {lang}")
|
||||||
|
|
||||||
if lang in {
|
if (
|
||||||
|
lang
|
||||||
|
in {
|
||||||
"ar", # Arabic
|
"ar", # Arabic
|
||||||
"he", # Hebrew,
|
"he", # Hebrew,
|
||||||
"fa", # Persian
|
"fa", # Persian
|
||||||
}:
|
}
|
||||||
|
and pdf_file.name != "archive-fallback.pdf"
|
||||||
|
):
|
||||||
raise RtlLanguageException()
|
raise RtlLanguageException()
|
||||||
return stripped
|
return stripped
|
||||||
except RtlLanguageException:
|
except RtlLanguageException:
|
||||||
@ -275,7 +281,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return ocrmypdf_args
|
return ocrmypdf_args
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||||
# This forces tesseract to use one core per page.
|
# This forces tesseract to use one core per page.
|
||||||
os.environ["OMP_THREAD_LIMIT"] = "1"
|
os.environ["OMP_THREAD_LIMIT"] = "1"
|
||||||
|
|
||||||
@ -300,8 +306,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
|
|
||||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
|
||||||
sidecar_file = os.path.join(self.tempdir, "sidecar.txt")
|
sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
|
||||||
|
|
||||||
args = self.construct_ocrmypdf_parameters(
|
args = self.construct_ocrmypdf_parameters(
|
||||||
document_path,
|
document_path,
|
||||||
@ -335,8 +341,12 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
f"Attempting force OCR to get the text.",
|
f"Attempting force OCR to get the text.",
|
||||||
)
|
)
|
||||||
|
|
||||||
archive_path_fallback = os.path.join(self.tempdir, "archive-fallback.pdf")
|
archive_path_fallback = Path(
|
||||||
sidecar_file_fallback = os.path.join(self.tempdir, "sidecar-fallback.txt")
|
os.path.join(self.tempdir, "archive-fallback.pdf"),
|
||||||
|
)
|
||||||
|
sidecar_file_fallback = Path(
|
||||||
|
os.path.join(self.tempdir, "sidecar-fallback.txt"),
|
||||||
|
)
|
||||||
|
|
||||||
# Attempt to run OCR with safe settings.
|
# Attempt to run OCR with safe settings.
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user