also apply \0 removal to sidecar contents

2026-02-18 00:29:35 -06:00 · 2021-03-22 23:08:34 +01:00
parent fda2bfbea7
commit 0e596bd1fc
2 changed files with 10 additions and 12 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log("debug", "Using text from sidecar file")
-                return text
+                return post_process_text(text)
            else:
                self.log("debug", "Incomplete sidecar file: discarding.")

@@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
        if not os.path.isfile(pdf_file):
            return None

-        from pdfminer.high_level import extract_text
+        from pdfminer.high_level import extract_text as pdfminer_extract_text
        from pdfminer.pdftypes import PDFException

        try:
-            text = extract_text(pdf_file)
-            stripped = strip_excess_whitespace(text)
-            stripped = stripped.replace("\0", " ")
+            stripped = post_process_text(pdfminer_extract_text(pdf_file))

            self.log("debug", f"Extracted text from PDF file {pdf_file}")
            return stripped
@@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
                self.text = ""


-def strip_excess_whitespace(text):
+def post_process_text(text):
    if not text:
        return None

@@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
        r"([^\S\n\r]+)$", '', no_leading_whitespace)

    # TODO: this needs a rework
-    return no_trailing_whitespace.strip()
+    # Replacing \0 prevents issues when saving to PostgreSQL.
+    # The text may contain \0 when that character is present in the PDF file.
+    return no_trailing_whitespace.strip().replace("\0", " ")