From d26c46e034d75ce5f04e16de3b171aa542bda5ab Mon Sep 17 00:00:00 2001 From: jonaswinkler <17569239+jonaswinkler@users.noreply.github.com> Date: Mon, 22 Mar 2021 22:46:35 +0100 Subject: [PATCH] fixes #794 --- src/paperless_tesseract/parsers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 589b25e37..944bda601 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -119,6 +119,8 @@ class RasterisedDocumentParser(DocumentParser): try: text = extract_text(pdf_file) stripped = strip_excess_whitespace(text) + stripped = stripped.replace("\0", " ") + self.log("debug", f"Extracted text from PDF file {pdf_file}") return stripped except PDFException: