also apply \0 removal to sidecar contents

This commit is contained in:
jonaswinkler 2021-03-22 23:08:34 +01:00
parent fda2bfbea7
commit 0e596bd1fc
2 changed files with 10 additions and 12 deletions

View File

@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
# This happens when there's already text in the input file.
# The sidecar file will only contain text for OCR'ed pages.
self.log("debug", "Using text from sidecar file")
return text
return post_process_text(text)
else:
self.log("debug", "Incomplete sidecar file: discarding.")
@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file):
return None
from pdfminer.high_level import extract_text
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.pdftypes import PDFException
try:
text = extract_text(pdf_file)
stripped = strip_excess_whitespace(text)
stripped = stripped.replace("\0", " ")
stripped = post_process_text(pdfminer_extract_text(pdf_file))
self.log("debug", f"Extracted text from PDF file {pdf_file}")
return stripped
@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
self.text = ""
def strip_excess_whitespace(text):
def post_process_text(text):
if not text:
return None
@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
r"([^\S\n\r]+)$", '', no_leading_whitespace)
# TODO: this needs a rework
return no_trailing_whitespace.strip()
# Replacing \0 prevents issues when saving to PostgreSQL.
# The text may contain \0 when this character is present in the PDF file.
return no_trailing_whitespace.strip().replace("\0", " ")

View File

@ -7,7 +7,7 @@ from django.test import TestCase, override_settings
from documents.parsers import ParseError, run_convert
from documents.tests.utils import DirectoriesMixin
from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace
from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text
image_to_string_calls = []
@ -32,8 +32,6 @@ class FakeImageFile(ContextManager):
return os.path.basename(self.fname)
class TestParser(DirectoriesMixin, TestCase):
def assertContainsStrings(self, content, strings):
@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase):
)
]
def test_strip_excess_whitespace(self):
def test_post_process_text(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
actual_result = post_process_text(source)
self.assertEqual(
result,
actual_result,