mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
also apply \0 removal to sidecar contents
This commit is contained in:
parent
fda2bfbea7
commit
0e596bd1fc
@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# This happens when there's already text in the input file.
|
||||
# The sidecar file will only contain text for OCR'ed pages.
|
||||
self.log("debug", "Using text from sidecar file")
|
||||
return text
|
||||
return post_process_text(text)
|
||||
else:
|
||||
self.log("debug", "Incomplete sidecar file: discarding.")
|
||||
|
||||
@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
from pdfminer.high_level import extract_text
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||
from pdfminer.pdftypes import PDFException
|
||||
|
||||
try:
|
||||
text = extract_text(pdf_file)
|
||||
stripped = strip_excess_whitespace(text)
|
||||
stripped = stripped.replace("\0", " ")
|
||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||
|
||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||
return stripped
|
||||
@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.text = ""
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
|
||||
def post_process_text(text):
|
||||
if not text:
|
||||
return None
|
||||
|
||||
@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
|
||||
r"([^\S\n\r]+)$", '', no_leading_whitespace)
|
||||
|
||||
# TODO: this needs a rework
|
||||
return no_trailing_whitespace.strip()
|
||||
# Replacing \0 prevents issues when saving to PostgreSQL.
|
||||
# text may contain \0 when this character is present in PDF files.
|
||||
return no_trailing_whitespace.strip().replace("\0", " ")
|
||||
|
@ -7,7 +7,7 @@ from django.test import TestCase, override_settings
|
||||
|
||||
from documents.parsers import ParseError, run_convert
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text
|
||||
|
||||
image_to_string_calls = []
|
||||
|
||||
@ -32,8 +32,6 @@ class FakeImageFile(ContextManager):
|
||||
return os.path.basename(self.fname)
|
||||
|
||||
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
def assertContainsStrings(self, content, strings):
|
||||
@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
)
|
||||
]
|
||||
|
||||
def test_strip_excess_whitespace(self):
|
||||
def test_post_process_text(self):
|
||||
for source, result in self.text_cases:
|
||||
actual_result = strip_excess_whitespace(source)
|
||||
actual_result = post_process_text(source)
|
||||
self.assertEqual(
|
||||
result,
|
||||
actual_result,
|
||||
|
Loading…
x
Reference in New Issue
Block a user