mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
also apply \0 removal to sidecar contents
This commit is contained in:
parent
fda2bfbea7
commit
0e596bd1fc
@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
# This happens when there's already text in the input file.
|
# This happens when there's already text in the input file.
|
||||||
# The sidecar file will only contain text for OCR'ed pages.
|
# The sidecar file will only contain text for OCR'ed pages.
|
||||||
self.log("debug", "Using text from sidecar file")
|
self.log("debug", "Using text from sidecar file")
|
||||||
return text
|
return post_process_text(text)
|
||||||
else:
|
else:
|
||||||
self.log("debug", "Incomplete sidecar file: discarding.")
|
self.log("debug", "Incomplete sidecar file: discarding.")
|
||||||
|
|
||||||
@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if not os.path.isfile(pdf_file):
|
if not os.path.isfile(pdf_file):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
from pdfminer.high_level import extract_text
|
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||||
from pdfminer.pdftypes import PDFException
|
from pdfminer.pdftypes import PDFException
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = extract_text(pdf_file)
|
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||||
stripped = strip_excess_whitespace(text)
|
|
||||||
stripped = stripped.replace("\0", " ")
|
|
||||||
|
|
||||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||||
return stripped
|
return stripped
|
||||||
@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
|
|
||||||
def strip_excess_whitespace(text):
|
def post_process_text(text):
|
||||||
if not text:
|
if not text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
|
|||||||
r"([^\S\n\r]+)$", '', no_leading_whitespace)
|
r"([^\S\n\r]+)$", '', no_leading_whitespace)
|
||||||
|
|
||||||
# TODO: this needs a rework
|
# TODO: this needs a rework
|
||||||
return no_trailing_whitespace.strip()
|
# replace \0 prevents issues with saving to postgres.
|
||||||
|
# text may contain \0 when this character is present in PDF files.
|
||||||
|
return no_trailing_whitespace.strip().replace("\0", " ")
|
||||||
|
@ -7,7 +7,7 @@ from django.test import TestCase, override_settings
|
|||||||
|
|
||||||
from documents.parsers import ParseError, run_convert
|
from documents.parsers import ParseError, run_convert
|
||||||
from documents.tests.utils import DirectoriesMixin
|
from documents.tests.utils import DirectoriesMixin
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace
|
from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text
|
||||||
|
|
||||||
image_to_string_calls = []
|
image_to_string_calls = []
|
||||||
|
|
||||||
@ -32,8 +32,6 @@ class FakeImageFile(ContextManager):
|
|||||||
return os.path.basename(self.fname)
|
return os.path.basename(self.fname)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, TestCase):
|
class TestParser(DirectoriesMixin, TestCase):
|
||||||
|
|
||||||
def assertContainsStrings(self, content, strings):
|
def assertContainsStrings(self, content, strings):
|
||||||
@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
def test_strip_excess_whitespace(self):
|
def test_post_process_text(self):
|
||||||
for source, result in self.text_cases:
|
for source, result in self.text_cases:
|
||||||
actual_result = strip_excess_whitespace(source)
|
actual_result = post_process_text(source)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
result,
|
result,
|
||||||
actual_result,
|
actual_result,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user