mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-30 18:27:45 -05:00
also apply \0 removal to sidecar contents
This commit is contained in:
@@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# This happens when there's already text in the input file.
|
||||
# The sidecar file will only contain text for OCR'ed pages.
|
||||
self.log("debug", "Using text from sidecar file")
|
||||
return text
|
||||
return post_process_text(text)
|
||||
else:
|
||||
self.log("debug", "Incomplete sidecar file: discarding.")
|
||||
|
||||
@@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if not os.path.isfile(pdf_file):
|
||||
return None
|
||||
|
||||
from pdfminer.high_level import extract_text
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||
from pdfminer.pdftypes import PDFException
|
||||
|
||||
try:
|
||||
text = extract_text(pdf_file)
|
||||
stripped = strip_excess_whitespace(text)
|
||||
stripped = stripped.replace("\0", " ")
|
||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||
|
||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||
return stripped
|
||||
@@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.text = ""
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
|
||||
def post_process_text(text):
|
||||
if not text:
|
||||
return None
|
||||
|
||||
@@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
|
||||
r"([^\S\n\r]+)$", '', no_leading_whitespace)
|
||||
|
||||
# TODO: this needs a rework
|
||||
return no_trailing_whitespace.strip()
|
||||
# Replacing \0 (NUL) with a space prevents errors when saving to PostgreSQL.
|
||||
# The text may contain \0 when that character is present in the PDF file.
|
||||
return no_trailing_whitespace.strip().replace("\0", " ")
|
||||
|
Reference in New Issue
Block a user