mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	also apply \0 removal to sidecar contents
This commit is contained in:
		| @@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 # This happens when there's already text in the input file. | ||||
|                 # The sidecar file will only contain text for OCR'ed pages. | ||||
|                 self.log("debug", "Using text from sidecar file") | ||||
|                 return text | ||||
|                 return post_process_text(text) | ||||
|             else: | ||||
|                 self.log("debug", "Incomplete sidecar file: discarding.") | ||||
|  | ||||
| @@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         if not os.path.isfile(pdf_file): | ||||
|             return None | ||||
|  | ||||
|         from pdfminer.high_level import extract_text | ||||
|         from pdfminer.high_level import extract_text as pdfminer_extract_text | ||||
|         from pdfminer.pdftypes import PDFException | ||||
|  | ||||
|         try: | ||||
|             text = extract_text(pdf_file) | ||||
|             stripped = strip_excess_whitespace(text) | ||||
|             stripped = stripped.replace("\0", " ") | ||||
|             stripped = post_process_text(pdfminer_extract_text(pdf_file)) | ||||
|  | ||||
|             self.log("debug", f"Extracted text from PDF file {pdf_file}") | ||||
|             return stripped | ||||
| @@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 self.text = "" | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
| def post_process_text(text): | ||||
|     if not text: | ||||
|         return None | ||||
|  | ||||
| @@ -307,4 +305,6 @@ def strip_excess_whitespace(text): | ||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|  | ||||
|     # TODO: this needs a rework | ||||
|     return no_trailing_whitespace.strip() | ||||
|     # Replacing \0 prevents issues when saving to PostgreSQL. | ||||
|     # The text may contain \0 when this character is present in the PDF file. | ||||
|     return no_trailing_whitespace.strip().replace("\0", " ") | ||||
|   | ||||
| @@ -7,7 +7,7 @@ from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text | ||||
|  | ||||
| image_to_string_calls = [] | ||||
|  | ||||
| @@ -32,8 +32,6 @@ class FakeImageFile(ContextManager): | ||||
|         return os.path.basename(self.fname) | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def assertContainsStrings(self, content, strings): | ||||
| @@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|     def test_post_process_text(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             actual_result = post_process_text(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler