mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Also apply \0 (NUL character) removal to sidecar contents
This commit is contained in:
		| @@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                 # This happens when there's already text in the input file. |                 # This happens when there's already text in the input file. | ||||||
|                 # The sidecar file will only contain text for OCR'ed pages. |                 # The sidecar file will only contain text for OCR'ed pages. | ||||||
|                 self.log("debug", "Using text from sidecar file") |                 self.log("debug", "Using text from sidecar file") | ||||||
|                 return text |                 return post_process_text(text) | ||||||
|             else: |             else: | ||||||
|                 self.log("debug", "Incomplete sidecar file: discarding.") |                 self.log("debug", "Incomplete sidecar file: discarding.") | ||||||
|  |  | ||||||
| @@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         if not os.path.isfile(pdf_file): |         if not os.path.isfile(pdf_file): | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|         from pdfminer.high_level import extract_text |         from pdfminer.high_level import extract_text as pdfminer_extract_text | ||||||
|         from pdfminer.pdftypes import PDFException |         from pdfminer.pdftypes import PDFException | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             text = extract_text(pdf_file) |             stripped = post_process_text(pdfminer_extract_text(pdf_file)) | ||||||
|             stripped = strip_excess_whitespace(text) |  | ||||||
|             stripped = stripped.replace("\0", " ") |  | ||||||
|  |  | ||||||
|             self.log("debug", f"Extracted text from PDF file {pdf_file}") |             self.log("debug", f"Extracted text from PDF file {pdf_file}") | ||||||
|             return stripped |             return stripped | ||||||
| @@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                 self.text = "" |                 self.text = "" | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): | def post_process_text(text): | ||||||
|     if not text: |     if not text: | ||||||
|         return None |         return None | ||||||
|  |  | ||||||
| @@ -307,4 +305,6 @@ def strip_excess_whitespace(text): | |||||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) |         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||||
|  |  | ||||||
|     # TODO: this needs a rework |     # TODO: this needs a rework | ||||||
|     return no_trailing_whitespace.strip() |     # replace \0 prevents issues with saving to postgres. | ||||||
|  |     # text may contain \0 when this character is present in PDF files. | ||||||
|  |     return no_trailing_whitespace.strip().replace("\0", " ") | ||||||
|   | |||||||
| @@ -7,7 +7,7 @@ from django.test import TestCase, override_settings | |||||||
|  |  | ||||||
| from documents.parsers import ParseError, run_convert | from documents.parsers import ParseError, run_convert | ||||||
| from documents.tests.utils import DirectoriesMixin | from documents.tests.utils import DirectoriesMixin | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace | from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text | ||||||
|  |  | ||||||
| image_to_string_calls = [] | image_to_string_calls = [] | ||||||
|  |  | ||||||
| @@ -32,8 +32,6 @@ class FakeImageFile(ContextManager): | |||||||
|         return os.path.basename(self.fname) |         return os.path.basename(self.fname) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestParser(DirectoriesMixin, TestCase): | class TestParser(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|     def assertContainsStrings(self, content, strings): |     def assertContainsStrings(self, content, strings): | ||||||
| @@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase): | |||||||
|         ) |         ) | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|     def test_strip_excess_whitespace(self): |     def test_post_process_text(self): | ||||||
|         for source, result in self.text_cases: |         for source, result in self.text_cases: | ||||||
|             actual_result = strip_excess_whitespace(source) |             actual_result = post_process_text(source) | ||||||
|             self.assertEqual( |             self.assertEqual( | ||||||
|                 result, |                 result, | ||||||
|                 actual_result, |                 actual_result, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler