mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	more work on barcode
Signed-off-by: florian on nixos (Florian Brandes) <florian.brandes@posteo.de>
This commit is contained in:
		 florian on nixos (Florian Brandes)
					florian on nixos (Florian Brandes)
				
			
				
					committed by
					
						 Florian Brandes
						Florian Brandes
					
				
			
			
				
	
			
			
			 Florian Brandes
						Florian Brandes
					
				
			
						parent
						
							b787971421
						
					
				
				
					commit
					7736934e43
				
			| @@ -22,6 +22,7 @@ from pyzbar import pyzbar | |||||||
| from pdf2image import convert_from_path | from pdf2image import convert_from_path | ||||||
| import tempfile | import tempfile | ||||||
| from pikepdf import Pdf | from pikepdf import Pdf | ||||||
|  | import shutil | ||||||
|  |  | ||||||
| logger = logging.getLogger("paperless.tasks") | logger = logging.getLogger("paperless.tasks") | ||||||
|  |  | ||||||
| @@ -91,25 +92,25 @@ def barcode_reader(image) -> list: | |||||||
|     return barcodes |     return barcodes | ||||||
|  |  | ||||||
|  |  | ||||||
| def scan_file_for_seperating_barcodes(filepath: str) -> list: | def scan_file_for_separating_barcodes(filepath: str) -> list: | ||||||
|     """ |     """ | ||||||
|     Scan the provided file for page seperating barcodes |     Scan the provided file for page separating barcodes | ||||||
|     Returns a list of pagenumbers, which seperate the file |     Returns a list of pagenumbers, which separate the file | ||||||
|     """ |     """ | ||||||
|     seperator_page_numbers = [] |     separator_page_numbers = [] | ||||||
|     # use a temporary directory in case the file is too big to handle in memory |     # use a temporary directory in case the file is too big to handle in memory | ||||||
|     with tempfile.TemporaryDirectory() as path: |     with tempfile.TemporaryDirectory() as path: | ||||||
|         pages_from_path = convert_from_path(filepath, output_folder=path) |         pages_from_path = convert_from_path(filepath, output_folder=path) | ||||||
|         for current_page_number, page in enumerate(pages_from_path): |         for current_page_number, page in enumerate(pages_from_path): | ||||||
|             current_barcodes = barcode_reader(page) |             current_barcodes = barcode_reader(page) | ||||||
|             if "b'PATCHT'" in current_barcodes: |             if "b'PATCHT'" in current_barcodes: | ||||||
|                 seperator_page_numbers = seperator_page_numbers + [current_page_number] |                 separator_page_numbers = separator_page_numbers + [current_page_number] | ||||||
|     return seperator_page_numbers |     return separator_page_numbers | ||||||
|  |  | ||||||
|  |  | ||||||
| def seperate_pages(filepath: str, pages_to_split_on: list) -> list: | def separate_pages(filepath: str, pages_to_split_on: list) -> list: | ||||||
|     """ |     """ | ||||||
|     Seperate the provided file on the pages_to_split_on. |     Separate the provided file on the pages_to_split_on. | ||||||
|     The pages which are defined by page_numbers will be removed. |     The pages which are defined by page_numbers will be removed. | ||||||
|     Returns a list of (temporary) filepaths to consume. |     Returns a list of (temporary) filepaths to consume. | ||||||
|     These will need to be deleted later. |     These will need to be deleted later. | ||||||
| @@ -156,6 +157,14 @@ def seperate_pages(filepath: str, pages_to_split_on: list) -> list: | |||||||
|     logger.debug(f"Temp files are {str(document_paths)}") |     logger.debug(f"Temp files are {str(document_paths)}") | ||||||
|     return document_paths |     return document_paths | ||||||
|  |  | ||||||
|  | def save_to_dir(filepath, target_dir=settings.CONSUMPTION_DIR): | ||||||
|  |     """ | ||||||
|  |     Copies filepath to target_dir. | ||||||
|  |     """ | ||||||
|  |     if os.path.isfile(filepath) and os.path.isdir(target_dir): | ||||||
|  |         shutil.copy(filepath, target_dir) | ||||||
|  |     else: | ||||||
|  |         logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") | ||||||
|  |  | ||||||
| def consume_file( | def consume_file( | ||||||
|     path, |     path, | ||||||
| @@ -167,10 +176,10 @@ def consume_file( | |||||||
|     task_id=None, |     task_id=None, | ||||||
| ): | ): | ||||||
|  |  | ||||||
|     # check for seperators in current document |     # check for separators in current document | ||||||
|     seperator_page_numbers = scan_file_for_seperating_barcodes(path) |     separator_page_numbers = scan_file_for_separating_barcodes(path) | ||||||
|     if seperator_page_numbers != []: |     if separator_page_numbers != []: | ||||||
|         logger.debug(f"Pages with seperators found: {str(seperator_page_numbers)}") |         logger.debug(f"Pages with separators found: {str(separator_page_numbers)}") | ||||||
|  |  | ||||||
|     document = Consumer().try_consume_file( |     document = Consumer().try_consume_file( | ||||||
|         path, |         path, | ||||||
|   | |||||||
| @@ -14,6 +14,7 @@ from documents.sanity_checker import SanityCheckMessages | |||||||
| from documents.tests.utils import DirectoriesMixin | from documents.tests.utils import DirectoriesMixin | ||||||
|  |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
|  | import tempfile | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestTasks(DirectoriesMixin, TestCase): | class TestTasks(DirectoriesMixin, TestCase): | ||||||
| @@ -103,33 +104,72 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         img = Image.open(test_file) |         img = Image.open(test_file) | ||||||
|         self.assertEqual(tasks.barcode_reader(img), []) |         self.assertEqual(tasks.barcode_reader(img), []) | ||||||
|  |  | ||||||
|     def test_scan_file_for_seperating_barcodes(self): |     def test_scan_file_for_separating_barcodes(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             os.path.dirname(__file__), "samples", "patch-code-t.pdf" |             os.path.dirname(__file__), "samples", "patch-code-t.pdf" | ||||||
|         ) |         ) | ||||||
|         pages = tasks.scan_file_for_seperating_barcodes(test_file) |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|         self.assertEqual(pages, [0]) |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|     def test_scan_file_for_seperating_barcodes2(self): |     def test_scan_file_for_separating_barcodes2(self): | ||||||
|         test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") |         test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||||
|         pages = tasks.scan_file_for_seperating_barcodes(test_file) |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|         self.assertEqual(pages, []) |         self.assertEqual(pages, []) | ||||||
|  |  | ||||||
|     def test_scan_file_for_seperating_barcodes3(self): |     def test_scan_file_for_separating_barcodes3(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             os.path.dirname(__file__), "samples", "patch-code-t-middle.pdf" |             os.path.dirname(__file__), "samples", "patch-code-t-middle.pdf" | ||||||
|         ) |         ) | ||||||
|         pages = tasks.scan_file_for_seperating_barcodes(test_file) |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|         self.assertEqual(pages, [1]) |         self.assertEqual(pages, [1]) | ||||||
|  |  | ||||||
|     def test_seperate_pages(self): |     def test_separate_pages(self): | ||||||
|         test_file = os.path.join( |         test_file = os.path.join( | ||||||
|             os.path.dirname(__file__), "samples", "patch-code-t-middle.pdf" |             os.path.dirname(__file__), "samples", "patch-code-t-middle.pdf" | ||||||
|         ) |         ) | ||||||
|         pages = tasks.seperate_pages(test_file, [1]) |         pages = tasks.separate_pages(test_file, [1]) | ||||||
|  |  | ||||||
|         self.assertEqual(len(pages), 2) |         self.assertEqual(len(pages), 2) | ||||||
|  |  | ||||||
|  |     def test_save_to_dir(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), "samples", "patch-code-t.pdf" | ||||||
|  |         ) | ||||||
|  |         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |         tasks.save_to_dir(test_file, tempdir) | ||||||
|  |         target_file = os.path.join(tempdir, "patch-code-t.pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(target_file)) | ||||||
|  |  | ||||||
|  |     def test_save_to_dir2(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), "samples", "patch-code-t.pdf" | ||||||
|  |         ) | ||||||
|  |         nonexistingdir = "/nowhere" | ||||||
|  |         if not os.path.isdir(nonexistingdir): | ||||||
|  |             with self.assertLogs("paperless.tasks", level="WARNING") as cm: | ||||||
|  |                 tasks.save_to_dir(test_file, nonexistingdir) | ||||||
|  |             self.assertEqual( | ||||||
|  |                 cm.output, | ||||||
|  |                 [ | ||||||
|  |                     f"WARNING:paperless.tasks:{str(test_file)} or {str(nonexistingdir)} don't exist." | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |     def test_barcode_splitter(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), "samples", "patch-code-t-middle.pdf" | ||||||
|  |         ) | ||||||
|  |         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |         separators = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertTrue(separators != []) | ||||||
|  |         document_list = tasks.separate_pages(test_file, separators) | ||||||
|  |         self.assertTrue(document_list != []) | ||||||
|  |         for document in document_list: | ||||||
|  |             tasks.save_to_dir(document, tempdir) | ||||||
|  |         target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") | ||||||
|  |         target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(target_file1)) | ||||||
|  |         self.assertTrue(os.path.isfile(target_file2)) | ||||||
|  |  | ||||||
|     @mock.patch("documents.tasks.sanity_checker.check_sanity") |     @mock.patch("documents.tasks.sanity_checker.check_sanity") | ||||||
|     def test_sanity_check_success(self, m): |     def test_sanity_check_success(self, m): | ||||||
|         m.return_value = SanityCheckMessages() |         m.return_value = SanityCheckMessages() | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user