mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Saves work on a new management command to re-OCR a file
This commit is contained in:
		 Trenton Holmes
					Trenton Holmes
				
			
				
					committed by
					
						 Michael Shamoon
						Michael Shamoon
					
				
			
			
				
	
			
			
			 Michael Shamoon
						Michael Shamoon
					
				
			
						parent
						
							5dbea504b7
						
					
				
				
					commit
					823e8e73e1
				
			
							
								
								
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,69 @@ | |||||||
|  | import logging | ||||||
|  | import shutil | ||||||
|  | from pathlib import Path | ||||||
|  | from typing import Type | ||||||
|  |  | ||||||
|  | from django.core.exceptions import ObjectDoesNotExist | ||||||
|  | from django.core.management.base import BaseCommand | ||||||
|  | from documents.models import Document | ||||||
|  | from documents.parsers import DocumentParser | ||||||
|  | from documents.parsers import get_parser_class_for_mime_type | ||||||
|  | from documents.parsers import ParseError | ||||||
|  |  | ||||||
|  |  | ||||||
class Command(BaseCommand):
    """Management command that re-parses documents and refreshes their text.

    For every primary key given on the command line, the original file is
    copied into the parser's temporary directory, parsed again, and the
    extracted text is stored back into ``Document.content``.
    """

    help = (
        "Re-runs the parser (OCR) on the given documents and updates "
        "their stored text content."
    )

    def add_arguments(self, parser):
        """Register the positional list of document primary keys."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse each requested document and save the extracted text.

        Documents that do not exist, or that fail with a ``ParseError``,
        are reported on stdout and skipped; processing continues with the
        remaining primary keys.
        """
        # Quieten the first root handler so only errors are shown while the
        # command runs. Guarded because the root logger may have no handlers.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].level = logging.ERROR

        all_docs = Document.objects.all()

        # Parsed command-line values are delivered through ``options``;
        # ``args`` is a plain tuple and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            try:
                self.stdout.write(f"Parsing document {doc_pk}")
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
                continue

            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): presumably the lookup returns None when no parser
            # handles the MIME type - confirm against documents.parsers.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"({doc.mime_type})",
                    ),
                )
                continue

            document_parser: DocumentParser = parser_class(
                "redo-ocr",
            )

            # Work on a copy inside the parser's temp dir so the original
            # file is never touched by the parser.
            temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()

            shutil.copy(doc.source_path, temp_file)

            try:
                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)

                doc.content = document_parser.get_text()
                doc.save()

            except ParseError as e:
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
            finally:
                # Remove the working copy if it was created. is_file() is
                # already False for a missing path.
                # NOTE(review): the parser's temp directory itself is not
                # removed here - check whether DocumentParser offers a
                # cleanup hook that should be called.
                if temp_file.is_file():
                    temp_file.unlink()
		Reference in New Issue
	
	Block a user