mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Saves work on a new management command to re-ocr a file
This commit is contained in:
		 Trenton Holmes
					Trenton Holmes
				
			
				
					committed by
					
						 Michael Shamoon
						Michael Shamoon
					
				
			
			
				
	
			
			
			 Michael Shamoon
						Michael Shamoon
					
				
			
						parent
						
							5dbea504b7
						
					
				
				
					commit
					823e8e73e1
				
			
							
								
								
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								src/documents/management/commands/document_redo_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,69 @@ | ||||
| import logging | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from typing import Type | ||||
|  | ||||
| from django.core.exceptions import ObjectDoesNotExist | ||||
| from django.core.management.base import BaseCommand | ||||
| from documents.models import Document | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import get_parser_class_for_mime_type | ||||
| from documents.parsers import ParseError | ||||
|  | ||||
|  | ||||
class Command(BaseCommand):
    """Management command that re-runs text extraction (OCR) for documents.

    For each primary key given on the command line, the document's original
    file is copied into the parser's temporary directory, parsed again, and
    the extracted text is stored back into the document's ``content`` field.
    """

    help = """
        This will re-run OCR on the given documents and update their content.
    """.replace(
        "    ",
        "",
    )

    def add_arguments(self, parser):
        """Register the positional list of document primary keys."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse every requested document and save the new text content.

        Documents that do not exist, have no matching parser, or fail to
        parse are reported on stdout and skipped; the remaining documents
        are still processed.
        """
        # Quieten the first root handler, if there is one, so only errors
        # are shown while the command runs.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].setLevel(logging.ERROR)

        all_docs = Document.objects.all()

        # Parsed command-line arguments arrive in ``options``; ``args`` is a
        # plain tuple of extra positionals and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            self.stdout.write(f"Parsing document {doc_pk}")
            try:
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
                continue

            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): presumably the lookup yields None when no parser
            # is registered for the mime type - confirm against
            # documents.parsers. Calling None would raise TypeError anyway.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"with mime type {doc.mime_type}",
                    ),
                )
                continue

            document_parser: DocumentParser = parser_class(
                "redo-ocr",
            )

            # Create a file path to copy the original file to for working on
            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()

            try:
                # Copy inside the try so cleanup still runs if the copy fails
                shutil.copy(doc.source_path, temp_file)

                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)

                doc.content = document_parser.get_text()
                doc.save()

            except ParseError as e:
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
            finally:
                # Remove the working copy if it was created
                if temp_file.exists() and temp_file.is_file():
                    temp_file.unlink()
                # NOTE(review): assumes DocumentParser.cleanup() removes the
                # parser's temporary directory - confirm in documents.parsers.
                document_parser.cleanup()
		Reference in New Issue
	
	Block a user