paperless-ngx (https://github.com/paperless-ngx/paperless-ngx.git)

Commit: A handy script to redo ocr on all documents
Author: Jonas Winkler
src/documents/consumer.py

@@ -12,9 +12,8 @@ from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier
 from .models import Document, FileInfo
-from .parsers import ParseError
+from .parsers import ParseError, get_parser_class
 from .signals import (
-    document_consumer_declaration,
     document_consumption_finished,
     document_consumption_started
 )
@@ -61,15 +60,6 @@ class Consumer:
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.consume))
 
-        self.parsers = []
-        for response in document_consumer_declaration.send(self):
-            self.parsers.append(response[1])
-
-        if not self.parsers:
-            raise ConsumerError(
-                "No parsers could be found, not even the default.  "
-                "This is a problem."
-            )
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={
@@ -82,6 +72,8 @@ class Consumer:
         Return True if file was consumed
         """
 
+        self.logging_group = uuid.uuid4()
+
         if not re.match(FileInfo.REGEXES["title"], file):
             return False
 
@@ -96,13 +88,13 @@
 
         self.log("info", "Consuming {}".format(doc))
 
-        parser_class = self._get_parser_class(doc)
+        parser_class = get_parser_class(doc)
         if not parser_class:
             self.log(
                 "error", "No parsers could be found for {}".format(doc))
             return False
-
-        self.logging_group = uuid.uuid4()
+        else:
+            self.log("info", "Parser: {}".format(parser_class.__name__))
 
 
         document_consumption_started.send(
@@ -114,6 +106,7 @@ class Consumer:
         document_parser = parser_class(doc, self.logging_group)
 
         try:
+            self.log("info", "Generating thumbnail for {}...".format(doc))
             thumbnail = document_parser.get_optimised_thumbnail()
             date = document_parser.get_date()
             document = self._store(
@@ -154,31 +147,6 @@ class Consumer:
             )
             return True
 
-    def _get_parser_class(self, doc):
-        """
-        Determine the appropriate parser class based on the file
-        """
-
-        options = []
-        for parser in self.parsers:
-            result = parser(doc)
-            if result:
-                options.append(result)
-
-        self.log(
-            "info",
-            "Parsers available: {}".format(
-                ", ".join([str(o["parser"].__name__) for o in options])
-            )
-        )
-
-        if not options:
-            return None
-
-        # Return the parser with the highest weight.
-        return sorted(
-            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
-
     def _store(self, text, doc, thumbnail, date):
 
         file_info = FileInfo.from_path(doc)
@@ -211,10 +179,9 @@ class Consumer:
         self._write(document, doc, document.source_path)
         self._write(document, thumbnail, document.thumbnail_path)
 
+        #TODO: why do we need to save the document again?
         document.save()
 
         self.log("debug", "Completed")
 
         return document
 
     def _write(self, document, source, target):
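With this change the parser lookup no longer lives on Consumer: any caller can ask documents.parsers.get_parser_class() for a parser class, which is exactly what the new management command below relies on. For orientation, here is a minimal sketch (not part of this commit) of how a parser app advertises itself through the document_consumer_declaration signal; ExampleParser and example_handler are made-up names, and the real handler for PDF/image files lives in the paperless_tesseract app.

# Sketch only: how a parser registers itself so get_parser_class() can find it.
# Assumes the usual DocumentParser base class in documents.parsers.
import re

from documents.parsers import DocumentParser
from documents.signals import document_consumer_declaration


class ExampleParser(DocumentParser):
    # A real parser implements get_text(), get_date(), get_optimised_thumbnail(), ...
    pass


def example_handler(sender, **kwargs):
    # The handler's return value is what get_parser_class() sees as response[1]:
    # a callable that either claims a file or declines it.
    def test(doc):
        # Claim the file by returning a dict with "parser" and "weight",
        # or decline by returning None.
        if re.match(r".*\.pdf$", doc.lower()):
            return {"parser": ExampleParser, "weight": 0}
        return None
    return test


# Usually connected in an AppConfig.ready():
document_consumer_declaration.connect(example_handler)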
src/documents/management/commands/document_rerun_ocr.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import argparse
+import threading
+from multiprocessing import Pool
+from multiprocessing.pool import ThreadPool
+
+from django.core.management.base import BaseCommand
+
+from documents.consumer import Consumer
+from documents.models import Log, Document
+from documents.parsers import get_parser_class
+
+
+def process_document(doc):
+    parser_class = get_parser_class(doc.file_name)
+    if not parser_class:
+        print("no parser available")
+    else:
+        print("Parser: {}".format(parser_class.__name__))
+        parser = parser_class(doc.source_path, None)
+        try:
+            text = parser.get_text()
+            doc.content = text
+            doc.save()
+        finally:
+            parser.cleanup()
+
+
+def document_index(value):
+    ivalue = int(value)
+    if not (1 <= ivalue <= Document.objects.count()):
+        raise argparse.ArgumentTypeError(
+            "{} is not a valid document index (out of range)".format(value))
+
+    return ivalue
+
+
+class Command(BaseCommand):
+
+    help = "Performs OCR on all documents again!"
+
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-s", "--start_index",
+            default=None,
+            type=document_index
+        )
+
+    def handle(self, *args, **options):
+
+        docs = Document.objects.all().order_by("added")
+
+        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
+
+        for i in indices:
+            doc = docs[i]
+            print("==================================")
+            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
+            print("==================================")
+            process_document(doc)
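The command walks every document ordered by the "added" field and re-runs text extraction on each one; --start_index is 1-based, so an interrupted run can be resumed part-way through. The usual invocation is python manage.py document_rerun_ocr. As a rough sketch (assuming Django is configured for this project), it can also be driven from a script or shell via call_command:

# Sketch: driving the new command programmatically. Equivalent to
#   python manage.py document_rerun_ocr --start_index 25
from django.core.management import call_command

call_command("document_rerun_ocr")                   # re-OCR every document
call_command("document_rerun_ocr", start_index=25)   # resume at the 25th document (1-based)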
src/documents/parsers.py

@@ -20,6 +20,8 @@ from django.utils import timezone
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+from documents.signals import document_consumer_declaration
+
 DATE_REGEX = re.compile(
     r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
     r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
@@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)
 
 
+def get_parser_class(doc):
+    """
+    Determine the appropriate parser class based on the file
+    """
+
+    parsers = []
+    for response in document_consumer_declaration.send(None):
+        parsers.append(response[1])
+
+    #TODO: add a check that checks parser availability.
+
+    options = []
+    for parser in parsers:
+        result = parser(doc)
+        if result:
+            options.append(result)
+
+    if not options:
+        return None
+
+    # Return the parser with the highest weight.
+    return sorted(
+        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
+
+
 def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
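get_parser_class() keeps the selection rule that used to live in Consumer._get_parser_class(): every registered declaration is asked about the file name, and among the candidates that respond, the one with the highest weight wins. A tiny self-contained illustration of that last step, using stand-in classes rather than the project's real parsers:

# Stand-in classes; in the real project these would be parser classes such as
# the one provided by the paperless_tesseract app.
class LowPriorityParser:
    pass


class HighPriorityParser:
    pass


# Each entry mirrors what a document_consumer_declaration test callable returns.
options = [
    {"parser": LowPriorityParser, "weight": 0},
    {"parser": HighPriorityParser, "weight": 10},
]

# The same expression used in get_parser_class(): highest weight wins.
best = sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
assert best is HighPriorityParser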