mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Extract redo ocr to task
This commit is contained in:
		@@ -1,14 +1,6 @@
 | 
			
		||||
import shutil
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Type
 | 
			
		||||
 | 
			
		||||
import tqdm
 | 
			
		||||
from django.core.exceptions import ObjectDoesNotExist
 | 
			
		||||
from django.core.management.base import BaseCommand
 | 
			
		||||
from documents.models import Document
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
from documents.parsers import get_parser_class_for_mime_type
 | 
			
		||||
from documents.parsers import ParseError
 | 
			
		||||
from documents.tasks import redo_ocr
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Command(BaseCommand):
 | 
			
		||||
@@ -36,47 +28,8 @@ class Command(BaseCommand):
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def handle(self, *args, **options):
 | 
			
		||||
 | 
			
		||||
        all_docs = Document.objects.all()
 | 
			
		||||
 | 
			
		||||
        for doc_pk in tqdm.tqdm(
 | 
			
		||||
        doc_pks = tqdm.tqdm(
 | 
			
		||||
            options["documents"],
 | 
			
		||||
            disable=options["no_progress_bar"],
 | 
			
		||||
        ):
 | 
			
		||||
            try:
 | 
			
		||||
                self.stdout.write(f"Parsing document {doc_pk}")
 | 
			
		||||
                doc: Document = all_docs.get(pk=doc_pk)
 | 
			
		||||
            except ObjectDoesNotExist:
 | 
			
		||||
                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            # Get the correct parser for this mime type
 | 
			
		||||
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
 | 
			
		||||
                doc.mime_type,
 | 
			
		||||
        )
 | 
			
		||||
            document_parser: DocumentParser = parser_class(
 | 
			
		||||
                "redo-ocr",
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            # Create a file path to copy the original file to for working on
 | 
			
		||||
            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
 | 
			
		||||
 | 
			
		||||
            shutil.copy(doc.source_path, temp_file)
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
                self.stdout.write(
 | 
			
		||||
                    f"Using {type(document_parser).__name__} for document",
 | 
			
		||||
                )
 | 
			
		||||
                # Try to re-parse the document into text
 | 
			
		||||
                document_parser.parse(str(temp_file), doc.mime_type)
 | 
			
		||||
 | 
			
		||||
                doc.content = document_parser.get_text()
 | 
			
		||||
                doc.save()
 | 
			
		||||
                self.stdout.write("Document OCR updated")
 | 
			
		||||
 | 
			
		||||
            except ParseError as e:
 | 
			
		||||
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
 | 
			
		||||
            finally:
 | 
			
		||||
                # Remove the file path if it was created
 | 
			
		||||
                if temp_file.exists() and temp_file.is_file():
 | 
			
		||||
                    temp_file.unlink()
 | 
			
		||||
        redo_ocr(doc_pks)
 | 
			
		||||
 
 | 
			
		||||
@@ -2,13 +2,16 @@ import logging
 | 
			
		||||
import os
 | 
			
		||||
import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used
 | 
			
		||||
from typing import Type
 | 
			
		||||
 | 
			
		||||
import magic
 | 
			
		||||
import tqdm
 | 
			
		||||
from asgiref.sync import async_to_sync
 | 
			
		||||
from channels.layers import get_channel_layer
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.core.exceptions import ObjectDoesNotExist
 | 
			
		||||
from django.db.models.signals import post_save
 | 
			
		||||
from documents import index
 | 
			
		||||
from documents import sanity_checker
 | 
			
		||||
@@ -21,6 +24,9 @@ from documents.models import Document
 | 
			
		||||
from documents.models import DocumentType
 | 
			
		||||
from documents.models import StoragePath
 | 
			
		||||
from documents.models import Tag
 | 
			
		||||
from documents.parsers import DocumentParser
 | 
			
		||||
from documents.parsers import get_parser_class_for_mime_type
 | 
			
		||||
from documents.parsers import ParseError
 | 
			
		||||
from documents.sanity_checker import SanityCheckFailedException
 | 
			
		||||
from pdf2image import convert_from_path
 | 
			
		||||
from pikepdf import Pdf
 | 
			
		||||
@@ -359,3 +365,46 @@ def bulk_update_documents(document_ids):
 | 
			
		||||
    with AsyncWriter(ix) as writer:
 | 
			
		||||
        for doc in documents:
 | 
			
		||||
            index.update_document(writer, doc)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def redo_ocr(document_ids):
 | 
			
		||||
    all_docs = Document.objects.all()
 | 
			
		||||
 | 
			
		||||
    for doc_pk in document_ids:
 | 
			
		||||
        try:
 | 
			
		||||
            logger.info(f"Parsing document {doc_pk}")
 | 
			
		||||
            doc: Document = all_docs.get(pk=doc_pk)
 | 
			
		||||
        except ObjectDoesNotExist:
 | 
			
		||||
            logger.error(f"Document {doc_pk} does not exist")
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        # Get the correct parser for this mime type
 | 
			
		||||
        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
 | 
			
		||||
            doc.mime_type,
 | 
			
		||||
        )
 | 
			
		||||
        document_parser: DocumentParser = parser_class(
 | 
			
		||||
            "redo-ocr",
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # Create a file path to copy the original file to for working on
 | 
			
		||||
        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
 | 
			
		||||
 | 
			
		||||
        shutil.copy(doc.source_path, temp_file)
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            logger.info(
 | 
			
		||||
                f"Using {type(document_parser).__name__} for document",
 | 
			
		||||
            )
 | 
			
		||||
            # Try to re-parse the document into text
 | 
			
		||||
            document_parser.parse(str(temp_file), doc.mime_type)
 | 
			
		||||
 | 
			
		||||
            doc.content = document_parser.get_text()
 | 
			
		||||
            doc.save()
 | 
			
		||||
            logger.info("Document OCR updated")
 | 
			
		||||
 | 
			
		||||
        except ParseError as e:
 | 
			
		||||
            logger.error(f"Error parsing document: {e}")
 | 
			
		||||
        finally:
 | 
			
		||||
            # Remove the file path if it was created
 | 
			
		||||
            if temp_file.exists() and temp_file.is_file():
 | 
			
		||||
                temp_file.unlink()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user