mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Extract redo ocr to task
This commit is contained in:
parent
a090cf7a10
commit
13ffe468df
@ -1,14 +1,6 @@
|
|||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Type
|
|
||||||
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from django.core.exceptions import ObjectDoesNotExist
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from documents.models import Document
|
from documents.tasks import redo_ocr
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
|
||||||
from documents.parsers import ParseError
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
@ -36,47 +28,8 @@ class Command(BaseCommand):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def handle(self, *args, **options):
    """Re-run OCR for the documents selected on the command line.

    The requested primary keys (``options["documents"]``) are wrapped in a
    tqdm progress bar, which is disabled when ``options["no_progress_bar"]``
    is set, and handed to ``documents.tasks.redo_ocr``. The per-document
    parsing that used to live inline in this method is done there.
    """
    # Wrapping the iterable (rather than looping here) lets the task drive
    # the progress bar as it consumes the primary keys.
    doc_pks = tqdm.tqdm(
        options["documents"],
        disable=options["no_progress_bar"],
    )
    redo_ocr(doc_pks)
|
|
||||||
|
@ -2,13 +2,16 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
|
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
import tqdm
|
import tqdm
|
||||||
from asgiref.sync import async_to_sync
|
from asgiref.sync import async_to_sync
|
||||||
from channels.layers import get_channel_layer
|
from channels.layers import get_channel_layer
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.core.exceptions import ObjectDoesNotExist
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from documents import index
|
from documents import index
|
||||||
from documents import sanity_checker
|
from documents import sanity_checker
|
||||||
@ -21,6 +24,9 @@ from documents.models import Document
|
|||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
|
from documents.parsers import ParseError
|
||||||
from documents.sanity_checker import SanityCheckFailedException
|
from documents.sanity_checker import SanityCheckFailedException
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
from pikepdf import Pdf
|
from pikepdf import Pdf
|
||||||
@ -359,3 +365,46 @@ def bulk_update_documents(document_ids):
|
|||||||
with AsyncWriter(ix) as writer:
|
with AsyncWriter(ix) as writer:
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
index.update_document(writer, doc)
|
index.update_document(writer, doc)
|
||||||
|
|
||||||
|
|
||||||
|
def redo_ocr(document_ids):
    """Re-parse the given documents and store the freshly extracted text.

    For every primary key in ``document_ids`` the document is looked up, a
    parser class is selected from the document's MIME type, the source file
    is copied into the parser's temporary directory, parsed, and the
    resulting text is saved to ``doc.content``.

    Problems with a single document are logged and the loop moves on to the
    next one: an unknown primary key, a MIME type with no parser, or a
    ``ParseError`` raised while parsing. Any other exception (for example a
    failing file copy) propagates to the caller after clean-up has run.

    Args:
        document_ids: Iterable of document primary keys. Any iterable works,
            including a tqdm-wrapped one.
    """
    all_docs = Document.objects.all()

    for doc_pk in document_ids:
        logger.info(f"Parsing document {doc_pk}")
        try:
            doc: Document = all_docs.get(pk=doc_pk)
        except ObjectDoesNotExist:
            logger.error(f"Document {doc_pk} does not exist")
            continue

        # Get the correct parser for this mime type
        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
            doc.mime_type,
        )
        # NOTE(review): the lookup is presumed to return None when no parser
        # is registered for the MIME type - confirm in documents.parsers.
        # Without this guard a missing parser raises TypeError and aborts
        # the whole batch.
        if not parser_class:
            logger.error(
                f"No parser found for mime type {doc.mime_type}, "
                f"cannot redo OCR for document {doc_pk}",
            )
            continue

        document_parser: DocumentParser = parser_class(
            "redo-ocr",
        )

        # Work on a copy inside the parser's temporary directory so the
        # original file is never touched by the parser.
        temp_file = Path(document_parser.tempdir, "new-ocr-file").resolve()

        try:
            # The copy is inside the try so that a failed copy still runs the
            # clean-up below before the exception propagates.
            shutil.copy(doc.source_path, temp_file)

            logger.info(
                f"Using {type(document_parser).__name__} for document",
            )
            # Try to re-parse the document into text
            document_parser.parse(str(temp_file), doc.mime_type)

            doc.content = document_parser.get_text()
            doc.save()
            logger.info("Document OCR updated")
        except ParseError as e:
            logger.error(f"Error parsing document: {e}")
        finally:
            # Remove the working copy if it was created
            if temp_file.is_file():
                temp_file.unlink()
            # Release the parser's temporary directory as well; otherwise one
            # scratch directory is left behind per processed document.
            # NOTE(review): relies on DocumentParser.cleanup() - confirm.
            document_parser.cleanup()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user