mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #1139 from paperless-ngx/feature-redo-ocr
Feature: Management command to redo OCR
This commit is contained in:
commit
4bea4c69a4
@ -66,23 +66,30 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="col-auto ms-auto mb-2 mb-xl-0 d-flex">
|
<div class="col-auto ms-auto mb-2 mb-xl-0 d-flex">
|
||||||
<div class="btn-group btn-group-sm me-2">
|
<div class="btn-group btn-group-sm me-2">
|
||||||
<button type="button" [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm" (click)="downloadSelected()">
|
|
||||||
<svg *ngIf="!awaitingDownload" width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor">
|
<div ngbDropdown class="me-2 d-flex">
|
||||||
<use xlink:href="assets/bootstrap-icons.svg#download" />
|
<button class="btn btn-sm btn-outline-primary" id="dropdownSelect" ngbDropdownToggle>
|
||||||
</svg>
|
<svg class="toolbaricon" fill="currentColor">
|
||||||
<div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status">
|
<use xlink:href="assets/bootstrap-icons.svg#three-dots" />
|
||||||
<span class="visually-hidden">Preparing download...</span>
|
</svg>
|
||||||
</div>
|
<div class="d-none d-sm-inline"> <ng-container i18n>Actions</ng-container></div>
|
||||||
|
</button>
|
||||||
<ng-container i18n>Download</ng-container>
|
<div ngbDropdownMenu aria-labelledby="dropdownSelect" class="shadow">
|
||||||
</button>
|
<button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected()" i18n>
|
||||||
<div class="btn-group" ngbDropdown role="group" aria-label="Button group with nested dropdown">
|
Download
|
||||||
<button [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm dropdown-toggle-split" ngbDropdownToggle></button>
|
<div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status">
|
||||||
<div class="dropdown-menu shadow" ngbDropdownMenu>
|
<span class="visually-hidden">Preparing download...</span>
|
||||||
<button ngbDropdownItem i18n (click)="downloadSelected('originals')">Download originals</button>
|
</div>
|
||||||
|
</button>
|
||||||
|
<button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected('originals')" i18n>
|
||||||
|
Download originals
|
||||||
|
<div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status">
|
||||||
|
<span class="visually-hidden">Preparing download...</span>
|
||||||
|
</div>
|
||||||
|
</button>
|
||||||
|
<button ngbDropdownItem (click)="redoOcrSelected()" i18n>Redo OCR</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
|
||||||
|
|
||||||
<button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()">
|
<button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()">
|
||||||
<svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor">
|
<svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor">
|
||||||
|
@ -379,4 +379,19 @@ export class BulkEditorComponent {
|
|||||||
this.awaitingDownload = false
|
this.awaitingDownload = false
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Opens a confirmation dialog for re-running OCR on the current selection
 * and, once the user confirms, passes the dialog to executeBulkOperation
 * with the 'redo_ocr' method and no extra parameters.
 */
redoOcrSelected() {
  const dialogRef = this.modalService.open(ConfirmDialogComponent, {
    backdrop: 'static',
  })
  const dialog = dialogRef.componentInstance

  // Texts and styling shown by the confirmation dialog.
  dialog.title = $localize`Redo OCR confirm`
  dialog.messageBold = $localize`This operation will permanently redo OCR for ${this.list.selected.size} selected document(s).`
  dialog.message = $localize`This operation cannot be undone.`
  dialog.btnClass = 'btn-danger'
  dialog.btnCaption = $localize`Proceed`

  dialog.confirmClicked.subscribe(() => {
    // Disable the dialog buttons before the bulk operation is started.
    dialog.buttonsEnabled = false
    this.executeBulkOperation(dialogRef, 'redo_ocr', {})
  })
}
|
||||||
}
|
}
|
||||||
|
@ -118,3 +118,10 @@ def delete(doc_ids):
|
|||||||
index.remove_document_by_id(writer, id)
|
index.remove_document_by_id(writer, id)
|
||||||
|
|
||||||
return "OK"
|
return "OK"
|
||||||
|
|
||||||
|
|
||||||
|
def redo_ocr(doc_ids):
    """
    Hand the given document ids to ``async_task`` so that
    ``documents.tasks.redo_ocr`` is invoked with them.

    :param doc_ids: forwarded to the task as its ``document_ids`` keyword
        argument
    :return: the string ``"OK"``
    """
    task_kwargs = {"document_ids": doc_ids}
    async_task("documents.tasks.redo_ocr", **task_kwargs)
    return "OK"
|
||||||
|
35
src/documents/management/commands/document_redo_ocr.py
Normal file
35
src/documents/management/commands/document_redo_ocr.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import tqdm
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from documents.tasks import redo_ocr
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that re-runs OCR for documents given by primary key."""

    help = "This will re-run OCR for the documents with the given primary keys."

    def add_arguments(self, parser):
        """Register the ``--no-progress-bar`` flag and the document ids."""
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )
        # One or more primary keys are required (nargs="+").
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Run ``redo_ocr`` over the requested documents."""
        # Wrapping the ids in tqdm shows progress as redo_ocr iterates over
        # them; the bar is disabled when --no-progress-bar was given.
        doc_pks = tqdm.tqdm(
            options["documents"],
            disable=options["no_progress_bar"],
        )
        redo_ocr(doc_pks)
|
@ -323,6 +323,7 @@ class BulkEditSerializer(DocumentListSerializer):
|
|||||||
"remove_tag",
|
"remove_tag",
|
||||||
"modify_tags",
|
"modify_tags",
|
||||||
"delete",
|
"delete",
|
||||||
|
"redo_ocr",
|
||||||
],
|
],
|
||||||
label="Method",
|
label="Method",
|
||||||
write_only=True,
|
write_only=True,
|
||||||
@ -356,6 +357,8 @@ class BulkEditSerializer(DocumentListSerializer):
|
|||||||
return bulk_edit.modify_tags
|
return bulk_edit.modify_tags
|
||||||
elif method == "delete":
|
elif method == "delete":
|
||||||
return bulk_edit.delete
|
return bulk_edit.delete
|
||||||
|
elif method == "redo_ocr":
|
||||||
|
return bulk_edit.redo_ocr
|
||||||
else:
|
else:
|
||||||
raise serializers.ValidationError("Unsupported method.")
|
raise serializers.ValidationError("Unsupported method.")
|
||||||
|
|
||||||
|
@ -1,10 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from asgiref.sync import async_to_sync
|
from asgiref.sync import async_to_sync
|
||||||
from channels.layers import get_channel_layer
|
from channels.layers import get_channel_layer
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.core.exceptions import ObjectDoesNotExist
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from documents import barcodes
|
from documents import barcodes
|
||||||
from documents import index
|
from documents import index
|
||||||
@ -18,6 +22,9 @@ from documents.models import Document
|
|||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.models import StoragePath
|
from documents.models import StoragePath
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
|
from documents.parsers import ParseError
|
||||||
from documents.sanity_checker import SanityCheckFailedException
|
from documents.sanity_checker import SanityCheckFailedException
|
||||||
from whoosh.writing import AsyncWriter
|
from whoosh.writing import AsyncWriter
|
||||||
|
|
||||||
@ -198,3 +205,46 @@ def bulk_update_documents(document_ids):
|
|||||||
with AsyncWriter(ix) as writer:
|
with AsyncWriter(ix) as writer:
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
index.update_document(writer, doc)
|
index.update_document(writer, doc)
|
||||||
|
|
||||||
|
|
||||||
|
def redo_ocr(document_ids):
    """
    Re-parse the given documents and store the freshly extracted text.

    For every primary key the document is looked up, its source file is
    copied into the parser's temporary directory, parsed again, and the
    resulting text is saved as the document's ``content``. Unknown primary
    keys, documents for which no parser class is returned, and parse errors
    are logged and skipped so the remaining documents are still processed.

    :param document_ids: iterable of document primary keys
    """
    all_docs = Document.objects.all()

    for doc_pk in document_ids:
        try:
            logger.info(f"Parsing document {doc_pk}")
            doc: Document = all_docs.get(pk=doc_pk)
        except ObjectDoesNotExist:
            logger.error(f"Document {doc_pk} does not exist")
            continue

        # Get the correct parser for this mime type
        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
            doc.mime_type,
        )
        # NOTE(review): presumably the lookup yields None when no parser
        # handles the mime type - confirm against documents.parsers. Calling
        # a falsy result would raise TypeError and abort the whole batch.
        if not parser_class:
            logger.error(
                f"No parser available for document {doc_pk} "
                f"with mime type {doc.mime_type}",
            )
            continue

        document_parser: DocumentParser = parser_class("redo-ocr")

        # Work on a copy inside the parser's temp dir, never on the original
        temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()

        try:
            # Copy inside the try block so a failed or partial copy is still
            # cleaned up by the finally clause; the error itself propagates.
            shutil.copy(doc.source_path, temp_file)

            logger.info(
                f"Using {type(document_parser).__name__} for document",
            )
            # Try to re-parse the document into text
            document_parser.parse(str(temp_file), doc.mime_type)

            doc.content = document_parser.get_text()
            doc.save()
            logger.info("Document OCR updated")

        except ParseError as e:
            logger.error(f"Error parsing document: {e}")
        finally:
            # Remove the working copy if it was created.
            # NOTE(review): only the copied file is removed here; the
            # parser's tempdir itself is left in place - confirm whether the
            # parser needs an explicit cleanup call.
            if temp_file.is_file():
                temp_file.unlink()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user