Merge pull request #1139 from paperless-ngx/feature-redo-ocr

Feature: Management command to redo OCR
This commit is contained in:
shamoon 2022-07-02 09:02:32 -07:00 committed by GitHub
commit 4bea4c69a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 132 additions and 15 deletions

View File

@ -66,23 +66,30 @@
</div> </div>
<div class="col-auto ms-auto mb-2 mb-xl-0 d-flex"> <div class="col-auto ms-auto mb-2 mb-xl-0 d-flex">
<div class="btn-group btn-group-sm me-2"> <div class="btn-group btn-group-sm me-2">
<button type="button" [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm" (click)="downloadSelected()">
<svg *ngIf="!awaitingDownload" width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> <div ngbDropdown class="me-2 d-flex">
<use xlink:href="assets/bootstrap-icons.svg#download" /> <button class="btn btn-sm btn-outline-primary" id="dropdownSelect" ngbDropdownToggle>
</svg> <svg class="toolbaricon" fill="currentColor">
<div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status"> <use xlink:href="assets/bootstrap-icons.svg#three-dots" />
<span class="visually-hidden">Preparing download...</span> </svg>
</div> <div class="d-none d-sm-inline">&nbsp;<ng-container i18n>Actions</ng-container></div>
&nbsp; </button>
<ng-container i18n>Download</ng-container> <div ngbDropdownMenu aria-labelledby="dropdownSelect" class="shadow">
</button> <button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected()" i18n>
<div class="btn-group" ngbDropdown role="group" aria-label="Button group with nested dropdown"> Download
<button [disabled]="awaitingDownload" class="btn btn-outline-primary btn-sm dropdown-toggle-split" ngbDropdownToggle></button> <div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status">
<div class="dropdown-menu shadow" ngbDropdownMenu> <span class="visually-hidden">Preparing download...</span>
<button ngbDropdownItem i18n (click)="downloadSelected('originals')">Download originals</button> </div>
</button>
<button ngbDropdownItem [disabled]="awaitingDownload" (click)="downloadSelected('originals')" i18n>
Download originals
<div *ngIf="awaitingDownload" class="spinner-border spinner-border-sm" role="status">
<span class="visually-hidden">Preparing download...</span>
</div>
</button>
<button ngbDropdownItem (click)="redoOcrSelected()" i18n>Redo OCR</button>
</div> </div>
</div> </div>
</div>
<button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()"> <button type="button" class="btn btn-sm btn-outline-danger" (click)="applyDelete()">
<svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor"> <svg width="1em" height="1em" viewBox="0 0 16 16" fill="currentColor">

View File

@ -379,4 +379,19 @@ export class BulkEditorComponent {
this.awaitingDownload = false this.awaitingDownload = false
}) })
} }
/**
 * Opens a confirmation dialog and, once the user confirms, submits the
 * `redo_ocr` bulk operation for the currently selected documents.
 */
redoOcrSelected() {
  const dialog = this.modalService.open(ConfirmDialogComponent, { backdrop: 'static' })
  const confirmDialog = dialog.componentInstance

  // Dialog texts and button appearance
  confirmDialog.title = $localize`Redo OCR confirm`
  confirmDialog.messageBold = $localize`This operation will permanently redo OCR for ${this.list.selected.size} selected document(s).`
  confirmDialog.message = $localize`This operation cannot be undone.`
  confirmDialog.btnClass = 'btn-danger'
  confirmDialog.btnCaption = $localize`Proceed`

  confirmDialog.confirmClicked.subscribe(() => {
    // Disable the dialog buttons while the bulk operation is submitted
    dialog.componentInstance.buttonsEnabled = false
    this.executeBulkOperation(dialog, 'redo_ocr', {})
  })
}
} }

View File

@ -118,3 +118,10 @@ def delete(doc_ids):
index.remove_document_by_id(writer, id) index.remove_document_by_id(writer, id)
return "OK" return "OK"
def redo_ocr(doc_ids):
    """
    Queue the ``documents.tasks.redo_ocr`` task for the given documents.

    :param doc_ids: ids of the documents, passed to the task as
        ``document_ids``
    :return: the string ``"OK"``
    """
    task_name = "documents.tasks.redo_ocr"
    async_task(task_name, document_ids=doc_ids)

    return "OK"

View File

@ -0,0 +1,35 @@
import tqdm
from django.core.management.base import BaseCommand
from documents.tasks import redo_ocr
class Command(BaseCommand):
    """
    Management command that re-runs OCR for the documents whose primary
    keys are given on the command line.
    """

    # Help text for this command. The previous text described renaming
    # documents, which is not what this command does.
    help = (
        "This will re-run OCR for the given documents and update their "
        "stored text content."
    )

    def add_arguments(self, parser):
        """Register the command line arguments accepted by this command."""
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )
        # NOTE(review): the values are passed on exactly as given on the
        # command line; consider type=int if primary keys are integers.
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Call ``redo_ocr`` directly with the requested document keys."""
        # Wrapping the keys in tqdm lets the progress bar advance as
        # redo_ocr() iterates over them.
        doc_pks = tqdm.tqdm(
            options["documents"],
            disable=options["no_progress_bar"],
        )
        redo_ocr(doc_pks)

View File

@ -323,6 +323,7 @@ class BulkEditSerializer(DocumentListSerializer):
"remove_tag", "remove_tag",
"modify_tags", "modify_tags",
"delete", "delete",
"redo_ocr",
], ],
label="Method", label="Method",
write_only=True, write_only=True,
@ -356,6 +357,8 @@ class BulkEditSerializer(DocumentListSerializer):
return bulk_edit.modify_tags return bulk_edit.modify_tags
elif method == "delete": elif method == "delete":
return bulk_edit.delete return bulk_edit.delete
elif method == "redo_ocr":
return bulk_edit.redo_ocr
else: else:
raise serializers.ValidationError("Unsupported method.") raise serializers.ValidationError("Unsupported method.")

View File

@ -1,10 +1,14 @@
import logging import logging
import os import os
import shutil
from pathlib import Path
from typing import Type
import tqdm import tqdm
from asgiref.sync import async_to_sync from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer from channels.layers import get_channel_layer
from django.conf import settings from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db.models.signals import post_save from django.db.models.signals import post_save
from documents import barcodes from documents import barcodes
from documents import index from documents import index
@ -18,6 +22,9 @@ from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
from documents.models import StoragePath from documents.models import StoragePath
from documents.models import Tag from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@ -198,3 +205,46 @@ def bulk_update_documents(document_ids):
with AsyncWriter(ix) as writer: with AsyncWriter(ix) as writer:
for doc in documents: for doc in documents:
index.update_document(writer, doc) index.update_document(writer, doc)
def redo_ocr(document_ids):
    """
    Re-parse the given documents and store the newly extracted text.

    For every primary key in ``document_ids`` the document's source file is
    copied into the parser's temporary directory, parsed again with the
    parser selected for the document's mime type, and the resulting text is
    saved as the document's ``content``.

    A problem with a single document (unknown primary key, no parser,
    failure to copy the file, parse error) is logged and processing
    continues with the next document.

    :param document_ids: iterable of document primary keys
    """
    all_docs = Document.objects.all()

    for doc_pk in document_ids:
        logger.info(f"Parsing document {doc_pk}")
        try:
            doc: Document = all_docs.get(pk=doc_pk)
        except ObjectDoesNotExist:
            logger.error(f"Document {doc_pk} does not exist")
            continue

        # Get the correct parser for this mime type
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if parser_class is None:
            # Defensive guard: presumably the lookup yields None when no
            # parser handles the mime type -- confirm against
            # documents.parsers. Calling None below would raise TypeError.
            logger.error(
                f"No parser available for mime type {doc.mime_type} "
                f"of document {doc_pk}",
            )
            continue

        document_parser: DocumentParser = parser_class(
            "redo-ocr",
        )

        # Work on a copy inside the parser's temporary directory instead of
        # handing the original file to the parser
        temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()

        try:
            try:
                shutil.copy(doc.source_path, temp_file)
            except OSError as e:
                # One unreadable file must not abort the remaining documents
                logger.error(f"Error copying document {doc_pk}: {e}")
                continue

            logger.info(
                f"Using {type(document_parser).__name__} for document",
            )
            # Try to re-parse the document into text
            document_parser.parse(str(temp_file), doc.mime_type)
            doc.content = document_parser.get_text()
            doc.save()
            logger.info("Document OCR updated")
        except ParseError as e:
            logger.error(f"Error parsing document: {e}")
        finally:
            # Remove the working copy if it was created (also runs when the
            # copy step bails out via ``continue``)
            # NOTE(review): only the copied file is removed here; whether the
            # parser's temporary directory needs separate cleanup should be
            # confirmed against DocumentParser.
            if temp_file.is_file():
                temp_file.unlink()