mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
A handy script to redo OCR on all documents.
This commit is contained in:
parent
6f3d25d7b1
commit
f4cebda085
@ -12,9 +12,8 @@ from django.utils import timezone
|
|||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
from .classifier import DocumentClassifier
|
from .classifier import DocumentClassifier
|
||||||
from .models import Document, FileInfo
|
from .models import Document, FileInfo
|
||||||
from .parsers import ParseError
|
from .parsers import ParseError, get_parser_class
|
||||||
from .signals import (
|
from .signals import (
|
||||||
document_consumer_declaration,
|
|
||||||
document_consumption_finished,
|
document_consumption_finished,
|
||||||
document_consumption_started
|
document_consumption_started
|
||||||
)
|
)
|
||||||
@ -61,15 +60,6 @@ class Consumer:
|
|||||||
raise ConsumerError(
|
raise ConsumerError(
|
||||||
"Consumption directory {} does not exist".format(self.consume))
|
"Consumption directory {} does not exist".format(self.consume))
|
||||||
|
|
||||||
self.parsers = []
|
|
||||||
for response in document_consumer_declaration.send(self):
|
|
||||||
self.parsers.append(response[1])
|
|
||||||
|
|
||||||
if not self.parsers:
|
|
||||||
raise ConsumerError(
|
|
||||||
"No parsers could be found, not even the default. "
|
|
||||||
"This is a problem."
|
|
||||||
)
|
|
||||||
|
|
||||||
def log(self, level, message):
|
def log(self, level, message):
|
||||||
getattr(self.logger, level)(message, extra={
|
getattr(self.logger, level)(message, extra={
|
||||||
@ -82,6 +72,8 @@ class Consumer:
|
|||||||
Return True if file was consumed
|
Return True if file was consumed
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
self.logging_group = uuid.uuid4()
|
||||||
|
|
||||||
if not re.match(FileInfo.REGEXES["title"], file):
|
if not re.match(FileInfo.REGEXES["title"], file):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -96,13 +88,13 @@ class Consumer:
|
|||||||
|
|
||||||
self.log("info", "Consuming {}".format(doc))
|
self.log("info", "Consuming {}".format(doc))
|
||||||
|
|
||||||
parser_class = self._get_parser_class(doc)
|
parser_class = get_parser_class(doc)
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
self.log(
|
self.log(
|
||||||
"error", "No parsers could be found for {}".format(doc))
|
"error", "No parsers could be found for {}".format(doc))
|
||||||
return False
|
return False
|
||||||
|
else:
|
||||||
self.logging_group = uuid.uuid4()
|
self.log("info", "Parser: {}".format(parser_class.__name__))
|
||||||
|
|
||||||
|
|
||||||
document_consumption_started.send(
|
document_consumption_started.send(
|
||||||
@ -114,6 +106,7 @@ class Consumer:
|
|||||||
document_parser = parser_class(doc, self.logging_group)
|
document_parser = parser_class(doc, self.logging_group)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
self.log("info", "Generating thumbnail for {}...".format(doc))
|
||||||
thumbnail = document_parser.get_optimised_thumbnail()
|
thumbnail = document_parser.get_optimised_thumbnail()
|
||||||
date = document_parser.get_date()
|
date = document_parser.get_date()
|
||||||
document = self._store(
|
document = self._store(
|
||||||
@ -154,31 +147,6 @@ class Consumer:
|
|||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _get_parser_class(self, doc):
|
|
||||||
"""
|
|
||||||
Determine the appropriate parser class based on the file
|
|
||||||
"""
|
|
||||||
|
|
||||||
options = []
|
|
||||||
for parser in self.parsers:
|
|
||||||
result = parser(doc)
|
|
||||||
if result:
|
|
||||||
options.append(result)
|
|
||||||
|
|
||||||
self.log(
|
|
||||||
"info",
|
|
||||||
"Parsers available: {}".format(
|
|
||||||
", ".join([str(o["parser"].__name__) for o in options])
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
if not options:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Return the parser with the highest weight.
|
|
||||||
return sorted(
|
|
||||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail, date):
|
def _store(self, text, doc, thumbnail, date):
|
||||||
|
|
||||||
file_info = FileInfo.from_path(doc)
|
file_info = FileInfo.from_path(doc)
|
||||||
@ -211,10 +179,9 @@ class Consumer:
|
|||||||
self._write(document, doc, document.source_path)
|
self._write(document, doc, document.source_path)
|
||||||
self._write(document, thumbnail, document.thumbnail_path)
|
self._write(document, thumbnail, document.thumbnail_path)
|
||||||
|
|
||||||
|
#TODO: why do we need to save the document again?
|
||||||
document.save()
|
document.save()
|
||||||
|
|
||||||
self.log("debug", "Completed")
|
|
||||||
|
|
||||||
return document
|
return document
|
||||||
|
|
||||||
def _write(self, document, source, target):
|
def _write(self, document, source, target):
|
||||||
|
60
src/documents/management/commands/document_rerun_ocr.py
Normal file
60
src/documents/management/commands/document_rerun_ocr.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
import argparse
|
||||||
|
import threading
|
||||||
|
from multiprocessing import Pool
|
||||||
|
from multiprocessing.pool import ThreadPool
|
||||||
|
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from documents.consumer import Consumer
|
||||||
|
from documents.models import Log, Document
|
||||||
|
from documents.parsers import get_parser_class
|
||||||
|
|
||||||
|
|
||||||
|
def process_document(doc):
    """
    Re-extract the text of a stored document and save it back.

    A parser class is looked up with ``get_parser_class`` from
    ``doc.file_name``. If none is found a message is printed and nothing
    else happens. Otherwise the parser is run against ``doc.source_path``,
    its text replaces ``doc.content`` and the document is saved.

    The parser's ``cleanup()`` is always called once the parser has been
    created, even if text extraction or saving raises.
    """
    parser_class = get_parser_class(doc.file_name)
    if not parser_class:
        print("no parser available")
        return

    print("Parser: {}".format(parser_class.__name__))

    # The second constructor argument is passed as None here.
    parser = parser_class(doc.source_path, None)
    try:
        doc.content = parser.get_text()
        doc.save()
    finally:
        parser.cleanup()
|
||||||
|
|
||||||
|
|
||||||
|
def document_index(value):
    """
    Convert a command line value into a 1-based document index.

    Intended for use as an argparse ``type`` callable. The value is
    converted with ``int()`` and must lie between 1 and the current number
    of ``Document`` rows (inclusive).

    Raises ``argparse.ArgumentTypeError`` when the index is out of range.
    """
    index = int(value)

    # The lower bound is tested first so the count query is only issued
    # for values that are at least 1.
    if index < 1 or index > Document.objects.count():
        raise argparse.ArgumentTypeError(
            "{} is not a valid document index (out of range)".format(value))

    return index
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """
    Management command that runs ``process_document`` over the stored
    documents, ordered by their ``added`` field.
    """

    help = "Performs OCR on all documents again!"

    def add_arguments(self, parser):
        """
        Register ``-s``/``--start_index``: an optional 1-based index
        (validated by ``document_index``) of the first document to process.
        """
        parser.add_argument(
            "-s", "--start_index",
            default=None,
            type=document_index
        )

    def handle(self, *args, **options):
        """
        Process every document from the requested start index onwards,
        printing a small banner with the progress before each one.
        """
        docs = Document.objects.all().order_by("added")
        total = len(docs)

        # start_index is 1-based on the command line; fall back to the
        # first document when it was not given.
        requested = options['start_index']
        first = requested - 1 if requested else 0

        banner = "=================================="
        for position in range(first, total):
            doc = docs[position]
            print(banner)
            print("{} out of {}: {}".format(position + 1, total, doc.file_name))
            print(banner)
            process_document(doc)
|
@ -20,6 +20,8 @@ from django.utils import timezone
|
|||||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||||
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
DATE_REGEX = re.compile(
|
DATE_REGEX = re.compile(
|
||||||
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
|
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||||
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
|
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
|
||||||
@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser_class(doc):
    """
    Determine the appropriate parser class based on the file

    Every receiver of the ``document_consumer_declaration`` signal
    contributes a callable. Each callable is invoked with ``doc`` and may
    return a mapping with a ``"parser"`` and a ``"weight"`` entry, or a
    falsy value if it does not apply.

    Returns the ``"parser"`` of the applicable entry with the highest
    weight, or ``None`` when no entry applies.
    """

    # Signal.send() yields (receiver, response) pairs; only the response,
    # i.e. the declaration callable, is of interest here.
    declarations = [
        response[1]
        for response in document_consumer_declaration.send(None)
    ]

    #TODO: add a check that checks parser availability.

    candidates = [
        candidate
        for candidate in (declare(doc) for declare in declarations)
        if candidate
    ]

    if not candidates:
        return None

    # Return the parser with the highest weight (the first one on ties).
    best = max(candidates, key=lambda candidate: candidate["weight"])
    return best["parser"]
|
||||||
|
|
||||||
|
|
||||||
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
|
def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
|
||||||
environment = os.environ.copy()
|
environment = os.environ.copy()
|
||||||
if settings.CONVERT_MEMORY_LIMIT:
|
if settings.CONVERT_MEMORY_LIMIT:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user