Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-07-30 18:27:45 -05:00
feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and moved it all into its own app, `paperless_tesseract`. This new app should serve as a sample of how to create one's own consumer for different file types. Documentation for how to do this isn't ready yet, but for the impatient:

* Create a new app:
  * containing a `parsers.py` for your parser, modelled after `paperless_tesseract.parsers.RasterisedDocumentParser`;
  * containing a `signals.py` with a handler modelled after `paperless_tesseract.signals.ConsumerDeclaration`;
  * connecting the signal handler to `documents.signals.document_consumer_declaration` in `your_app.apps`.
* Install the app into Paperless by declaring `PAPERLESS_INSTALLED_APPS=your_app`. Separate additional apps with commas.
* Restart the consumer.
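Putting those steps together, a minimal sketch of such an app might look like the following. The app name `your_app`, the `TextDocumentParser` class, and the `.txt` pattern are hypothetical placeholders invented for illustration; the signal wiring mirrors the `paperless_tesseract` code in this commit:

# your_app/parsers.py (hypothetical)
from documents.parsers import DocumentParser


class TextDocumentParser(DocumentParser):

    def get_thumbnail(self):
        # Produce a preview image for the document and return its path;
        # see RasterisedDocumentParser.get_thumbnail for a real example.
        raise NotImplementedError()

    def get_text(self):
        # document_path is assumed to be set by the DocumentParser base class,
        # as it is for RasterisedDocumentParser below.
        with open(self.document_path, encoding="utf-8") as f:
            return f.read()


# your_app/signals.py (hypothetical)
import re

from .parsers import TextDocumentParser


class ConsumerDeclaration(object):

    MATCHING_FILES = re.compile(r"^.*\.txt$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test

    @classmethod
    def test(cls, doc):
        if cls.MATCHING_FILES.match(doc):
            return {"parser": TextDocumentParser, "weight": 0}
        return None


# your_app/apps.py (hypothetical)
from django.apps import AppConfig


class YourAppConfig(AppConfig):

    name = "your_app"

    def ready(self):
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration
        document_consumer_declaration.connect(ConsumerDeclaration.handle)
        AppConfig.ready(self)

With that in place, running the consumer with `PAPERLESS_INSTALLED_APPS=your_app` should offer `.txt` files to `TextDocumentParser`.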
0  src/paperless_tesseract/__init__.py  Normal file
16  src/paperless_tesseract/apps.py  Normal file
@@ -0,0 +1,16 @@
from django.apps import AppConfig


class PaperlessTesseractConfig(AppConfig):

    name = "paperless_tesseract"

    def ready(self):

        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)

        AppConfig.ready(self)
194  src/paperless_tesseract/languages.py  Normal file
@@ -0,0 +1,194 @@
# Thanks to the Library of Congress and some creative use of sed and awk:
# http://www.loc.gov/standards/iso639-2/php/English_list.php

ISO639 = {

    "aa": "aar",
    "ab": "abk",
    "ae": "ave",
    "af": "afr",
    "ak": "aka",
    "am": "amh",
    "an": "arg",
    "ar": "ara",
    "as": "asm",
    "av": "ava",
    "ay": "aym",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bh": "bih",
    "bi": "bis",
    "bm": "bam",
    "bn": "ben",
    "bo": "bod",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "ce": "che",
    "ch": "cha",
    "co": "cos",
    "cr": "cre",
    "cs": "ces",
    "cu": "chu",
    "cv": "chv",
    "cy": "cym",
    "da": "dan",
    "de": "deu",
    "dv": "div",
    "dz": "dzo",
    "ee": "ewe",
    "el": "ell",
    "en": "eng",
    "eo": "epo",
    "es": "spa",
    "et": "est",
    "eu": "eus",
    "fa": "fas",
    "ff": "ful",
    "fi": "fin",
    "fj": "fij",
    "fo": "fao",
    "fr": "fra",
    "fy": "fry",
    "ga": "gle",
    "gd": "gla",
    "gl": "glg",
    "gn": "grn",
    "gu": "guj",
    "gv": "glv",
    "ha": "hau",
    "he": "heb",
    "hi": "hin",
    "ho": "hmo",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "hye",
    "hz": "her",
    "ia": "ina",
    "id": "ind",
    "ie": "ile",
    "ig": "ibo",
    "ii": "iii",
    "ik": "ipk",
    "io": "ido",
    "is": "isl",
    "it": "ita",
    "iu": "iku",
    "ja": "jpn",
    "jv": "jav",
    "ka": "kat",
    "kg": "kon",
    "ki": "kik",
    "kj": "kua",
    "kk": "kaz",
    "kl": "kal",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "kr": "kau",
    "ks": "kas",
    "ku": "kur",
    "kv": "kom",
    "kw": "cor",
    "ky": "kir",
    "la": "lat",
    "lb": "ltz",
    "lg": "lug",
    "li": "lim",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lu": "lub",
    "lv": "lav",
    "mg": "mlg",
    "mh": "mah",
    "mi": "mri",
    "mk": "mkd",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "msa",
    "mt": "mlt",
    "my": "mya",
    "na": "nau",
    "nb": "nob",
    "nd": "nde",
    "ne": "nep",
    "ng": "ndo",
    "nl": "nld",
    "no": "nor",
    "nr": "nbl",
    "nv": "nav",
    "ny": "nya",
    "oc": "oci",
    "oj": "oji",
    "om": "orm",
    "or": "ori",
    "os": "oss",
    "pa": "pan",
    "pi": "pli",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "qu": "que",
    "rm": "roh",
    "rn": "run",
    "ro": "ron",
    "ru": "rus",
    "rw": "kin",
    "sa": "san",
    "sc": "srd",
    "sd": "snd",
    "se": "sme",
    "sg": "sag",
    "si": "sin",
    "sk": "slk",
    "sl": "slv",
    "sm": "smo",
    "sn": "sna",
    "so": "som",
    "sq": "sqi",
    "sr": "srp",
    "ss": "ssw",
    "st": "sot",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "ti": "tir",
    "tk": "tuk",
    "tl": "tgl",
    "tn": "tsn",
    "to": "ton",
    "tr": "tur",
    "ts": "tso",
    "tt": "tat",
    "tw": "twi",
    "ty": "tah",
    "ug": "uig",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "ve": "ven",
    "vi": "vie",
    "vo": "vol",
    "wa": "wln",
    "wo": "wol",
    "xh": "xho",
    "yi": "yid",
    "yo": "yor",
    "za": "zha",

    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
    # have no idea which one is better, so I just picked the bigger file.
    "zh": "chi_tra",

    "zu": "zul"

}
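The two-to-three-letter mapping exists because langdetect (used in parsers.py below) reports two-letter ISO 639-1 codes, while Tesseract names its language packs with three-letter codes. A quick illustration of the lookup:

# e.g. translating a langdetect guess into a Tesseract language name
from paperless_tesseract.languages import ISO639

assert ISO639["de"] == "deu"
assert ISO639["zh"] == "chi_tra"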
214  src/paperless_tesseract/parsers.py  Normal file
@@ -0,0 +1,214 @@
import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool

import langdetect
import pyocr
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
from PIL import Image
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError

from .languages import ISO639


class OCRError(Exception):
    pass


class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            raise ParseError(e)

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".unpaper.pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        return sorted(filter(lambda __: os.path.isfile(__), pnms))

    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            self.log("warning", "Language detection error: {}".format(e))

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Since the division gets rounded down by int, this calculation works
        # for every edge case, e.g. a single-page document (middle == 0)
        middle = int(len(imgs) / 2)
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            )

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we
        OCR the remainder of the document and return the whole thing.
        """
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text


def run_convert(*args):

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    subprocess.Popen(args, env=environment).wait()


def run_unpaper(args):
    unpaper, pnm = args
    subprocess.Popen(
        (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()


def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(RasterisedDocumentParser.SCRATCH, img)) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
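For orientation, a hedged sketch of how this parser would be driven. Only `document_path`, `get_text()` and `get_thumbnail()` appear in this commit, so the constructor signature here is an assumption, not a documented API:

# hypothetical driver code, not part of this commit
parser = RasterisedDocumentParser("/path/to/scan.pdf")  # assumed constructor
text = parser.get_text()        # convert -> unpaper -> tesseract pipeline
thumb = parser.get_thumbnail()  # 500px-wide PNG of the first page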
23  src/paperless_tesseract/signals.py  Normal file
@@ -0,0 +1,23 @@
import re

from .parsers import RasterisedDocumentParser


class ConsumerDeclaration(object):

    MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test

    @classmethod
    def test(cls, doc):

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
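For context, the consumer side presumably picks a parser by sending this signal and keeping the best-weighted match. A sketch under that assumption (`get_parser_class` is an invented name, not part of this commit; the signal mechanics are standard Django):

# hypothetical consumer-side dispatch, not part of this commit
from documents.signals import document_consumer_declaration


def get_parser_class(doc):
    # Each receiver returns a test callable (ConsumerDeclaration.handle
    # returns cls.test); calling it yields {"parser": ..., "weight": ...}
    # for files the parser claims, or None otherwise.
    options = []
    for _receiver, test in document_consumer_declaration.send(sender=None):
        option = test(doc)
        if option:
            options.append(option)
    if not options:
        return None
    # Highest weight wins when several parsers match the same file.
    return max(options, key=lambda option: option["weight"])["parser"]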
0  src/paperless_tesseract/tests/__init__.py  Normal file
BIN  src/paperless_tesseract/tests/samples/no-text.png  Normal file (binary file not shown; added, 32 KiB)
80  src/paperless_tesseract/tests/test_ocr.py  Normal file
@@ -0,0 +1,80 @@
import os
from unittest import mock, skipIf

import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError

from ..parsers import image_to_string, strip_excess_whitespace


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


class TestOCR(TestCase):

    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract
        results in a segmentation fault rooted somewhere deep inside pyocr
        where I don't care to dig. Regardless, if you run the consumer
        normally, text-free pages are now handled correctly so long as we
        work around this weird exception.
        """
        image_to_string(["no-text.png", "en"])
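Assuming a standard Django project layout, these tests should run with the usual test runner once the app is installed:

python manage.py test paperless_tesseract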