mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #197 from danielquinn/pluggable-consumers
Pluggable consumers
This commit is contained in:
commit
b7cb708053
@ -8,7 +8,9 @@ matrix:
|
|||||||
env: TOXENV=py34
|
env: TOXENV=py34
|
||||||
- python: 3.5
|
- python: 3.5
|
||||||
env: TOXENV=py35
|
env: TOXENV=py35
|
||||||
- python: 3.5
|
- python: 3.6
|
||||||
|
env: TOXENV=py36
|
||||||
|
- python: 3.6
|
||||||
env: TOXENV=pep8
|
env: TOXENV=pep8
|
||||||
|
|
||||||
install:
|
install:
|
||||||
|
@ -4,6 +4,14 @@ Changelog
|
|||||||
* 0.3.6
|
* 0.3.6
|
||||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||||
correspondent or the tags for a document.
|
correspondent or the tags for a document.
|
||||||
|
* The ``content`` field is now optional, to allow for the edge case of a
|
||||||
|
purely graphical document.
|
||||||
|
* You can no longer add documents via the admin. This never worked in the
|
||||||
|
first place, so all I've done here is remove the link to the broken form.
|
||||||
|
* The consumer code has been heavily refactored to support a pluggable
|
||||||
|
interface. Install a paperless consumer via pip and tell paperless about
|
||||||
|
it with an environment variable, and you're good to go. Proper
|
||||||
|
documentation is on its way.
|
||||||
|
|
||||||
* 0.3.5
|
* 0.3.5
|
||||||
* A serious facelift for the documents listing page wherein we drop the
|
* A serious facelift for the documents listing page wherein we drop the
|
||||||
|
@ -67,6 +67,7 @@ class DocumentAdmin(CommonAdmin):
|
|||||||
|
|
||||||
def created_(self, obj):
|
def created_(self, obj):
|
||||||
return obj.created.date().strftime("%Y-%m-%d")
|
return obj.created.date().strftime("%Y-%m-%d")
|
||||||
|
created_.short_description = "Created"
|
||||||
|
|
||||||
def thumbnail(self, obj):
|
def thumbnail(self, obj):
|
||||||
png_img = self._html_tag(
|
png_img = self._html_tag(
|
||||||
|
@ -1,35 +1,21 @@
|
|||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
import shutil
|
|
||||||
import hashlib
|
|
||||||
import logging
|
|
||||||
import datetime
|
|
||||||
import tempfile
|
|
||||||
import itertools
|
|
||||||
import subprocess
|
|
||||||
from multiprocessing.pool import Pool
|
|
||||||
|
|
||||||
import pyocr
|
|
||||||
import langdetect
|
|
||||||
from PIL import Image
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
from pyocr.tesseract import TesseractError
|
|
||||||
from pyocr.libtesseract.tesseract_raw import \
|
|
||||||
TesseractError as OtherTesseractError
|
|
||||||
|
|
||||||
from .models import Tag, Document, FileInfo
|
from .models import Document, FileInfo, Tag
|
||||||
|
from .parsers import ParseError
|
||||||
from .signals import (
|
from .signals import (
|
||||||
document_consumption_started,
|
document_consumer_declaration,
|
||||||
document_consumption_finished
|
document_consumption_finished,
|
||||||
|
document_consumption_started
|
||||||
)
|
)
|
||||||
from .languages import ISO639
|
|
||||||
|
|
||||||
|
|
||||||
class OCRError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class ConsumerError(Exception):
|
class ConsumerError(Exception):
|
||||||
@ -47,13 +33,7 @@ class Consumer(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
SCRATCH = settings.SCRATCH_DIR
|
SCRATCH = settings.SCRATCH_DIR
|
||||||
CONVERT = settings.CONVERT_BINARY
|
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
|
||||||
CONSUME = settings.CONSUMPTION_DIR
|
CONSUME = settings.CONSUMPTION_DIR
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
|
||||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
|
||||||
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
@ -78,6 +58,16 @@ class Consumer(object):
|
|||||||
raise ConsumerError(
|
raise ConsumerError(
|
||||||
"Consumption directory {} does not exist".format(self.CONSUME))
|
"Consumption directory {} does not exist".format(self.CONSUME))
|
||||||
|
|
||||||
|
self.parsers = []
|
||||||
|
for response in document_consumer_declaration.send(self):
|
||||||
|
self.parsers.append(response[1])
|
||||||
|
|
||||||
|
if not self.parsers:
|
||||||
|
raise ConsumerError(
|
||||||
|
"No parsers could be found, not even the default. "
|
||||||
|
"This is a problem."
|
||||||
|
)
|
||||||
|
|
||||||
def log(self, level, message):
|
def log(self, level, message):
|
||||||
getattr(self.logger, level)(message, extra={
|
getattr(self.logger, level)(message, extra={
|
||||||
"group": self.logging_group
|
"group": self.logging_group
|
||||||
@ -109,6 +99,13 @@ class Consumer(object):
|
|||||||
self._ignore.append(doc)
|
self._ignore.append(doc)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
parser_class = self._get_parser_class(doc)
|
||||||
|
if not parser_class:
|
||||||
|
self.log(
|
||||||
|
"info", "No parsers could be found for {}".format(doc))
|
||||||
|
self._ignore.append(doc)
|
||||||
|
continue
|
||||||
|
|
||||||
self.logging_group = uuid.uuid4()
|
self.logging_group = uuid.uuid4()
|
||||||
|
|
||||||
self.log("info", "Consuming {}".format(doc))
|
self.log("info", "Consuming {}".format(doc))
|
||||||
@ -119,25 +116,26 @@ class Consumer(object):
|
|||||||
logging_group=self.logging_group
|
logging_group=self.logging_group
|
||||||
)
|
)
|
||||||
|
|
||||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
parsed_document = parser_class(doc)
|
||||||
imgs = self._get_greyscale(tempdir, doc)
|
thumbnail = parsed_document.get_thumbnail()
|
||||||
thumbnail = self._get_thumbnail(tempdir, doc)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
document = self._store(
|
||||||
document = self._store(self._get_ocr(imgs), doc, thumbnail)
|
parsed_document.get_text(),
|
||||||
|
doc,
|
||||||
except OCRError as e:
|
thumbnail
|
||||||
|
)
|
||||||
|
except ParseError as e:
|
||||||
|
|
||||||
self._ignore.append(doc)
|
self._ignore.append(doc)
|
||||||
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||||
self._cleanup_tempdir(tempdir)
|
parsed_document.cleanup()
|
||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
||||||
self._cleanup_tempdir(tempdir)
|
parsed_document.cleanup()
|
||||||
self._cleanup_doc(doc)
|
self._cleanup_doc(doc)
|
||||||
|
|
||||||
self.log(
|
self.log(
|
||||||
@ -151,142 +149,20 @@ class Consumer(object):
|
|||||||
logging_group=self.logging_group
|
logging_group=self.logging_group
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_greyscale(self, tempdir, doc):
|
def _get_parser_class(self, doc):
|
||||||
"""
|
"""
|
||||||
Greyscale images are easier for Tesseract to OCR
|
Determine the appropriate parser class based on the file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
options = []
|
||||||
|
for parser in self.parsers:
|
||||||
|
result = parser(doc)
|
||||||
|
if result:
|
||||||
|
options.append(result)
|
||||||
|
|
||||||
# Convert PDF to multiple PNMs
|
# Return the parser with the highest weight.
|
||||||
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
return sorted(
|
||||||
run_convert(
|
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||||
self.CONVERT,
|
|
||||||
"-density", str(self.DENSITY),
|
|
||||||
"-depth", "8",
|
|
||||||
"-type", "grayscale",
|
|
||||||
doc, pnm,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get a list of converted images
|
|
||||||
pnms = []
|
|
||||||
for f in os.listdir(tempdir):
|
|
||||||
if f.endswith(".pnm"):
|
|
||||||
pnms.append(os.path.join(tempdir, f))
|
|
||||||
|
|
||||||
# Run unpaper in parallel on converted images
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
|
||||||
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
|
||||||
|
|
||||||
# Return list of converted images, processed with unpaper
|
|
||||||
pnms = []
|
|
||||||
for f in os.listdir(tempdir):
|
|
||||||
if f.endswith(".unpaper.pnm"):
|
|
||||||
pnms.append(os.path.join(tempdir, f))
|
|
||||||
|
|
||||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
|
||||||
|
|
||||||
def _get_thumbnail(self, tempdir, doc):
|
|
||||||
"""
|
|
||||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.log("info", "Generating the thumbnail")
|
|
||||||
|
|
||||||
run_convert(
|
|
||||||
self.CONVERT,
|
|
||||||
"-scale", "500x5000",
|
|
||||||
"-alpha", "remove",
|
|
||||||
doc, os.path.join(tempdir, "convert-%04d.png")
|
|
||||||
)
|
|
||||||
|
|
||||||
return os.path.join(tempdir, "convert-0000.png")
|
|
||||||
|
|
||||||
def _guess_language(self, text):
|
|
||||||
try:
|
|
||||||
guess = langdetect.detect(text)
|
|
||||||
self.log("debug", "Language detected: {}".format(guess))
|
|
||||||
return guess
|
|
||||||
except Exception as e:
|
|
||||||
self.log("warning", "Language detection error: {}".format(e))
|
|
||||||
|
|
||||||
def _get_ocr(self, imgs):
|
|
||||||
"""
|
|
||||||
Attempts to do the best job possible OCR'ing the document based on
|
|
||||||
simple language detection trial & error.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not imgs:
|
|
||||||
raise OCRError("No images found")
|
|
||||||
|
|
||||||
self.log("info", "OCRing the document")
|
|
||||||
|
|
||||||
# Since the division gets rounded down by int, this calculation works
|
|
||||||
# for every edge-case, i.e. 1
|
|
||||||
middle = int(len(imgs) / 2)
|
|
||||||
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
|
||||||
|
|
||||||
guessed_language = self._guess_language(raw_text)
|
|
||||||
|
|
||||||
if not guessed_language or guessed_language not in ISO639:
|
|
||||||
self.log("warning", "Language detection failed!")
|
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self.log(
|
|
||||||
"warning",
|
|
||||||
"As FORGIVING_OCR is enabled, we're going to make the "
|
|
||||||
"best with what we have."
|
|
||||||
)
|
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
|
||||||
return raw_text
|
|
||||||
raise OCRError("Language detection failed")
|
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
|
||||||
return raw_text
|
|
||||||
|
|
||||||
try:
|
|
||||||
return self._ocr(imgs, ISO639[guessed_language])
|
|
||||||
except pyocr.pyocr.tesseract.TesseractError:
|
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self.log(
|
|
||||||
"warning",
|
|
||||||
"OCR for {} failed, but we're going to stick with what "
|
|
||||||
"we've got since FORGIVING_OCR is enabled.".format(
|
|
||||||
guessed_language
|
|
||||||
)
|
|
||||||
)
|
|
||||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
|
||||||
return raw_text
|
|
||||||
raise OCRError(
|
|
||||||
"The guessed language is not available in this instance of "
|
|
||||||
"Tesseract."
|
|
||||||
)
|
|
||||||
|
|
||||||
def _assemble_ocr_sections(self, imgs, middle, text):
|
|
||||||
"""
|
|
||||||
Given a `middle` value and the text that middle page represents, we OCR
|
|
||||||
the remainder of the document and return the whole thing.
|
|
||||||
"""
|
|
||||||
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
|
||||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
|
||||||
return text
|
|
||||||
|
|
||||||
def _ocr(self, imgs, lang):
|
|
||||||
"""
|
|
||||||
Performs a single OCR attempt.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not imgs:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
self.log("info", "Parsing for {}".format(lang))
|
|
||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
|
||||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
|
||||||
r = " ".join(r)
|
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
|
||||||
return strip_excess_whitespace(r)
|
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
@ -332,10 +208,6 @@ class Consumer(object):
|
|||||||
|
|
||||||
return document
|
return document
|
||||||
|
|
||||||
def _cleanup_tempdir(self, d):
|
|
||||||
self.log("debug", "Deleting directory {}".format(d))
|
|
||||||
shutil.rmtree(d)
|
|
||||||
|
|
||||||
def _cleanup_doc(self, doc):
|
def _cleanup_doc(self, doc):
|
||||||
self.log("debug", "Deleting document {}".format(doc))
|
self.log("debug", "Deleting document {}".format(doc))
|
||||||
os.unlink(doc)
|
os.unlink(doc)
|
||||||
@ -361,41 +233,3 @@ class Consumer(object):
|
|||||||
with open(doc, "rb") as f:
|
with open(doc, "rb") as f:
|
||||||
checksum = hashlib.md5(f.read()).hexdigest()
|
checksum = hashlib.md5(f.read()).hexdigest()
|
||||||
return Document.objects.filter(checksum=checksum).exists()
|
return Document.objects.filter(checksum=checksum).exists()
|
||||||
|
|
||||||
|
|
||||||
def strip_excess_whitespace(text):
|
|
||||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
|
||||||
no_leading_whitespace = re.sub(
|
|
||||||
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
|
|
||||||
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
|
|
||||||
return no_trailing_whitespace
|
|
||||||
|
|
||||||
|
|
||||||
def image_to_string(args):
|
|
||||||
img, lang = args
|
|
||||||
ocr = pyocr.get_available_tools()[0]
|
|
||||||
with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
|
|
||||||
if ocr.can_detect_orientation():
|
|
||||||
try:
|
|
||||||
orientation = ocr.detect_orientation(f, lang=lang)
|
|
||||||
f = f.rotate(orientation["angle"], expand=1)
|
|
||||||
except (TesseractError, OtherTesseractError):
|
|
||||||
pass
|
|
||||||
return ocr.image_to_string(f, lang=lang)
|
|
||||||
|
|
||||||
|
|
||||||
def run_unpaper(args):
|
|
||||||
unpaper, pnm = args
|
|
||||||
subprocess.Popen(
|
|
||||||
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
|
|
||||||
|
|
||||||
|
|
||||||
def run_convert(*args):
|
|
||||||
|
|
||||||
environment = os.environ.copy()
|
|
||||||
if settings.CONVERT_MEMORY_LIMIT:
|
|
||||||
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
|
||||||
if settings.CONVERT_TMPDIR:
|
|
||||||
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
|
|
||||||
|
|
||||||
subprocess.Popen(args, env=environment).wait()
|
|
||||||
|
@ -158,13 +158,22 @@ class Document(models.Model):
|
|||||||
|
|
||||||
correspondent = models.ForeignKey(
|
correspondent = models.ForeignKey(
|
||||||
Correspondent, blank=True, null=True, related_name="documents")
|
Correspondent, blank=True, null=True, related_name="documents")
|
||||||
|
|
||||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||||
content = models.TextField(db_index=True)
|
|
||||||
|
content = models.TextField(
|
||||||
|
db_index=True,
|
||||||
|
blank=True,
|
||||||
|
help_text="The raw, text-only data of the document. This field is "
|
||||||
|
"primarily used for searching."
|
||||||
|
)
|
||||||
|
|
||||||
file_type = models.CharField(
|
file_type = models.CharField(
|
||||||
max_length=4,
|
max_length=4,
|
||||||
editable=False,
|
editable=False,
|
||||||
choices=tuple([(t, t.upper()) for t in TYPES])
|
choices=tuple([(t, t.upper()) for t in TYPES])
|
||||||
)
|
)
|
||||||
|
|
||||||
tags = models.ManyToManyField(
|
tags = models.ManyToManyField(
|
||||||
Tag, related_name="documents", blank=True)
|
Tag, related_name="documents", blank=True)
|
||||||
|
|
||||||
|
45
src/documents/parsers.py
Normal file
45
src/documents/parsers.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
class ParseError(Exception):
    """Signals that a document parser failed to process a file."""
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentParser(object):
    """
    Base class for document parsers.

    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.  `get_thumbnail()` and
    `get_text()` only raise `NotImplementedError` here and are meant to be
    overridden.
    """

    # Parent directory for the per-instance working directory.
    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        """
        Remember the document's path and create a private working directory
        (prefixed "paperless") inside `SCRATCH`.
        """
        self.logger = logging.getLogger(__name__)
        self.logging_group = None
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """
        Send `message` to this parser's logger using the logger method named
        by `level` (e.g. "debug", "info"), tagged with `logging_group`.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})

    def cleanup(self):
        """
        Delete the working directory created in `__init__`, contents included.
        """
        workdir = self.tempdir
        self.log("debug", "Deleting directory {}".format(workdir))
        shutil.rmtree(workdir)
|
@ -2,3 +2,4 @@ from django.dispatch import Signal
|
|||||||
|
|
||||||
document_consumption_started = Signal(providing_args=["filename"])
|
document_consumption_started = Signal(providing_args=["filename"])
|
||||||
document_consumption_finished = Signal(providing_args=["document"])
|
document_consumption_finished = Signal(providing_args=["document"])
|
||||||
|
document_consumer_declaration = Signal(providing_args=[])
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from subprocess import Popen
|
from subprocess import Popen
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
@ -158,7 +158,7 @@
|
|||||||
|
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
// We nee to re-build the select-all functionality as the old logic pointed
|
// We need to re-build the select-all functionality as the old logic pointed
|
||||||
// to a table and we're using divs now.
|
// to a table and we're using divs now.
|
||||||
django.jQuery("#action-toggle").on("change", function(){
|
django.jQuery("#action-toggle").on("change", function(){
|
||||||
django.jQuery(".grid .box .result .checkbox input")
|
django.jQuery(".grid .box .result .checkbox input")
|
||||||
|
@ -1,13 +1,6 @@
|
|||||||
import os
|
|
||||||
from unittest import mock, skipIf
|
|
||||||
|
|
||||||
import pyocr
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from pyocr.libtesseract.tesseract_raw import \
|
|
||||||
TesseractError as OtherTesseractError
|
|
||||||
|
|
||||||
from ..models import FileInfo
|
from ..models import FileInfo
|
||||||
from ..consumer import image_to_string, strip_excess_whitespace
|
|
||||||
|
|
||||||
|
|
||||||
class TestAttributes(TestCase):
|
class TestAttributes(TestCase):
|
||||||
@ -308,71 +301,3 @@ class TestFieldPermutations(TestCase):
|
|||||||
}
|
}
|
||||||
self._test_guessed_attributes(
|
self._test_guessed_attributes(
|
||||||
template.format(**spec), **spec)
|
template.format(**spec), **spec)
|
||||||
|
|
||||||
|
|
||||||
class FakeTesseract(object):
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def can_detect_orientation():
|
|
||||||
return True
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def detect_orientation(file_handle, lang):
|
|
||||||
raise OtherTesseractError("arbitrary status", "message")
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def image_to_string(file_handle, lang):
|
|
||||||
return "This is test text"
|
|
||||||
|
|
||||||
|
|
||||||
class FakePyOcr(object):
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_available_tools():
|
|
||||||
return [FakeTesseract]
|
|
||||||
|
|
||||||
|
|
||||||
class TestOCR(TestCase):
|
|
||||||
|
|
||||||
text_cases = [
|
|
||||||
("simple string", "simple string"),
|
|
||||||
(
|
|
||||||
"simple newline\n testing string",
|
|
||||||
"simple newline\ntesting string"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"utf-8 строка с пробелами в конце ",
|
|
||||||
"utf-8 строка с пробелами в конце"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
|
||||||
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
|
|
||||||
|
|
||||||
def test_strip_excess_whitespace(self):
|
|
||||||
for source, result in self.text_cases:
|
|
||||||
actual_result = strip_excess_whitespace(source)
|
|
||||||
self.assertEqual(
|
|
||||||
result,
|
|
||||||
actual_result,
|
|
||||||
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
|
|
||||||
source,
|
|
||||||
result,
|
|
||||||
actual_result
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
|
|
||||||
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
|
|
||||||
@mock.patch("documents.consumer.pyocr", FakePyOcr)
|
|
||||||
def test_image_to_string_with_text_free_page(self):
|
|
||||||
"""
|
|
||||||
This test is sort of silly, since it's really just reproducing an odd
|
|
||||||
exception thrown by pyocr when it encounters a page with no text.
|
|
||||||
Actually running this test against an installation of Tesseract results
|
|
||||||
in a segmentation fault rooted somewhere deep inside pyocr where I
|
|
||||||
don't care to dig. Regardless, if you run the consumer normally,
|
|
||||||
text-free pages are now handled correctly so long as we work around
|
|
||||||
this weird exception.
|
|
||||||
"""
|
|
||||||
image_to_string(["no-text.png", "en"])
|
|
||||||
|
@ -61,6 +61,7 @@ INSTALLED_APPS = [
|
|||||||
"django_extensions",
|
"django_extensions",
|
||||||
|
|
||||||
"documents.apps.DocumentsConfig",
|
"documents.apps.DocumentsConfig",
|
||||||
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
|
|
||||||
"flat_responsive",
|
"flat_responsive",
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
@ -70,6 +71,9 @@ INSTALLED_APPS = [
|
|||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if os.getenv("PAPERLESS_INSTALLED_APPS"):
|
||||||
|
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
|
||||||
|
|
||||||
MIDDLEWARE_CLASSES = [
|
MIDDLEWARE_CLASSES = [
|
||||||
'django.middleware.security.SecurityMiddleware',
|
'django.middleware.security.SecurityMiddleware',
|
||||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = (0, 3, 5)
|
__version__ = (0, 3, 6)
|
||||||
|
0
src/paperless_tesseract/__init__.py
Normal file
0
src/paperless_tesseract/__init__.py
Normal file
16
src/paperless_tesseract/apps.py
Normal file
16
src/paperless_tesseract/apps.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class PaperlessTesseractConfig(AppConfig):
    """Application configuration for the ``paperless_tesseract`` app."""

    name = "paperless_tesseract"

    def ready(self):
        """
        Connect ``ConsumerDeclaration.handle`` to the documents app's
        ``document_consumer_declaration`` signal, then run the base class's
        ``ready()``.
        """

        # Imported locally, as in the original layout -- presumably so the
        # modules are only loaded once Django calls ready(); confirm before
        # hoisting these to module level.
        from documents.signals import (
            document_consumer_declaration as declaration_signal
        )
        from .signals import ConsumerDeclaration

        declaration_signal.connect(ConsumerDeclaration.handle)

        super().ready()
|
214
src/paperless_tesseract/parsers.py
Normal file
214
src/paperless_tesseract/parsers.py
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
import itertools
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
from multiprocessing.pool import Pool
|
||||||
|
|
||||||
|
import langdetect
|
||||||
|
import pyocr
|
||||||
|
from django.conf import settings
|
||||||
|
from documents.parsers import DocumentParser, ParseError
|
||||||
|
from PIL import Image
|
||||||
|
from pyocr.libtesseract.tesseract_raw import \
|
||||||
|
TesseractError as OtherTesseractError
|
||||||
|
from pyocr.tesseract import TesseractError
|
||||||
|
|
||||||
|
from .languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
|
class OCRError(Exception):
    """Signals that optical character recognition could not be completed."""
|
||||||
|
|
||||||
|
|
||||||
|
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    # All tunables are read from Django settings once, at class creation.
    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.

        Returns the path of the first generated PNG inside the working
        directory.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):
        """
        Convert the document to greyscale page images and OCR them.

        Raises ParseError (chained to the underlying OCRError) when OCR
        fails.
        """

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            # Keep the OCR failure attached as the cause for debugging.
            raise ParseError(e) from e

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR

        Returns a sorted list of paths to the unpaper-processed page images.
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images
        pnms = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".pnm")
        ]

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        pnms = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".unpaper.pnm")
        ]

        return sorted(filter(os.path.isfile, pnms))

    def _guess_language(self, text):
        """
        Return the language langdetect reports for `text`, or None when
        detection raises.  Failure is logged rather than propagated because
        the caller decides how to proceed without a guess.
        """
        try:
            guess = langdetect.detect(text)
        except Exception as e:
            self.log("warning", "Language detection error: {}".format(e))
            return None

        self.log("debug", "Language detected: {}".format(guess))
        return guess

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.

        The middle page is OCR'd with the default language and the result is
        used to guess the document's language.  Raises OCRError when there
        are no images, or when detection / the guessed language fails and
        settings.FORGIVING_OCR is off.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Floor division gives a valid index for every non-empty list,
        # including a single page (index 0).
        middle = len(imgs) // 2
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            # The middle page is already done in the right language; only
            # the remaining pages need OCR.
            return self._assemble_ocr_sections(imgs, middle, raw_text)

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except (TesseractError, OtherTesseractError) as e:
            # Both pyocr back-ends are covered, matching what
            # image_to_string() already tolerates.
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            ) from e

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.

        OCRs every image in `imgs` with language `lang` in a worker pool and
        returns the page texts joined by single spaces, with excess
        whitespace stripped.  An empty `imgs` yields "".
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.
        """
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text
|
||||||
|
|
||||||
|
|
||||||
|
def run_convert(*args):
    """
    Run the command line given by `args` (program first) and wait for it to
    finish.

    The child gets a copy of the current environment.  MAGICK_MEMORY_LIMIT
    and MAGICK_TMPDIR are added from settings.CONVERT_MEMORY_LIMIT and
    settings.CONVERT_TMPDIR when those settings are truthy.  The exit status
    is not inspected and nothing is returned.
    """

    environment = os.environ.copy()

    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            environment[variable] = value

    process = subprocess.Popen(args, env=environment)
    process.wait()
|
||||||
|
|
||||||
|
|
||||||
|
def run_unpaper(args):
    """
    Run unpaper on one image and wait for it to finish.

    `args` is an ``(unpaper_binary, pnm_path)`` pair, packed into a single
    argument so the function can be used with ``Pool.map``.  The output is
    written next to the input with ".unpaper" inserted before the ".pnm"
    extension.  The exit status is not inspected.
    """
    unpaper, pnm = args

    suffix = ".pnm"
    if pnm.endswith(suffix):
        # Rewrite only the extension: a blanket str.replace() would also
        # mangle any directory component that happens to contain ".pnm".
        cleaned = pnm[:-len(suffix)] + ".unpaper" + suffix
    else:
        # Not the expected extension: keep the historical naming rule.
        cleaned = pnm.replace(suffix, ".unpaper" + suffix)

    subprocess.Popen((unpaper, pnm, cleaned)).wait()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_excess_whitespace(text):
    """
    Normalise the horizontal whitespace in `text` while keeping line breaks.

    * every run of whitespace other than CR/LF becomes a single space,
    * whitespace directly following a line break is dropped,
    * whitespace at the very end of the text is dropped.

    The patterns are raw strings so that ``\\S`` reaches the regex engine
    as written instead of being an invalid string escape.
    """
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace)
    return no_trailing_whitespace
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_string(args):
    """
    OCR a single image and return the recognised text.

    `args` is an ``(image_path, language)`` pair, packed into one argument
    so the function can be mapped over a worker pool.  The first tool
    reported by pyocr is used.  When that tool can detect orientation the
    image is rotated by the detected angle first; a Tesseract error during
    detection is ignored and the image is OCR'd as it is.
    """
    img, lang = args
    engine = pyocr.get_available_tools()[0]

    # os.path.join() returns `img` unchanged when it is already absolute.
    location = os.path.join(RasterisedDocumentParser.SCRATCH, img)

    with Image.open(location) as f:
        page = f
        if engine.can_detect_orientation():
            try:
                detected = engine.detect_orientation(page, lang=lang)
                page = page.rotate(detected["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Best effort only: fall through with the unrotated image.
                pass
        return engine.image_to_string(page, lang=lang)
|
23
src/paperless_tesseract/signals.py
Normal file
23
src/paperless_tesseract/signals.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from .parsers import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class ConsumerDeclaration(object):
    """
    Advertises ``RasterisedDocumentParser`` for the file types it handles.

    ``handle`` has a signal-receiver signature and hands back ``test``;
    ``test`` then decides, per file name, whether this parser applies.
    """

    # File names ending in one of these (lower-case) extensions are claimed.
    # Raw string so that ``\.`` reaches the regex engine instead of relying
    # on an invalid string escape being passed through.
    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Ignore ``sender`` and ``kwargs`` and return the ``test`` callable."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for ``doc``, or ``None``.

        ``doc`` is a file name/path string.  When it matches
        ``MATCHING_FILES`` the result is a dict with the parser class under
        ``"parser"`` and a ``"weight"`` of 0; otherwise ``None``.
        """
        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
|
0
src/paperless_tesseract/tests/__init__.py
Normal file
0
src/paperless_tesseract/tests/__init__.py
Normal file
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
80
src/paperless_tesseract/tests/test_ocr.py
Normal file
80
src/paperless_tesseract/tests/test_ocr.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import os
|
||||||
|
from unittest import mock, skipIf
|
||||||
|
|
||||||
|
import pyocr
|
||||||
|
from django.test import TestCase
|
||||||
|
from pyocr.libtesseract.tesseract_raw import \
|
||||||
|
TesseractError as OtherTesseractError
|
||||||
|
|
||||||
|
from ..parsers import image_to_string, strip_excess_whitespace
|
||||||
|
|
||||||
|
|
||||||
|
class FakeTesseract:
    """
    Minimal stand-in for a pyocr tool.

    It claims to support orientation detection, always fails when asked to
    perform it, and returns a fixed string from ``image_to_string``.
    """

    @staticmethod
    def can_detect_orientation():
        """Report orientation support so callers attempt detection."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always raise the libtesseract-flavoured ``TesseractError``."""
        failure = OtherTesseractError("arbitrary status", "message")
        raise failure

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return canned text regardless of the arguments."""
        canned = "This is test text"
        return canned
|
||||||
|
|
||||||
|
|
||||||
|
class FakePyOcr:
    """Stand-in for the ``pyocr`` module exposing one fake tool."""

    @staticmethod
    def get_available_tools():
        """Return a one-element list holding the ``FakeTesseract`` class."""
        tools = [FakeTesseract]
        return tools
|
||||||
|
|
||||||
|
|
||||||
|
class TestOCR(TestCase):
    """Tests for the whitespace clean-up and single-image OCR helpers."""

    # (input, expected output) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory of sample images; patched in as the parser's scratch dir.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # Evaluated once, at class-definition time, against the real pyocr.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Every entry in ``text_cases`` is normalised as expected."""
        for source, result in self.text_cases:
            # One sub-test per case so a failure does not hide later cases.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        image_to_string(["no-text.png", "en"])
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
[tox]
|
[tox]
|
||||||
skipsdist = True
|
skipsdist = True
|
||||||
envlist = py34, py35, pep8
|
envlist = py34, py35, py36, pep8
|
||||||
|
|
||||||
[testenv]
|
[testenv]
|
||||||
commands = {envpython} manage.py test
|
commands = {envpython} manage.py test
|
||||||
|
Loading…
x
Reference in New Issue
Block a user