From 55e81ca4bb13c31327ff02ab725f0b087306bf62 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 11 Mar 2017 16:30:49 +0000 Subject: [PATCH 1/5] feat: refactor for pluggable consumers I've broken out the OCR-specific code from the consumers and dumped it all into its own app, `paperless_tesseract`. This new app should serve as a sample of how to create one's own consumer for different file types. Documentation for how to do this isn't ready yet, but for the impatient: * Create a new app * containing a `parsers.py` for your parser modelled after `paperless_tesseract.parsers.RasterisedDocumentParser` * containing a `signals.py` with a handler modelled after `paperless_tesseract.signals.ConsumerDeclaration` * connect the signal handler to `documents.signals.document_consumer_declaration` in `your_app.apps` * Install the app into Paperless by declaring `PAPERLESS_INSTALLED_APPS=your_app`. Additional apps should be separated with commas. * Restart the consumer --- src/documents/consumer.py | 258 ++++-------------- src/documents/parsers.py | 45 +++ src/documents/signals/__init__.py | 1 + src/documents/signals/handlers.py | 1 - src/documents/tests/test_consumer.py | 73 ----- src/paperless/settings.py | 4 + src/paperless_tesseract/__init__.py | 0 src/paperless_tesseract/apps.py | 16 ++ .../languages.py | 0 src/paperless_tesseract/parsers.py | 214 +++++++++++++++ src/paperless_tesseract/signals.py | 23 ++ src/paperless_tesseract/tests/__init__.py | 0 .../tests/samples/no-text.png | Bin src/paperless_tesseract/tests/test_ocr.py | 80 ++++++ 14 files changed, 429 insertions(+), 286 deletions(-) create mode 100644 src/documents/parsers.py create mode 100644 src/paperless_tesseract/__init__.py create mode 100644 src/paperless_tesseract/apps.py rename src/{documents => paperless_tesseract}/languages.py (100%) create mode 100644 src/paperless_tesseract/parsers.py create mode 100644 src/paperless_tesseract/signals.py create mode 100644 src/paperless_tesseract/tests/__init__.py 
rename src/{documents => paperless_tesseract}/tests/samples/no-text.png (100%) create mode 100644 src/paperless_tesseract/tests/test_ocr.py diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 02397c118..65e74f3a8 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,35 +1,21 @@ +import datetime +import hashlib +import logging import os import re import uuid -import shutil -import hashlib -import logging -import datetime -import tempfile -import itertools -import subprocess -from multiprocessing.pool import Pool -import pyocr -import langdetect -from PIL import Image from django.conf import settings from django.utils import timezone from paperless.db import GnuPG -from pyocr.tesseract import TesseractError -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from .models import Tag, Document, FileInfo +from .models import Document, FileInfo, Tag +from .parsers import ParseError from .signals import ( - document_consumption_started, - document_consumption_finished + document_consumer_declaration, + document_consumption_finished, + document_consumption_started ) -from .languages import ISO639 - - -class OCRError(Exception): - pass class ConsumerError(Exception): @@ -47,13 +33,7 @@ class Consumer(object): """ SCRATCH = settings.SCRATCH_DIR - CONVERT = settings.CONVERT_BINARY - UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR - THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None - DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 - - DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE def __init__(self): @@ -78,6 +58,16 @@ class Consumer(object): raise ConsumerError( "Consumption directory {} does not exist".format(self.CONSUME)) + self.parsers = [] + for response in document_consumer_declaration.send(self): + self.parsers.append(response[1]) + + if not self.parsers: + raise ConsumerError( + "No parsers could be found, not even the 
default. " + "This is a problem." + ) + def log(self, level, message): getattr(self.logger, level)(message, extra={ "group": self.logging_group @@ -109,6 +99,13 @@ class Consumer(object): self._ignore.append(doc) continue + parser_class = self._get_parser_class(doc) + if not parser_class: + self.log( + "info", "No parsers could be found for {}".format(doc)) + self._ignore.append(doc) + continue + self.logging_group = uuid.uuid4() self.log("info", "Consuming {}".format(doc)) @@ -119,25 +116,26 @@ class Consumer(object): logging_group=self.logging_group ) - tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) - imgs = self._get_greyscale(tempdir, doc) - thumbnail = self._get_thumbnail(tempdir, doc) + parsed_document = parser_class(doc) + thumbnail = parsed_document.get_thumbnail() try: - - document = self._store(self._get_ocr(imgs), doc, thumbnail) - - except OCRError as e: + document = self._store( + parsed_document.get_text(), + doc, + thumbnail + ) + except ParseError as e: self._ignore.append(doc) - self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) - self._cleanup_tempdir(tempdir) + self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) + parsed_document.cleanup() continue else: - self._cleanup_tempdir(tempdir) + parsed_document.cleanup() self._cleanup_doc(doc) self.log( @@ -151,142 +149,20 @@ class Consumer(object): logging_group=self.logging_group ) - def _get_greyscale(self, tempdir, doc): + def _get_parser_class(self, doc): """ - Greyscale images are easier for Tesseract to OCR + Determine the appropriate parser class based on the file """ - self.log("info", "Generating greyscale image from {}".format(doc)) + options = [] + for parser in self.parsers: + result = parser(doc) + if result: + options.append(result) - # Convert PDF to multiple PNMs - pnm = os.path.join(tempdir, "convert-%04d.pnm") - run_convert( - self.CONVERT, - "-density", str(self.DENSITY), - "-depth", "8", - "-type", "grayscale", - doc, pnm, - ) - - # Get a list of 
converted images - pnms = [] - for f in os.listdir(tempdir): - if f.endswith(".pnm"): - pnms.append(os.path.join(tempdir, f)) - - # Run unpaper in parallel on converted images - with Pool(processes=self.THREADS) as pool: - pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) - - # Return list of converted images, processed with unpaper - pnms = [] - for f in os.listdir(tempdir): - if f.endswith(".unpaper.pnm"): - pnms.append(os.path.join(tempdir, f)) - - return sorted(filter(lambda __: os.path.isfile(__), pnms)) - - def _get_thumbnail(self, tempdir, doc): - """ - The thumbnail of a PDF is just a 500px wide image of the first page. - """ - - self.log("info", "Generating the thumbnail") - - run_convert( - self.CONVERT, - "-scale", "500x5000", - "-alpha", "remove", - doc, os.path.join(tempdir, "convert-%04d.png") - ) - - return os.path.join(tempdir, "convert-0000.png") - - def _guess_language(self, text): - try: - guess = langdetect.detect(text) - self.log("debug", "Language detected: {}".format(guess)) - return guess - except Exception as e: - self.log("warning", "Language detection error: {}".format(e)) - - def _get_ocr(self, imgs): - """ - Attempts to do the best job possible OCR'ing the document based on - simple language detection trial & error. - """ - - if not imgs: - raise OCRError("No images found") - - self.log("info", "OCRing the document") - - # Since the division gets rounded down by int, this calculation works - # for every edge-case, i.e. 1 - middle = int(len(imgs) / 2) - raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) - - guessed_language = self._guess_language(raw_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed!") - if settings.FORGIVING_OCR: - self.log( - "warning", - "As FORGIVING_OCR is enabled, we're going to make the " - "best with what we have." 
- ) - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - raise OCRError("Language detection failed") - - if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - - try: - return self._ocr(imgs, ISO639[guessed_language]) - except pyocr.pyocr.tesseract.TesseractError: - if settings.FORGIVING_OCR: - self.log( - "warning", - "OCR for {} failed, but we're going to stick with what " - "we've got since FORGIVING_OCR is enabled.".format( - guessed_language - ) - ) - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - raise OCRError( - "The guessed language is not available in this instance of " - "Tesseract." - ) - - def _assemble_ocr_sections(self, imgs, middle, text): - """ - Given a `middle` value and the text that middle page represents, we OCR - the remainder of the document and return the whole thing. - """ - text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text - text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) - return text - - def _ocr(self, imgs, lang): - """ - Performs a single OCR attempt. - """ - - if not imgs: - return "" - - self.log("info", "Parsing for {}".format(lang)) - - with Pool(processes=self.THREADS) as pool: - r = pool.map(image_to_string, itertools.product(imgs, [lang])) - r = " ".join(r) - - # Strip out excess white space to allow matching to go smoother - return strip_excess_whitespace(r) + # Return the parser with the highest weight. 
+ return sorted( + options, key=lambda _: _["weight"], reverse=True)[0]["parser"] def _store(self, text, doc, thumbnail): @@ -332,10 +208,6 @@ class Consumer(object): return document - def _cleanup_tempdir(self, d): - self.log("debug", "Deleting directory {}".format(d)) - shutil.rmtree(d) - def _cleanup_doc(self, doc): self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) @@ -361,41 +233,3 @@ class Consumer(object): with open(doc, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() return Document.objects.filter(checksum=checksum).exists() - - -def strip_excess_whitespace(text): - collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) - no_leading_whitespace = re.sub( - "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) - no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) - return no_trailing_whitespace - - -def image_to_string(args): - img, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except (TesseractError, OtherTesseractError): - pass - return ocr.image_to_string(f, lang=lang) - - -def run_unpaper(args): - unpaper, pnm = args - subprocess.Popen( - (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() - - -def run_convert(*args): - - environment = os.environ.copy() - if settings.CONVERT_MEMORY_LIMIT: - environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT - if settings.CONVERT_TMPDIR: - environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR - - subprocess.Popen(args, env=environment).wait() diff --git a/src/documents/parsers.py b/src/documents/parsers.py new file mode 100644 index 000000000..9f63cbbcd --- /dev/null +++ b/src/documents/parsers.py @@ -0,0 +1,45 @@ +import logging +import shutil +import tempfile + +from django.conf import settings + + +class ParseError(Exception): + pass + + +class 
DocumentParser(object): + """ + Subclass this to make your own parser. Have a look at + `paperless_tesseract.parsers` for inspiration. + """ + + SCRATCH = settings.SCRATCH_DIR + + def __init__(self, path): + self.document_path = path + self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) + self.logger = logging.getLogger(__name__) + self.logging_group = None + + def get_thumbnail(self): + """ + Returns the path to a file we can use as a thumbnail for this document. + """ + raise NotImplementedError() + + def get_text(self): + """ + Returns the text from the document and only the text. + """ + raise NotImplementedError() + + def log(self, level, message): + getattr(self.logger, level)(message, extra={ + "group": self.logging_group + }) + + def cleanup(self): + self.log("debug", "Deleting directory {}".format(self.tempdir)) + shutil.rmtree(self.tempdir) diff --git a/src/documents/signals/__init__.py b/src/documents/signals/__init__.py index 257a20d46..810f14f49 100644 --- a/src/documents/signals/__init__.py +++ b/src/documents/signals/__init__.py @@ -2,3 +2,4 @@ from django.dispatch import Signal document_consumption_started = Signal(providing_args=["filename"]) document_consumption_finished = Signal(providing_args=["document"]) +document_consumer_declaration = Signal(providing_args=[]) diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index a4096154b..e6fecbf6a 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -1,6 +1,5 @@ import logging import os - from subprocess import Popen from django.conf import settings diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 1ec4e3945..873b50d5b 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,13 +1,6 @@ -import os -from unittest import mock, skipIf - -import pyocr from django.test import TestCase -from pyocr.libtesseract.tesseract_raw import \ - 
TesseractError as OtherTesseractError from ..models import FileInfo -from ..consumer import image_to_string, strip_excess_whitespace class TestAttributes(TestCase): @@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase): template.format(**spec), **spec) -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise OtherTesseractError("arbitrary status", "message") - - @staticmethod - def image_to_string(file_handle, lang): - return "This is test text" - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] - - -class TestOCR(TestCase): - - text_cases = [ - ("simple string", "simple string"), - ( - "simple newline\n testing string", - "simple newline\ntesting string" - ), - ( - "utf-8 строка с пробелами в конце ", - "utf-8 строка с пробелами в конце" - ) - ] - - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) - - def test_strip_excess_whitespace(self): - for source, result in self.text_cases: - actual_result = strip_excess_whitespace(source) - self.assertEqual( - result, - actual_result, - "strip_exceess_whitespace({}) != '{}', but '{}'".format( - source, - result, - actual_result - ) - ) - - @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") - @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) - @mock.patch("documents.consumer.pyocr", FakePyOcr) - def test_image_to_string_with_text_free_page(self): - """ - This test is sort of silly, since it's really just reproducing an odd - exception thrown by pyocr when it encounters a page with no text. - Actually running this test against an installation of Tesseract results - in a segmentation fault rooted somewhere deep inside pyocr where I - don't care to dig. 
Regardless, if you run the consumer normally, - text-free pages are now handled correctly so long as we work around - this weird exception. - """ - image_to_string(["no-text.png", "en"]) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 8f03942dd..edd0da9f3 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -61,6 +61,7 @@ INSTALLED_APPS = [ "django_extensions", "documents.apps.DocumentsConfig", + "paperless_tesseract.apps.PaperlessTesseractConfig", "flat_responsive", "django.contrib.admin", @@ -70,6 +71,9 @@ INSTALLED_APPS = [ ] +if os.getenv("PAPERLESS_INSTALLED_APPS"): + INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",") + MIDDLEWARE_CLASSES = [ 'django.middleware.security.SecurityMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', diff --git a/src/paperless_tesseract/__init__.py b/src/paperless_tesseract/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_tesseract/apps.py b/src/paperless_tesseract/apps.py new file mode 100644 index 000000000..bdb430bea --- /dev/null +++ b/src/paperless_tesseract/apps.py @@ -0,0 +1,16 @@ +from django.apps import AppConfig + + +class PaperlessTesseractConfig(AppConfig): + + name = "paperless_tesseract" + + def ready(self): + + from documents.signals import document_consumer_declaration + + from .signals import ConsumerDeclaration + + document_consumer_declaration.connect(ConsumerDeclaration.handle) + + AppConfig.ready(self) diff --git a/src/documents/languages.py b/src/paperless_tesseract/languages.py similarity index 100% rename from src/documents/languages.py rename to src/paperless_tesseract/languages.py diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py new file mode 100644 index 000000000..0c5d039e1 --- /dev/null +++ b/src/paperless_tesseract/parsers.py @@ -0,0 +1,214 @@ +import itertools +import os +import re +import subprocess +from multiprocessing.pool import Pool + +import 
langdetect +import pyocr +from django.conf import settings +from documents.parsers import DocumentParser, ParseError +from PIL import Image +from pyocr.libtesseract.tesseract_raw import \ + TesseractError as OtherTesseractError +from pyocr.tesseract import TesseractError + +from .languages import ISO639 + + +class OCRError(Exception): + pass + + +class RasterisedDocumentParser(DocumentParser): + """ + This parser uses Tesseract to try and get some text out of a rasterised + image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) + """ + + CONVERT = settings.CONVERT_BINARY + DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 + THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None + UNPAPER = settings.UNPAPER_BINARY + DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE + + def get_thumbnail(self): + """ + The thumbnail of a PDF is just a 500px wide image of the first page. + """ + + run_convert( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + self.document_path, os.path.join(self.tempdir, "convert-%04d.png") + ) + + return os.path.join(self.tempdir, "convert-0000.png") + + def get_text(self): + + images = self._get_greyscale() + + try: + + return self._get_ocr(images) + except OCRError as e: + raise ParseError(e) + + def _get_greyscale(self): + """ + Greyscale images are easier for Tesseract to OCR + """ + + # Convert PDF to multiple PNMs + pnm = os.path.join(self.tempdir, "convert-%04d.pnm") + run_convert( + self.CONVERT, + "-density", str(self.DENSITY), + "-depth", "8", + "-type", "grayscale", + self.document_path, pnm, + ) + + # Get a list of converted images + pnms = [] + for f in os.listdir(self.tempdir): + if f.endswith(".pnm"): + pnms.append(os.path.join(self.tempdir, f)) + + # Run unpaper in parallel on converted images + with Pool(processes=self.THREADS) as pool: + pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) + + # Return list of converted images, processed with unpaper + pnms = 
[] + for f in os.listdir(self.tempdir): + if f.endswith(".unpaper.pnm"): + pnms.append(os.path.join(self.tempdir, f)) + + return sorted(filter(lambda __: os.path.isfile(__), pnms)) + + def _guess_language(self, text): + try: + guess = langdetect.detect(text) + self.log("debug", "Language detected: {}".format(guess)) + return guess + except Exception as e: + self.log("warning", "Language detection error: {}".format(e)) + + def _get_ocr(self, imgs): + """ + Attempts to do the best job possible OCR'ing the document based on + simple language detection trial & error. + """ + + if not imgs: + raise OCRError("No images found") + + self.log("info", "OCRing the document") + + # Since the division gets rounded down by int, this calculation works + # for every edge-case, i.e. 1 + middle = int(len(imgs) / 2) + raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) + + guessed_language = self._guess_language(raw_text) + + if not guessed_language or guessed_language not in ISO639: + self.log("warning", "Language detection failed!") + if settings.FORGIVING_OCR: + self.log( + "warning", + "As FORGIVING_OCR is enabled, we're going to make the " + "best with what we have." + ) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) + return raw_text + raise OCRError("Language detection failed") + + if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) + return raw_text + + try: + return self._ocr(imgs, ISO639[guessed_language]) + except pyocr.pyocr.tesseract.TesseractError: + if settings.FORGIVING_OCR: + self.log( + "warning", + "OCR for {} failed, but we're going to stick with what " + "we've got since FORGIVING_OCR is enabled.".format( + guessed_language + ) + ) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) + return raw_text + raise OCRError( + "The guessed language is not available in this instance of " + "Tesseract." 
+ ) + + def _ocr(self, imgs, lang): + """ + Performs a single OCR attempt. + """ + + if not imgs: + return "" + + self.log("info", "Parsing for {}".format(lang)) + + with Pool(processes=self.THREADS) as pool: + r = pool.map(image_to_string, itertools.product(imgs, [lang])) + r = " ".join(r) + + # Strip out excess white space to allow matching to go smoother + return strip_excess_whitespace(r) + + def _assemble_ocr_sections(self, imgs, middle, text): + """ + Given a `middle` value and the text that middle page represents, we OCR + the remainder of the document and return the whole thing. + """ + text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text + text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) + return text + + +def run_convert(*args): + + environment = os.environ.copy() + if settings.CONVERT_MEMORY_LIMIT: + environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT + if settings.CONVERT_TMPDIR: + environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR + + subprocess.Popen(args, env=environment).wait() + + +def run_unpaper(args): + unpaper, pnm = args + subprocess.Popen( + (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() + + +def strip_excess_whitespace(text): + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) + no_leading_whitespace = re.sub( + "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) + return no_trailing_whitespace + + +def image_to_string(args): + img, lang = args + ocr = pyocr.get_available_tools()[0] + with Image.open(os.path.join(RasterisedDocumentParser.SCRATCH, img)) as f: + if ocr.can_detect_orientation(): + try: + orientation = ocr.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except (TesseractError, OtherTesseractError): + pass + return ocr.image_to_string(f, lang=lang) diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py new file mode 100644 index 
000000000..3e5555383 --- /dev/null +++ b/src/paperless_tesseract/signals.py @@ -0,0 +1,23 @@ +import re + +from .parsers import RasterisedDocumentParser + + +class ConsumerDeclaration(object): + + MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$") + + @classmethod + def handle(cls, sender, **kwargs): + return cls.test + + @classmethod + def test(cls, doc): + + if cls.MATCHING_FILES.match(doc): + return { + "parser": RasterisedDocumentParser, + "weight": 0 + } + + return None diff --git a/src/paperless_tesseract/tests/__init__.py b/src/paperless_tesseract/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/documents/tests/samples/no-text.png b/src/paperless_tesseract/tests/samples/no-text.png similarity index 100% rename from src/documents/tests/samples/no-text.png rename to src/paperless_tesseract/tests/samples/no-text.png diff --git a/src/paperless_tesseract/tests/test_ocr.py b/src/paperless_tesseract/tests/test_ocr.py new file mode 100644 index 000000000..68ab64707 --- /dev/null +++ b/src/paperless_tesseract/tests/test_ocr.py @@ -0,0 +1,80 @@ +import os +from unittest import mock, skipIf + +import pyocr +from django.test import TestCase +from pyocr.libtesseract.tesseract_raw import \ + TesseractError as OtherTesseractError + +from ..parsers import image_to_string, strip_excess_whitespace + + +class FakeTesseract(object): + + @staticmethod + def can_detect_orientation(): + return True + + @staticmethod + def detect_orientation(file_handle, lang): + raise OtherTesseractError("arbitrary status", "message") + + @staticmethod + def image_to_string(file_handle, lang): + return "This is test text" + + +class FakePyOcr(object): + + @staticmethod + def get_available_tools(): + return [FakeTesseract] + + +class TestOCR(TestCase): + + text_cases = [ + ("simple string", "simple string"), + ( + "simple newline\n testing string", + "simple newline\ntesting string" + ), + ( + "utf-8 строка с пробелами в конце ", + "utf-8 строка с 
пробелами в конце" + ) + ] + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) + + def test_strip_excess_whitespace(self): + for source, result in self.text_cases: + actual_result = strip_excess_whitespace(source) + self.assertEqual( + result, + actual_result, + "strip_exceess_whitespace({}) != '{}', but '{}'".format( + source, + result, + actual_result + ) + ) + + @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SAMPLE_FILES + ) + @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) + def test_image_to_string_with_text_free_page(self): + """ + This test is sort of silly, since it's really just reproducing an odd + exception thrown by pyocr when it encounters a page with no text. + Actually running this test against an installation of Tesseract results + in a segmentation fault rooted somewhere deep inside pyocr where I + don't care to dig. Regardless, if you run the consumer normally, + text-free pages are now handled correctly so long as we work around + this weird exception. 
+ """ + image_to_string(["no-text.png", "en"]) From b5f6c06b8baa552011ee5056125c98f91c3d6b7e Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 11 Mar 2017 16:37:18 +0000 Subject: [PATCH 2/5] fix: a little cleanup --- src/documents/admin.py | 1 + .../templates/admin/documents/document/change_list_results.html | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index aada98fc9..7cba10c74 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -67,6 +67,7 @@ class DocumentAdmin(CommonAdmin): def created_(self, obj): return obj.created.date().strftime("%Y-%m-%d") + created_.short_description = "Created" def thumbnail(self, obj): png_img = self._html_tag( diff --git a/src/documents/templates/admin/documents/document/change_list_results.html b/src/documents/templates/admin/documents/document/change_list_results.html index 0730f92b3..1e418ebfa 100644 --- a/src/documents/templates/admin/documents/document/change_list_results.html +++ b/src/documents/templates/admin/documents/document/change_list_results.html @@ -158,7 +158,7 @@