diff --git a/.travis.yml b/.travis.yml index dcaaeab8d..7b06c1734 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,9 @@ matrix: env: TOXENV=py34 - python: 3.5 env: TOXENV=py35 - - python: 3.5 + - python: 3.6 + env: TOXENV=py36 + - python: 3.6 env: TOXENV=pep8 install: diff --git a/docs/changelog.rst b/docs/changelog.rst index a109cceb0..766d6424c 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,6 +4,14 @@ Changelog * 0.3.6 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the correspondent or the tags for a document. + * The ``content`` field is now optional, to allow for the edge case of a + purely graphical document. + * You can no longer add documents via the admin. This never worked in the + first place, so all I've done here is remove the link to the broken form. + * The consumer code has been heavily refactored to support a pluggable + interface. Install a paperless consumer via pip and tell paperless about + it with an environment variable, and you're good to go. Proper + documentation is on its way. 
* 0.3.5 * A serious facelift for the documents listing page wherein we drop the diff --git a/src/documents/admin.py b/src/documents/admin.py index aada98fc9..7cba10c74 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -67,6 +67,7 @@ class DocumentAdmin(CommonAdmin): def created_(self, obj): return obj.created.date().strftime("%Y-%m-%d") + created_.short_description = "Created" def thumbnail(self, obj): png_img = self._html_tag( diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 02397c118..65e74f3a8 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,35 +1,21 @@ +import datetime +import hashlib +import logging import os import re import uuid -import shutil -import hashlib -import logging -import datetime -import tempfile -import itertools -import subprocess -from multiprocessing.pool import Pool -import pyocr -import langdetect -from PIL import Image from django.conf import settings from django.utils import timezone from paperless.db import GnuPG -from pyocr.tesseract import TesseractError -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from .models import Tag, Document, FileInfo +from .models import Document, FileInfo, Tag +from .parsers import ParseError from .signals import ( - document_consumption_started, - document_consumption_finished + document_consumer_declaration, + document_consumption_finished, + document_consumption_started ) -from .languages import ISO639 - - -class OCRError(Exception): - pass class ConsumerError(Exception): @@ -47,13 +33,7 @@ class Consumer(object): """ SCRATCH = settings.SCRATCH_DIR - CONVERT = settings.CONVERT_BINARY - UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR - THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None - DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 - - DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE def __init__(self): @@ -78,6 +58,16 @@ class 
Consumer(object): raise ConsumerError( "Consumption directory {} does not exist".format(self.CONSUME)) + self.parsers = [] + for response in document_consumer_declaration.send(self): + self.parsers.append(response[1]) + + if not self.parsers: + raise ConsumerError( + "No parsers could be found, not even the default. " + "This is a problem." + ) + def log(self, level, message): getattr(self.logger, level)(message, extra={ "group": self.logging_group @@ -109,6 +99,13 @@ class Consumer(object): self._ignore.append(doc) continue + parser_class = self._get_parser_class(doc) + if not parser_class: + self.log( + "info", "No parsers could be found for {}".format(doc)) + self._ignore.append(doc) + continue + self.logging_group = uuid.uuid4() self.log("info", "Consuming {}".format(doc)) @@ -119,25 +116,26 @@ class Consumer(object): logging_group=self.logging_group ) - tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) - imgs = self._get_greyscale(tempdir, doc) - thumbnail = self._get_thumbnail(tempdir, doc) + parsed_document = parser_class(doc) + thumbnail = parsed_document.get_thumbnail() try: - - document = self._store(self._get_ocr(imgs), doc, thumbnail) - - except OCRError as e: + document = self._store( + parsed_document.get_text(), + doc, + thumbnail + ) + except ParseError as e: self._ignore.append(doc) - self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) - self._cleanup_tempdir(tempdir) + self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) + parsed_document.cleanup() continue else: - self._cleanup_tempdir(tempdir) + parsed_document.cleanup() self._cleanup_doc(doc) self.log( @@ -151,142 +149,20 @@ class Consumer(object): logging_group=self.logging_group ) - def _get_greyscale(self, tempdir, doc): + def _get_parser_class(self, doc): """ - Greyscale images are easier for Tesseract to OCR + Determine the appropriate parser class based on the file """ - self.log("info", "Generating greyscale image from {}".format(doc)) + options = [] + for 
parser in self.parsers: + result = parser(doc) + if result: + options.append(result) - # Convert PDF to multiple PNMs - pnm = os.path.join(tempdir, "convert-%04d.pnm") - run_convert( - self.CONVERT, - "-density", str(self.DENSITY), - "-depth", "8", - "-type", "grayscale", - doc, pnm, - ) - - # Get a list of converted images - pnms = [] - for f in os.listdir(tempdir): - if f.endswith(".pnm"): - pnms.append(os.path.join(tempdir, f)) - - # Run unpaper in parallel on converted images - with Pool(processes=self.THREADS) as pool: - pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) - - # Return list of converted images, processed with unpaper - pnms = [] - for f in os.listdir(tempdir): - if f.endswith(".unpaper.pnm"): - pnms.append(os.path.join(tempdir, f)) - - return sorted(filter(lambda __: os.path.isfile(__), pnms)) - - def _get_thumbnail(self, tempdir, doc): - """ - The thumbnail of a PDF is just a 500px wide image of the first page. - """ - - self.log("info", "Generating the thumbnail") - - run_convert( - self.CONVERT, - "-scale", "500x5000", - "-alpha", "remove", - doc, os.path.join(tempdir, "convert-%04d.png") - ) - - return os.path.join(tempdir, "convert-0000.png") - - def _guess_language(self, text): - try: - guess = langdetect.detect(text) - self.log("debug", "Language detected: {}".format(guess)) - return guess - except Exception as e: - self.log("warning", "Language detection error: {}".format(e)) - - def _get_ocr(self, imgs): - """ - Attempts to do the best job possible OCR'ing the document based on - simple language detection trial & error. - """ - - if not imgs: - raise OCRError("No images found") - - self.log("info", "OCRing the document") - - # Since the division gets rounded down by int, this calculation works - # for every edge-case, i.e. 
1 - middle = int(len(imgs) / 2) - raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) - - guessed_language = self._guess_language(raw_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed!") - if settings.FORGIVING_OCR: - self.log( - "warning", - "As FORGIVING_OCR is enabled, we're going to make the " - "best with what we have." - ) - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - raise OCRError("Language detection failed") - - if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - - try: - return self._ocr(imgs, ISO639[guessed_language]) - except pyocr.pyocr.tesseract.TesseractError: - if settings.FORGIVING_OCR: - self.log( - "warning", - "OCR for {} failed, but we're going to stick with what " - "we've got since FORGIVING_OCR is enabled.".format( - guessed_language - ) - ) - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - raise OCRError( - "The guessed language is not available in this instance of " - "Tesseract." - ) - - def _assemble_ocr_sections(self, imgs, middle, text): - """ - Given a `middle` value and the text that middle page represents, we OCR - the remainder of the document and return the whole thing. - """ - text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text - text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) - return text - - def _ocr(self, imgs, lang): - """ - Performs a single OCR attempt. - """ - - if not imgs: - return "" - - self.log("info", "Parsing for {}".format(lang)) - - with Pool(processes=self.THREADS) as pool: - r = pool.map(image_to_string, itertools.product(imgs, [lang])) - r = " ".join(r) - - # Strip out excess white space to allow matching to go smoother - return strip_excess_whitespace(r) + # Return the parser with the highest weight. 
+ return sorted( + options, key=lambda _: _["weight"], reverse=True)[0]["parser"] def _store(self, text, doc, thumbnail): @@ -332,10 +208,6 @@ class Consumer(object): return document - def _cleanup_tempdir(self, d): - self.log("debug", "Deleting directory {}".format(d)) - shutil.rmtree(d) - def _cleanup_doc(self, doc): self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) @@ -361,41 +233,3 @@ class Consumer(object): with open(doc, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() return Document.objects.filter(checksum=checksum).exists() - - -def strip_excess_whitespace(text): - collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) - no_leading_whitespace = re.sub( - "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) - no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) - return no_trailing_whitespace - - -def image_to_string(args): - img, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except (TesseractError, OtherTesseractError): - pass - return ocr.image_to_string(f, lang=lang) - - -def run_unpaper(args): - unpaper, pnm = args - subprocess.Popen( - (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() - - -def run_convert(*args): - - environment = os.environ.copy() - if settings.CONVERT_MEMORY_LIMIT: - environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT - if settings.CONVERT_TMPDIR: - environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR - - subprocess.Popen(args, env=environment).wait() diff --git a/src/documents/models.py b/src/documents/models.py index 56c330f75..0ee896ec3 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -158,13 +158,22 @@ class Document(models.Model): correspondent = models.ForeignKey( Correspondent, blank=True, null=True, related_name="documents") + 
class ParseError(Exception):
    """Raised by a parser when a document cannot be processed."""


class DocumentParser(object):
    """
    Base class for document parsers.  Subclass it and implement
    `get_thumbnail()` and `get_text()`; `paperless_tesseract.parsers` is a
    good place to look for inspiration.

    Each instance creates its own scratch directory under `SCRATCH` when it
    is constructed.  Call `cleanup()` to remove that directory again.
    """

    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(dir=self.SCRATCH, prefix="paperless")
        self.logging_group = None
        self.logger = logging.getLogger(__name__)

    def get_thumbnail(self):
        """
        Return the path to a file usable as a thumbnail for this document.
        Must be provided by the subclass.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Return the text of the document, and nothing but the text.
        Must be provided by the subclass.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """
        Emit `message` through the logger method named by `level`, tagging
        the record with the current logging group.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})

    def cleanup(self):
        """
        Log the removal and then delete this parser's scratch directory.
        """
        scratch_dir = self.tempdir
        self.log("debug", "Deleting directory {}".format(scratch_dir))
        shutil.rmtree(scratch_dir)