diff --git a/.travis.yml b/.travis.yml index 4a136be91..41abf71ee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,9 @@ language: python +before_install: +- sudo apt-get update -qq +- sudo apt-get install -qq libpoppler-cpp-dev + sudo: false matrix: diff --git a/Dockerfile b/Dockerfile index 11a83e2b1..9c2f9c5f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,9 +18,9 @@ ENV PAPERLESS_EXPORT_DIR=/export \ # Install dependencies RUN apk --no-cache --update add \ python3 gnupg libmagic bash \ - sudo tesseract-ocr imagemagick ghostscript unpaper && \ + sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ apk --no-cache add --virtual .build-dependencies \ - python3-dev gcc musl-dev zlib-dev jpeg-dev && \ + python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ # Install python dependencies python3 -m ensurepip && \ rm -r /usr/lib/python*/ensurepip && \ diff --git a/docs/changelog.rst b/docs/changelog.rst index 6f3cc8567..bd8b751a1 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,7 +3,15 @@ Changelog * 1.2.0 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ - and `Pit`_. + and `Pit`_. + * `BastianPoe`_ has added the long-awaited feature to automatically skip the + OCR step when the PDF already contains text. This can be overridden by + setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or + in the environment. Note that this also means that Paperless now requires + ``libpoppler-cpp-dev`` to be installed. **You'll need to run** + ``pip install -r requirements.txt`` **after the usual** ``git pull`` **to + properly update.** + * 1.1.0 * Fix for `#283`_, a redirect bug which broke interactions with paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. @@ -272,6 +280,7 @@ Changelog .. _chris-aeviator: https://github.com/chris-aeviator .. _Dan Panzarella: https://github.com/pzl .. _addadi: https://github.com/addadi +.. _BastianPoe: https://github.com/BastianPoe .. 
_#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 diff --git a/docs/requirements.rst b/docs/requirements.rst index 1f476c9dd..ee42cb96a 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -11,24 +11,27 @@ should work) that has the following software installed: * `Tesseract`_, plus its language files matching your document base. * `Imagemagick`_ version 6.7.5 or higher * `unpaper`_ +* `libpoppler-cpp-dev`_ PDF rendering library .. _Python3: https://python.org/ .. _GNU Privacy Guard: https://gnupg.org .. _Tesseract: https://github.com/tesseract-ocr .. _Imagemagick: http://imagemagick.org/ .. _unpaper: https://www.flameeyes.eu/projects/unpaper +.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/ Notably, you should confirm how you access your Python3 installation. Many -Linux distributions will install Python3 in parallel to Python2, using the names -``python3`` and ``python`` respectively. The same goes for ``pip3`` and -``pip``. Running Paperless with Python2 will likely break things, so make sure that -you're using the right version. +Linux distributions will install Python3 in parallel to Python2, using the +names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and +``pip``. Running Paperless with Python2 will likely break things, so make sure +that you're using the right version. For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to refer to their Python3 versions. In addition to the above, there are a number of Python requirements, all of -which are listed in a file called ``requirements.txt`` in the project root directory. +which are listed in a file called ``requirements.txt`` in the project root +directory. If you're not working on a virtual environment (like Vagrant or Docker), you should probably be using a virtualenv, but that's your call. The reasons why @@ -39,12 +42,13 @@ probably figure that out before continuing. .. 
_requirements-apple:
 
-Apple-tastic Complications
---------------------------
+Problems with Imagemagick & PDFs
+--------------------------------
 
-Some users have `run into problems`_ with installing ImageMagick on Apple
-systems using HomeBrew. The solution appears to be to install ghostscript as
-well as ImageMagick:
+Some users have `run into problems`_ with getting ImageMagick to do its thing
+with PDFs. Often this is the case with Apple systems using HomeBrew, but other
+Linuxes have been a problem as well. The solution appears to be to install
+ghostscript as well as ImageMagick:
 
 .. _run into problems: https://github.com/danielquinn/paperless/issues/25
 
diff --git a/requirements.txt b/requirements.txt
index fa857c677..ce172e92a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,6 +15,7 @@ python-gnupg>=0.3.9
 pytz>=2016.10
 dateparser>=0.6.0
 gunicorn==19.7.1
+pdftotext>=2.0.1
 
 # For the tests
 factory-boy
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 3dad91e86..14b2aeb63 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -210,6 +210,11 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 
+# Run OCR on every document, even when the PDF already contains text?
+OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
+    "yes", "y", "1", "t", "true"
+)
+
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get. If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index abadbd3a5..7a36fcdd6 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -4,6 +4,7 @@ import re
 import subprocess
 from multiprocessing.pool import Pool
 import dateparser
+import pdftotext
 
 import langdetect
 import pyocr
@@ -33,6 +34,7 @@ class RasterisedDocumentParser(DocumentParser):
     UNPAPER = settings.UNPAPER_BINARY
     DATE_ORDER = settings.DATE_ORDER
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
+    OCR_ALWAYS = settings.OCR_ALWAYS
 
     def get_thumbnail(self):
         """
@@ -48,7 +50,24 @@ class RasterisedDocumentParser(DocumentParser):
 
         return os.path.join(self.tempdir, "convert-0000.png")
 
+    def _is_ocred(self):
+        """
+        Return True if the document already has more than 50 characters of
+        extractable text, in which case OCR can be skipped.
+        """
+
+        # Extract text from PDF using pdftotext.  If the file can't be parsed
+        # the text is empty, so we fall through to OCR.
+        text = get_text_from_pdf(self.document_path)
+
+        # We assume that a PDF with more than 50 characters contains text
+        # (so no OCR is required)
+        return len(text) > 50
+
     def get_text(self):
+        if not self.OCR_ALWAYS and self._is_ocred():
+            self.log("info", "Skipping OCR, using Text from PDF")
+            return get_text_from_pdf(self.document_path)
 
         images = self._get_greyscale()
 
@@ -237,3 +256,21 @@ def image_to_string(args):
             except (TesseractError, OtherTesseractError):
                 pass
         return ocr.image_to_string(f, lang=lang)
+
+
+def get_text_from_pdf(pdf_file):
+    """
+    Return the text pdftotext extracts from ``pdf_file``, pages joined by
+    newlines.
+
+    An empty string is returned if pdftotext can't read the file, so callers
+    can always treat the result as a string (e.g. call ``len()`` on it).
+    """
+
+    with open(pdf_file, "rb") as f:
+        try:
+            pdf = pdftotext.PDF(f)
+        except pdftotext.Error:
+            return ""
+
+    return "\n".join(pdf)