mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Add support for using pre-existing text from PDFs
This commit is contained in:
parent
bd0b593c4a
commit
87e466c47c
@ -1,5 +1,9 @@
|
|||||||
language: python
|
language: python
|
||||||
|
|
||||||
|
before_install:
|
||||||
|
- sudo apt-get update -qq
|
||||||
|
- sudo apt-get install -qq libpoppler-cpp-dev
|
||||||
|
|
||||||
sudo: false
|
sudo: false
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
|
@ -18,9 +18,9 @@ ENV PAPERLESS_EXPORT_DIR=/export \
|
|||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN apk --no-cache --update add \
|
RUN apk --no-cache --update add \
|
||||||
python3 gnupg libmagic bash \
|
python3 gnupg libmagic bash \
|
||||||
sudo tesseract-ocr imagemagick ghostscript unpaper && \
|
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
|
||||||
apk --no-cache add --virtual .build-dependencies \
|
apk --no-cache add --virtual .build-dependencies \
|
||||||
python3-dev gcc musl-dev zlib-dev jpeg-dev && \
|
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
|
||||||
# Install python dependencies
|
# Install python dependencies
|
||||||
python3 -m ensurepip && \
|
python3 -m ensurepip && \
|
||||||
rm -r /usr/lib/python*/ensurepip && \
|
rm -r /usr/lib/python*/ensurepip && \
|
||||||
|
@ -3,7 +3,15 @@ Changelog
|
|||||||
|
|
||||||
* 1.2.0
|
* 1.2.0
|
||||||
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
|
* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
|
||||||
and `Pit`_.
|
and `Pit`_.
|
||||||
|
* `BastianPoe`_ has added the long-awaited feature to automatically skip the
|
||||||
|
OCR step when the PDF already contains text. This can be overridden by
|
||||||
|
setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
|
||||||
|
in the environment. Note that this also means that Paperless now requires
|
||||||
|
``libpoppler-cpp-dev`` to be installed. **You'll need to run
|
||||||
|
``pip install -r requirements.txt`` after the usual ``git pull`` to
|
||||||
|
properly update**.
|
||||||
|
|
||||||
* 1.1.0
|
* 1.1.0
|
||||||
* Fix for `#283`_, a redirect bug which broke interactions with
|
* Fix for `#283`_, a redirect bug which broke interactions with
|
||||||
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
|
||||||
@ -272,6 +280,7 @@ Changelog
|
|||||||
.. _chris-aeviator: https://github.com/chris-aeviator
|
.. _chris-aeviator: https://github.com/chris-aeviator
|
||||||
.. _Dan Panzarella: https://github.com/pzl
|
.. _Dan Panzarella: https://github.com/pzl
|
||||||
.. _addadi: https://github.com/addadi
|
.. _addadi: https://github.com/addadi
|
||||||
|
.. _BastianPoe: https://github.com/BastianPoe
|
||||||
|
|
||||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||||
|
@ -11,24 +11,27 @@ should work) that has the following software installed:
|
|||||||
* `Tesseract`_, plus its language files matching your document base.
|
* `Tesseract`_, plus its language files matching your document base.
|
||||||
* `Imagemagick`_ version 6.7.5 or higher
|
* `Imagemagick`_ version 6.7.5 or higher
|
||||||
* `unpaper`_
|
* `unpaper`_
|
||||||
|
* `libpoppler-cpp-dev`_ PDF rendering library
|
||||||
|
|
||||||
.. _Python3: https://python.org/
|
.. _Python3: https://python.org/
|
||||||
.. _GNU Privacy Guard: https://gnupg.org
|
.. _GNU Privacy Guard: https://gnupg.org
|
||||||
.. _Tesseract: https://github.com/tesseract-ocr
|
.. _Tesseract: https://github.com/tesseract-ocr
|
||||||
.. _Imagemagick: http://imagemagick.org/
|
.. _Imagemagick: http://imagemagick.org/
|
||||||
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||||
|
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/
|
||||||
|
|
||||||
Notably, you should confirm how you access your Python3 installation. Many
|
Notably, you should confirm how you access your Python3 installation. Many
|
||||||
Linux distributions will install Python3 in parallel to Python2, using the names
|
Linux distributions will install Python3 in parallel to Python2, using the
|
||||||
``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
|
||||||
``pip``. Running Paperless with Python2 will likely break things, so make sure that
|
``pip``. Running Paperless with Python2 will likely break things, so make sure
|
||||||
you're using the right version.
|
that you're using the right version.
|
||||||
|
|
||||||
For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
|
For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to
|
||||||
refer to their Python3 versions.
|
refer to their Python3 versions.
|
||||||
|
|
||||||
In addition to the above, there are a number of Python requirements, all of
|
In addition to the above, there are a number of Python requirements, all of
|
||||||
which are listed in a file called ``requirements.txt`` in the project root directory.
|
which are listed in a file called ``requirements.txt`` in the project root
|
||||||
|
directory.
|
||||||
|
|
||||||
If you're not working on a virtual environment (like Vagrant or Docker), you
|
If you're not working on a virtual environment (like Vagrant or Docker), you
|
||||||
should probably be using a virtualenv, but that's your call. The reasons why
|
should probably be using a virtualenv, but that's your call. The reasons why
|
||||||
@ -39,12 +42,13 @@ probably figure that out before continuing.
|
|||||||
|
|
||||||
.. _requirements-apple:
|
.. _requirements-apple:
|
||||||
|
|
||||||
Apple-tastic Complications
|
Problems with Imagemagick & PDFs
|
||||||
--------------------------
|
--------------------------------
|
||||||
|
|
||||||
Some users have `run into problems`_ with installing ImageMagick on Apple
|
Some users have `run into problems`_ with getting ImageMagick to do its thing
|
||||||
systems using HomeBrew. The solution appears to be to install ghostscript as
|
with PDFs. Often this is the case with Apple systems using HomeBrew, but other
|
||||||
well as ImageMagick:
|
Linuxes have been a problem as well. The solution appears to be to install
|
||||||
|
ghostscript as well as ImageMagick:
|
||||||
|
|
||||||
.. _run into problems: https://github.com/danielquinn/paperless/issues/25
|
.. _run into problems: https://github.com/danielquinn/paperless/issues/25
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@ python-gnupg>=0.3.9
|
|||||||
pytz>=2016.10
|
pytz>=2016.10
|
||||||
dateparser>=0.6.0
|
dateparser>=0.6.0
|
||||||
gunicorn==19.7.1
|
gunicorn==19.7.1
|
||||||
|
pdftotext>=2.0.1
|
||||||
|
|
||||||
# For the tests
|
# For the tests
|
||||||
factory-boy
|
factory-boy
|
||||||
|
@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
|||||||
# The amount of threads to use for OCR
|
# The amount of threads to use for OCR
|
||||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
||||||
|
|
||||||
|
# OCR all documents?
|
||||||
|
# Truthy spellings (case-insensitive) that force OCR on every document;
# any other value, or leaving the variable unset, keeps this False.
OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes", "y", "1", "t", "true"
)
|
||||||
|
|
||||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
||||||
# being indexed anyway, with whatever we could get. If it's False, the file
|
# being indexed anyway, with whatever we could get. If it's False, the file
|
||||||
# will simply be left in the CONSUMPTION_DIR.
|
# will simply be left in the CONSUMPTION_DIR.
|
||||||
|
@ -4,6 +4,7 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
import dateparser
|
import dateparser
|
||||||
|
import pdftotext
|
||||||
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import pyocr
|
import pyocr
|
||||||
@ -33,6 +34,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
UNPAPER = settings.UNPAPER_BINARY
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
DATE_ORDER = settings.DATE_ORDER
|
DATE_ORDER = settings.DATE_ORDER
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
"""
|
"""
|
||||||
@ -48,7 +50,21 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return os.path.join(self.tempdir, "convert-0000.png")
|
return os.path.join(self.tempdir, "convert-0000.png")
|
||||||
|
|
||||||
|
def _is_ocred(self):
    """
    Report whether the PDF already carries usable embedded text.

    The text is extracted with ``get_text_from_pdf``. A document with more
    than 50 characters of text is assumed to contain a real text layer, so
    no OCR is required.

    :return: True if more than 50 characters could be extracted, else False
    """
    text = get_text_from_pdf(self.document_path)

    # The extractor may hand back a falsy value (e.g. when the PDF cannot
    # be parsed). Calling len() on a non-string would raise TypeError, so
    # treat that case as "no text" and let the caller fall back to OCR.
    if not text:
        return False

    return len(text) > 50
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
|
if not self.OCR_ALWAYS and self._is_ocred():
|
||||||
|
self.log("info", "Skipping OCR, using Text from PDF")
|
||||||
|
return get_text_from_pdf(self.document_path)
|
||||||
|
|
||||||
images = self._get_greyscale()
|
images = self._get_greyscale()
|
||||||
|
|
||||||
@ -237,3 +253,13 @@ def image_to_string(args):
|
|||||||
except (TesseractError, OtherTesseractError):
|
except (TesseractError, OtherTesseractError):
|
||||||
pass
|
pass
|
||||||
return ocr.image_to_string(f, lang=lang)
|
return ocr.image_to_string(f, lang=lang)
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_from_pdf(pdf_file):
    """
    Extract the embedded text of a PDF using pdftotext.

    :param pdf_file: path of the PDF file to read
    :return: the text of all pages joined by newlines, or an empty string
             if pdftotext cannot parse the file
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Return an empty (still falsy) string rather than False so
            # callers can always treat the result as text, e.g. call len()
            return ""

        return "\n".join(pdf)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user