mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add support for using pre-existing text from PDFs
This commit is contained in:
		| @@ -1,5 +1,9 @@ | |||||||
| language: python | language: python | ||||||
|  |  | ||||||
|  | before_install: | ||||||
|  | - sudo apt-get update -qq | ||||||
|  | - sudo apt-get install -qq libpoppler-cpp-dev | ||||||
|  |  | ||||||
| sudo: false | sudo: false | ||||||
|  |  | ||||||
| matrix: | matrix: | ||||||
|   | |||||||
| @@ -18,9 +18,9 @@ ENV PAPERLESS_EXPORT_DIR=/export \ | |||||||
| # Install dependencies | # Install dependencies | ||||||
| RUN apk --no-cache --update add \ | RUN apk --no-cache --update add \ | ||||||
|         python3 gnupg libmagic bash \ |         python3 gnupg libmagic bash \ | ||||||
|         sudo tesseract-ocr imagemagick ghostscript unpaper && \ |         sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ | ||||||
|     apk --no-cache add --virtual .build-dependencies \ |     apk --no-cache add --virtual .build-dependencies \ | ||||||
|         python3-dev gcc musl-dev zlib-dev jpeg-dev && \ |         python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ | ||||||
| # Install python dependencies | # Install python dependencies | ||||||
|     python3 -m ensurepip && \ |     python3 -m ensurepip && \ | ||||||
|     rm -r /usr/lib/python*/ensurepip && \ |     rm -r /usr/lib/python*/ensurepip && \ | ||||||
|   | |||||||
| @@ -3,7 +3,15 @@ Changelog | |||||||
|  |  | ||||||
| * 1.2.0 | * 1.2.0 | ||||||
|   * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ |   * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ | ||||||
|   and `Pit`_. |     and `Pit`_. | ||||||
|  |   * `BastianPoe`_ has added the long-awaited feature to automatically skip the | ||||||
|  |     OCR step when the PDF already contains text. This can be overridden by | ||||||
|  |     setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or | ||||||
|  |     in the environment.  Note that this also means that Paperless now requires | ||||||
|  |     ``libpoppler-cpp-dev`` to be installed. **You'll need to run | ||||||
|  |     ``pip install -r requirements.txt`` after the usual ``git pull`` to | ||||||
|  |     properly update**. | ||||||
|  |  | ||||||
| * 1.1.0 | * 1.1.0 | ||||||
|   * Fix for `#283`_, a redirect bug which broke interactions with |   * Fix for `#283`_, a redirect bug which broke interactions with | ||||||
|     paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it. |     paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it. | ||||||
| @@ -272,6 +280,7 @@ Changelog | |||||||
| .. _chris-aeviator: https://github.com/chris-aeviator | .. _chris-aeviator: https://github.com/chris-aeviator | ||||||
| .. _Dan Panzarella: https://github.com/pzl | .. _Dan Panzarella: https://github.com/pzl | ||||||
| .. _addadi: https://github.com/addadi | .. _addadi: https://github.com/addadi | ||||||
|  | .. _BastianPoe: https://github.com/BastianPoe | ||||||
|  |  | ||||||
| .. _#20: https://github.com/danielquinn/paperless/issues/20 | .. _#20: https://github.com/danielquinn/paperless/issues/20 | ||||||
| .. _#44: https://github.com/danielquinn/paperless/issues/44 | .. _#44: https://github.com/danielquinn/paperless/issues/44 | ||||||
|   | |||||||
| @@ -11,24 +11,27 @@ should work) that has the following software installed: | |||||||
| * `Tesseract`_, plus its language files matching your document base. | * `Tesseract`_, plus its language files matching your document base. | ||||||
| * `Imagemagick`_ version 6.7.5 or higher | * `Imagemagick`_ version 6.7.5 or higher | ||||||
| * `unpaper`_ | * `unpaper`_ | ||||||
|  | * `libpoppler-cpp-dev`_ PDF rendering library | ||||||
|  |  | ||||||
| .. _Python3: https://python.org/ | .. _Python3: https://python.org/ | ||||||
| .. _GNU Privacy Guard: https://gnupg.org | .. _GNU Privacy Guard: https://gnupg.org | ||||||
| .. _Tesseract: https://github.com/tesseract-ocr | .. _Tesseract: https://github.com/tesseract-ocr | ||||||
| .. _Imagemagick: http://imagemagick.org/ | .. _Imagemagick: http://imagemagick.org/ | ||||||
| .. _unpaper: https://www.flameeyes.eu/projects/unpaper | .. _unpaper: https://www.flameeyes.eu/projects/unpaper | ||||||
|  | .. _libpoppler-cpp-dev: https://poppler.freedesktop.org/ | ||||||
|  |  | ||||||
| Notably, you should confirm how you access your Python3 installation.  Many | Notably, you should confirm how you access your Python3 installation.  Many | ||||||
| Linux distributions will install Python3 in parallel to Python2, using the names | Linux distributions will install Python3 in parallel to Python2, using the | ||||||
| ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and | names ``python3`` and ``python`` respectively.  The same goes for ``pip3`` and | ||||||
| ``pip``.  Running Paperless with Python2 will likely break things, so make sure that  | ``pip``.  Running Paperless with Python2 will likely break things, so make sure | ||||||
| you're using the right version. | that you're using the right version. | ||||||
|  |  | ||||||
| For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to | For the purposes of simplicity, ``python`` and ``pip`` are used everywhere to | ||||||
| refer to their Python3 versions. | refer to their Python3 versions. | ||||||
|  |  | ||||||
| In addition to the above, there are a number of Python requirements, all of | In addition to the above, there are a number of Python requirements, all of | ||||||
| which are listed in a file called ``requirements.txt`` in the project root directory. | which are listed in a file called ``requirements.txt`` in the project root | ||||||
|  | directory. | ||||||
|  |  | ||||||
| If you're not working on a virtual environment (like Vagrant or Docker), you | If you're not working on a virtual environment (like Vagrant or Docker), you | ||||||
| should probably be using a virtualenv, but that's your call.  The reasons why | should probably be using a virtualenv, but that's your call.  The reasons why | ||||||
| @@ -39,12 +42,13 @@ probably figure that out before continuing. | |||||||
|  |  | ||||||
| .. _requirements-apple: | .. _requirements-apple: | ||||||
|  |  | ||||||
| Apple-tastic Complications | Problems with Imagemagick & PDFs | ||||||
| -------------------------- | -------------------------------- | ||||||
|  |  | ||||||
| Some users have `run into problems`_ with installing ImageMagick on Apple | Some users have `run into problems`_ with getting ImageMagick to do its thing | ||||||
| systems using HomeBrew.  The solution appears to be to install ghostscript as | with PDFs.  Often this is the case with Apple systems using HomeBrew, but some | ||||||
| well as ImageMagick: | Linux systems have been a problem as well.  The solution appears to be to install | ||||||
|  | ghostscript as well as ImageMagick: | ||||||
|  |  | ||||||
| .. _run into problems: https://github.com/danielquinn/paperless/issues/25 | .. _run into problems: https://github.com/danielquinn/paperless/issues/25 | ||||||
|  |  | ||||||
|   | |||||||
| @@ -14,6 +14,7 @@ python-dotenv>=0.6.2 | |||||||
| python-gnupg>=0.3.9 | python-gnupg>=0.3.9 | ||||||
| pytz>=2016.10 | pytz>=2016.10 | ||||||
| gunicorn==19.7.1 | gunicorn==19.7.1 | ||||||
|  | pdftotext>=2.0.1 | ||||||
|  |  | ||||||
| # For the tests | # For the tests | ||||||
| factory-boy | factory-boy | ||||||
|   | |||||||
| @@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | |||||||
| # The amount of threads to use for OCR | # The amount of threads to use for OCR | ||||||
| OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") | OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") | ||||||
|  |  | ||||||
|  | # OCR all documents? | ||||||
|  | OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) | ||||||
|  |  | ||||||
| # If this is true, any failed attempts to OCR a PDF will result in the PDF | # If this is true, any failed attempts to OCR a PDF will result in the PDF | ||||||
| # being indexed anyway, with whatever we could get.  If it's False, the file | # being indexed anyway, with whatever we could get.  If it's False, the file | ||||||
| # will simply be left in the CONSUMPTION_DIR. | # will simply be left in the CONSUMPTION_DIR. | ||||||
|   | |||||||
| @@ -3,6 +3,7 @@ import os | |||||||
| import re | import re | ||||||
| import subprocess | import subprocess | ||||||
| from multiprocessing.pool import Pool | from multiprocessing.pool import Pool | ||||||
|  | import pdftotext | ||||||
|  |  | ||||||
| import langdetect | import langdetect | ||||||
| import pyocr | import pyocr | ||||||
| @@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None |     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||||
|     UNPAPER = settings.UNPAPER_BINARY |     UNPAPER = settings.UNPAPER_BINARY | ||||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
|  |     OCR_ALWAYS = settings.OCR_ALWAYS | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |     def get_thumbnail(self): | ||||||
|         """ |         """ | ||||||
| @@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return os.path.join(self.tempdir, "convert-0000.png") |         return os.path.join(self.tempdir, "convert-0000.png") | ||||||
|  |  | ||||||
|  |     def _is_ocred(self): | ||||||
|  |         # Extract text from PDF using pdftotext | ||||||
|  |         text = get_text_from_pdf(self.document_path) | ||||||
|  |  | ||||||
|  |         # We assume, that a PDF with at least 50 characters contains text | ||||||
|  |         # (so no OCR required) | ||||||
|  |         if len(text) > 50: | ||||||
|  |             return True | ||||||
|  |  | ||||||
|  |         return False | ||||||
|  |  | ||||||
|     def get_text(self): |     def get_text(self): | ||||||
|  |         if not self.OCR_ALWAYS and self._is_ocred(): | ||||||
|  |             self.log("info", "Skipping OCR, using Text from PDF") | ||||||
|  |             return get_text_from_pdf(self.document_path) | ||||||
|  |  | ||||||
|         images = self._get_greyscale() |         images = self._get_greyscale() | ||||||
|  |  | ||||||
| @@ -212,3 +228,13 @@ def image_to_string(args): | |||||||
|             except (TesseractError, OtherTesseractError): |             except (TesseractError, OtherTesseractError): | ||||||
|                 pass |                 pass | ||||||
|         return ocr.image_to_string(f, lang=lang) |         return ocr.image_to_string(f, lang=lang) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_text_from_pdf(pdf_file): | ||||||
|  |     with open(pdf_file, "rb") as f: | ||||||
|  |         try: | ||||||
|  |             pdf = pdftotext.PDF(f) | ||||||
|  |         except pdftotext.Error: | ||||||
|  |             return False | ||||||
|  |  | ||||||
|  |     return "\n".join(pdf) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn