From 37af5992c73b3d59d892568cd2e69c4db937588a Mon Sep 17 00:00:00 2001 From: Guy Addadi Date: Sat, 9 Dec 2017 23:08:56 +0200 Subject: [PATCH 01/16] adapted Dockerfile for alpine image --- Dockerfile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index f661012b5..20206a5ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,17 @@ -FROM python:3.5 -MAINTAINER Pit Kleyersburg +FROM alpine:latest # Install dependencies -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - sudo \ - tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \ - && rm -rf /var/lib/apt/lists/* +RUN apk --no-cache --update add \ + python3 python3-dev gcc musl-dev gnupg zlib-dev jpeg-dev libmagic \ + sudo tesseract-ocr imagemagick ghostscript unpaper -# Install python dependencies -RUN mkdir -p /usr/src/paperless +## Install python dependencies +RUN python3 -m ensurepip && \ + rm -r /usr/lib/python*/ensurepip && \ + mkdir -p /usr/src/paperless WORKDIR /usr/src/paperless COPY requirements.txt /usr/src/paperless/ -RUN pip install --no-cache-dir -r requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt # Copy application RUN mkdir -p /usr/src/paperless/src @@ -31,8 +30,8 @@ WORKDIR /usr/src/paperless/src RUN ./manage.py migrate # Create user -RUN groupadd -g 1000 paperless \ - && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ +RUN addgroup -g 1000 paperless \ + && adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless \ && chown -Rh paperless:paperless /usr/src/paperless # Set export directory From 7d81de4edf7bf540457ea4f3e984c5b3a9b9151b Mon Sep 17 00:00:00 2001 From: Guy Addadi Date: Mon, 11 Dec 2017 00:41:36 +0200 Subject: [PATCH 02/16] added bash and moved all dev packages to be with virtual alpine env that is removed after python libraries installation --- Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile 
b/Dockerfile index 20206a5ba..7b0464a21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,10 @@ FROM alpine:latest # Install dependencies RUN apk --no-cache --update add \ - python3 python3-dev gcc musl-dev gnupg zlib-dev jpeg-dev libmagic \ - sudo tesseract-ocr imagemagick ghostscript unpaper + python3 gnupg libmagic bash \ + sudo tesseract-ocr imagemagick ghostscript unpaper && \ + apk --no-cache add --virtual .build-dependencies \ + python3-dev gcc musl-dev zlib-dev jpeg-dev ## Install python dependencies RUN python3 -m ensurepip && \ @@ -46,4 +48,7 @@ RUN chmod 755 /sbin/docker-entrypoint.sh VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] ENTRYPOINT ["/sbin/docker-entrypoint.sh"] + +# Remove build dependencies +RUN apk del .build-dependencies CMD ["--help"] From e1cf2117f5dded62492b72b0d0a6a2284c878966 Mon Sep 17 00:00:00 2001 From: Guy Addadi Date: Mon, 11 Dec 2017 22:03:51 +0200 Subject: [PATCH 03/16] moved to alpine:3.7 removed RUN layers to save image space, removed redundant mkdir commands --- Dockerfile | 67 ++++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7b0464a21..befa90c26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,54 +1,41 @@ -FROM alpine:latest - +FROM alpine:3.7 +WORKDIR /usr/src/paperless +COPY requirements.txt /usr/src/paperless/ +# Copy application +COPY src/ /usr/src/paperless/src/ +COPY data/ /usr/src/paperless/data/ +COPY media/ /usr/src/paperless/media/ +# Set export directory +ENV PAPERLESS_EXPORT_DIR /export +# Set consumption directory +ENV PAPERLESS_CONSUMPTION_DIR /consume +COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh # Install dependencies RUN apk --no-cache --update add \ python3 gnupg libmagic bash \ sudo tesseract-ocr imagemagick ghostscript unpaper && \ apk --no-cache add --virtual .build-dependencies \ - python3-dev gcc musl-dev zlib-dev jpeg-dev - + python3-dev gcc musl-dev 
zlib-dev jpeg-dev && \ ## Install python dependencies -RUN python3 -m ensurepip && \ + python3 -m ensurepip && \ rm -r /usr/lib/python*/ensurepip && \ - mkdir -p /usr/src/paperless -WORKDIR /usr/src/paperless -COPY requirements.txt /usr/src/paperless/ -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application -RUN mkdir -p /usr/src/paperless/src -RUN mkdir -p /usr/src/paperless/data -RUN mkdir -p /usr/src/paperless/media -COPY src/ /usr/src/paperless/src/ -COPY data/ /usr/src/paperless/data/ -COPY media/ /usr/src/paperless/media/ - -# Set consumption directory -ENV PAPERLESS_CONSUMPTION_DIR /consume -RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR - + mkdir -p /usr/src/paperless && \ + pip3 install --no-cache-dir -r requirements.txt && \ +# Remove build dependencies + apk del .build-dependencies && \ +# Create the consumption directory + mkdir -p $PAPERLESS_CONSUMPTION_DIR && \ # Migrate database -WORKDIR /usr/src/paperless/src -RUN ./manage.py migrate - + ./src/manage.py migrate && \ # Create user -RUN addgroup -g 1000 paperless \ - && adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless \ - && chown -Rh paperless:paperless /usr/src/paperless - -# Set export directory -ENV PAPERLESS_EXPORT_DIR /export -RUN mkdir -p $PAPERLESS_EXPORT_DIR - + addgroup -g 1000 paperless && \ + adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \ + chown -Rh paperless:paperless /usr/src/paperless && \ + mkdir -p $PAPERLESS_EXPORT_DIR && \ # Setup entrypoint -COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh -RUN chmod 755 /sbin/docker-entrypoint.sh - + chmod 755 /sbin/docker-entrypoint.sh +WORKDIR /usr/src/paperless/src # Mount volumes VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] - ENTRYPOINT ["/sbin/docker-entrypoint.sh"] - -# Remove build dependencies -RUN apk del .build-dependencies CMD ["--help"] From 76293084a43ba9c9389f3498513bafffafe374fb Mon Sep 17 00:00:00 2001 From: Guy Addadi Date: Tue, 
12 Dec 2017 23:12:34 +0200 Subject: [PATCH 04/16] removed ENV WORKDIR layers, reorg the commands in groups with comments and blank lines when possible. Removed redundant mkdir command --- Dockerfile | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index befa90c26..d2144d8c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,25 +1,26 @@ FROM alpine:3.7 -WORKDIR /usr/src/paperless -COPY requirements.txt /usr/src/paperless/ + # Copy application +COPY requirements.txt /usr/src/paperless/ COPY src/ /usr/src/paperless/src/ COPY data/ /usr/src/paperless/data/ COPY media/ /usr/src/paperless/media/ -# Set export directory -ENV PAPERLESS_EXPORT_DIR /export -# Set consumption directory -ENV PAPERLESS_CONSUMPTION_DIR /consume COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh + +# Set export and consumption directories +ENV PAPERLESS_EXPORT_DIR=/export \ + PAPERLESS_CONSUMPTION_DIR=/consume + # Install dependencies RUN apk --no-cache --update add \ python3 gnupg libmagic bash \ sudo tesseract-ocr imagemagick ghostscript unpaper && \ apk --no-cache add --virtual .build-dependencies \ python3-dev gcc musl-dev zlib-dev jpeg-dev && \ -## Install python dependencies +# Install python dependencies python3 -m ensurepip && \ rm -r /usr/lib/python*/ensurepip && \ - mkdir -p /usr/src/paperless && \ + cd /usr/src/paperless && \ pip3 install --no-cache-dir -r requirements.txt && \ # Remove build dependencies apk del .build-dependencies && \ @@ -34,8 +35,9 @@ RUN apk --no-cache --update add \ mkdir -p $PAPERLESS_EXPORT_DIR && \ # Setup entrypoint chmod 755 /sbin/docker-entrypoint.sh + WORKDIR /usr/src/paperless/src -# Mount volumes +# Mount volumes and set Entrypoint VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] ENTRYPOINT ["/sbin/docker-entrypoint.sh"] CMD ["--help"] From 68cdeb7b3d26e18d3a17fd7db24cc779f8b9c239 Mon Sep 17 00:00:00 2001 From: Guy Date: Tue, 19 Dec 2017 22:34:22 
+0200 Subject: [PATCH 05/16] changed docker-compose.yml example to build the docker image instead of pulling the previously used debian based image from docker hub --- docker-compose.yml.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml.example b/docker-compose.yml.example index 2b7ee8b36..6920836ba 100644 --- a/docker-compose.yml.example +++ b/docker-compose.yml.example @@ -2,7 +2,7 @@ version: '2' services: webserver: - image: pitkley/paperless + build: ./ ports: # You can adapt the port you want Paperless to listen on by # modifying the part before the `:`. @@ -20,7 +20,7 @@ services: command: ["runserver", "--insecure", "0.0.0.0:8000"] consumer: - image: pitkley/paperless + build: ./ volumes: - data:/usr/src/paperless/data - media:/usr/src/paperless/media From 7e49d047b0b16bdb2a3a5e5efbe8ec3651783525 Mon Sep 17 00:00:00 2001 From: Guy Date: Wed, 20 Dec 2017 16:17:58 +0200 Subject: [PATCH 06/16] adapted docker-entrypoint script for alpine docker image (mainly how to install additional OCR languages) --- scripts/docker-entrypoint.sh | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh index 38b8ac1da..f9fde70c8 100644 --- a/scripts/docker-entrypoint.sh +++ b/scripts/docker-entrypoint.sh @@ -9,7 +9,7 @@ map_uidgid() { USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" - groupmod -g "${USERMAP_GID}" paperless + addgroup -g "${USERMAP_GID}" paperless sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd fi } @@ -56,25 +56,24 @@ install_languages() { return fi - # Update apt-lists - apt-get update - # Loop over languages to be installed for lang in "${langs[@]}"; do - pkg="tesseract-ocr-$lang" - if dpkg -s 
"$pkg" > /dev/null 2>&1; then + pkg="tesseract-ocr-data-$lang" + + # English is installed by default + if [ "$lang" == "eng" ]; then + continue + fi + + if apk info -e "$pkg" > /dev/null 2>&1; then + continue + fi + if ! apk info "$pkg" > /dev/null 2>&1; then continue fi - if ! apt-cache show "$pkg" > /dev/null 2>&1; then - continue - fi - - apt-get install "$pkg" + apk --no-cache --update add "$pkg" done - - # Remove apt lists - rm -rf /var/lib/apt/lists/* } From cbbc4d37d0e047ff791ad96c8505858176b07cec Mon Sep 17 00:00:00 2001 From: Guy Date: Mon, 29 Jan 2018 23:19:06 +0200 Subject: [PATCH 07/16] Updated Dockerfile with maintainer and contributors Updated setup.rst with information on upgrade path if coming from an earlier version of docker-compose images --- Dockerfile | 4 ++++ docs/setup.rst | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index d2144d8c9..11a83e2b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,9 @@ FROM alpine:3.7 +LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \ + contributors="Guy Addadi , Pit Kleyersburg , \ + Sven Fischer " + # Copy application COPY requirements.txt /usr/src/paperless/ COPY src/ /usr/src/paperless/src/ diff --git a/docs/setup.rst b/docs/setup.rst index 6c0522b8e..a2f2564be 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -175,7 +175,7 @@ Docker Method modified versions of the configuration files. 4. Modify ``docker-compose.yml`` to your preferences, following the instructions in comments in the file. The only change that is a hard - requirement is to specify where the consumption directory should mount. + requirement is to specify where the consumption directory should mount. [#docker-compose]_ 5. 
Modify ``docker-compose.env`` and adapt the following environment variables: ``PAPERLESS_PASSPHRASE`` @@ -192,7 +192,7 @@ Docker Method default English, set this parameter to a space separated list of three-letter language-codes after `ISO 639-2/T`_. For a list of available languages -- including their three letter codes -- see the - `Debian packagelist`_. + `Alpine packagelist`_. ``USERMAP_UID`` and ``USERMAP_GID`` If you want to mount the consumption volume (directory ``/consume`` within @@ -282,12 +282,16 @@ Docker Method .. _Docker: https://www.docker.com/ .. _docker-compose: https://docs.docker.com/compose/install/ .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes -.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- +.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64 .. [#compose] You of course don't have to use docker-compose, but it simplifies deployment immensely. If you know your way around Docker, feel free to tinker around without using compose! +.. [#docker-compose] If you're upgrading your docker-compse images from version + 1.1.0 or earlier, you might need to change in the ``docker-compose.yml` + file the ``image: pitkley/paperless`` in both the ``webserver`` and ``consumer`` + sections to ``build: ./`` as per the newer ``docker-compose.yml.example`` file .. _setup-permanent: From e20b4fb9051ce78ffba0a1c9d03b787d6d438086 Mon Sep 17 00:00:00 2001 From: Guy Date: Mon, 29 Jan 2018 23:41:52 +0200 Subject: [PATCH 08/16] fixing typos and rst syntax --- docs/setup.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index a2f2564be..7fad393ba 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -175,7 +175,7 @@ Docker Method modified versions of the configuration files. 4. Modify ``docker-compose.yml`` to your preferences, following the instructions in comments in the file. 
The only change that is a hard - requirement is to specify where the consumption directory should mount. [#docker-compose]_ + requirement is to specify where the consumption directory should mount. _. [#docker-compose.yml]_ 5. Modify ``docker-compose.env`` and adapt the following environment variables: ``PAPERLESS_PASSPHRASE`` @@ -288,9 +288,9 @@ Docker Method simplifies deployment immensely. If you know your way around Docker, feel free to tinker around without using compose! -.. [#docker-compose] If you're upgrading your docker-compse images from version - 1.1.0 or earlier, you might need to change in the ``docker-compose.yml` - file the ``image: pitkley/paperless`` in both the ``webserver`` and ``consumer`` +.. [#docker-compose.yml] If you're upgrading your docker-compse images from version + 1.1.0 or earlier, you might need to change in the ``docker-compose.yml`` + file the ``image: pitkley/paperless`` directive in both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the newer ``docker-compose.yml.example`` file .. _setup-permanent: From e900a38983709cc73ced3bc80c2dbc9ba230aee6 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 30 Jan 2018 17:19:18 +0000 Subject: [PATCH 09/16] Update docs to reflect Docker changes --- docs/changelog.rst | 6 ++++++ docs/setup.rst | 12 +++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 1c0fe3860..6f3cc8567 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,9 @@ Changelog ######### +* 1.2.0 + * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ + and `Pit`_. * 1.1.0 * Fix for `#283`_, a redirect bug which broke interactions with paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. @@ -268,6 +271,7 @@ Changelog .. _Lukas Winkler: https://github.com/Findus23 .. _chris-aeviator: https://github.com/chris-aeviator .. _Dan Panzarella: https://github.com/pzl +.. 
_addadi: https://github.com/addadi .. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 @@ -317,3 +321,5 @@ Changelog .. _#283: https://github.com/danielquinn/paperless/issues/283 .. _#256: https://github.com/danielquinn/paperless/pull/256 .. _#285: https://github.com/danielquinn/paperless/pull/285 + +.. _pipenv: https://docs.pipenv.org/ diff --git a/docs/setup.rst b/docs/setup.rst index 7fad393ba..f8c696fc8 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -175,7 +175,8 @@ Docker Method modified versions of the configuration files. 4. Modify ``docker-compose.yml`` to your preferences, following the instructions in comments in the file. The only change that is a hard - requirement is to specify where the consumption directory should mount. _. [#docker-compose.yml]_ + requirement is to specify where the consumption directory should + mount.[#dockercomposeyml]_ 5. Modify ``docker-compose.env`` and adapt the following environment variables: ``PAPERLESS_PASSPHRASE`` @@ -288,10 +289,11 @@ Docker Method simplifies deployment immensely. If you know your way around Docker, feel free to tinker around without using compose! -.. [#docker-compose.yml] If you're upgrading your docker-compse images from version - 1.1.0 or earlier, you might need to change in the ``docker-compose.yml`` - file the ``image: pitkley/paperless`` directive in both the ``webserver`` and ``consumer`` - sections to ``build: ./`` as per the newer ``docker-compose.yml.example`` file +.. [#dockercomposeyml] If you're upgrading your docker-compose images from + version 1.1.0 or earlier, you might need to change in the + ``docker-compose.yml`` file the ``image: pitkley/paperless`` directive in + both the ``webserver`` and ``consumer`` sections to ``build: ./`` as per the + newer ``docker-compose.yml.example`` file .. 
_setup-permanent: From 31c8cf020ef99a0e9f5053c1b5a988e40c104ae6 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 30 Jan 2018 18:46:46 +0000 Subject: [PATCH 10/16] Clean up grammar & remove VersionEye --- README.rst | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 4f0d21c2c..fc1573ecd 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,6 @@ Paperless |Documentation| |Chat| |Travis| -|Dependencies| Index and archive all of your scanned paper documents @@ -28,12 +27,11 @@ scanner produces 1. Buy a document scanner that can write to a place on your network. If you need some inspiration, have a look at the `scanner recommendations`_ page. - recommended by another user. 2. Set it up to "scan to FTP" or something similar. It should be able to push - scanned images to a server without you having to do anything. If your - scanner doesn't know how to automatically upload the file somewhere, you can - always do that manually. Paperless doesn't care how the documents get into - its local consumption directory. + scanned images to a server without you having to do anything. Of course if + your scanner doesn't know how to automatically upload the file somewhere, + you can always do that manually. Paperless doesn't care how the documents + get into its local consumption directory. 3. Have the target server run the Paperless consumption script to OCR the file and index it into a local database. 4. Use the web frontend to sift through the database and find what you want. @@ -140,5 +138,3 @@ work and they need the money a lot more than I do. :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge .. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master :target: https://travis-ci.org/danielquinn/paperless -.. 
|Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg - :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500 From cd92c005e3f18b1dd3dbeddb26b046b31ad563cf Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 30 Jan 2018 20:13:35 +0000 Subject: [PATCH 11/16] Add support for using pre-existing text from PDFs --- .travis.yml | 4 ++++ Dockerfile | 4 ++-- docs/changelog.rst | 11 ++++++++++- docs/requirements.rst | 24 ++++++++++++++---------- requirements.txt | 1 + src/paperless/settings.py | 3 +++ src/paperless_tesseract/parsers.py | 26 ++++++++++++++++++++++++++ 7 files changed, 60 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a136be91..41abf71ee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,9 @@ language: python +before_install: +- sudo apt-get update -qq +- sudo apt-get install -qq libpoppler-cpp-dev + sudo: false matrix: diff --git a/Dockerfile b/Dockerfile index 11a83e2b1..9c2f9c5f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,9 +18,9 @@ ENV PAPERLESS_EXPORT_DIR=/export \ # Install dependencies RUN apk --no-cache --update add \ python3 gnupg libmagic bash \ - sudo tesseract-ocr imagemagick ghostscript unpaper && \ + sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ apk --no-cache add --virtual .build-dependencies \ - python3-dev gcc musl-dev zlib-dev jpeg-dev && \ + python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ # Install python dependencies python3 -m ensurepip && \ rm -r /usr/lib/python*/ensurepip && \ diff --git a/docs/changelog.rst b/docs/changelog.rst index 6f3cc8567..bd8b751a1 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,7 +3,15 @@ Changelog * 1.2.0 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ - and `Pit`_. + and `Pit`_. + * `BastianPoe`_ has added the long-awaited feature to automatically skip the + OCR step when the PDF already contains text. 
This can be overridden by + setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or + in the environment. Note that this also means that Paperless now requires + ``libpoppler-cpp-dev`` to be installed. **You'll need to run + ``pip install -r requirements.txt`` after the usual ``git pull`` to + properly update**. + * 1.1.0 * Fix for `#283`_, a redirect bug which broke interactions with paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. @@ -272,6 +280,7 @@ Changelog .. _chris-aeviator: https://github.com/chris-aeviator .. _Dan Panzarella: https://github.com/pzl .. _addadi: https://github.com/addadi +.. _BastianPoe: https://github.com/BastianPoe .. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 diff --git a/docs/requirements.rst b/docs/requirements.rst index 1f476c9dd..ee42cb96a 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -11,24 +11,27 @@ should work) that has the following software installed: * `Tesseract`_, plus its language files matching your document base. * `Imagemagick`_ version 6.7.5 or higher * `unpaper`_ +* `libpoppler-cpp-dev`_ PDF rendering library .. _Python3: https://python.org/ .. _GNU Privacy Guard: https://gnupg.org .. _Tesseract: https://github.com/tesseract-ocr .. _Imagemagick: http://imagemagick.org/ .. _unpaper: https://www.flameeyes.eu/projects/unpaper +.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/ Notably, you should confirm how you access your Python3 installation. Many -Linux distributions will install Python3 in parallel to Python2, using the names -``python3`` and ``python`` respectively. The same goes for ``pip3`` and -``pip``. Running Paperless with Python2 will likely break things, so make sure that -you're using the right version. +Linux distributions will install Python3 in parallel to Python2, using the +names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and +``pip``. 
Running Paperless with Python2 will likely break things, so make sure +that you're using the right version. For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to refer to their Python3 versions. In addition to the above, there are a number of Python requirements, all of -which are listed in a file called ``requirements.txt`` in the project root directory. +which are listed in a file called ``requirements.txt`` in the project root +directory. If you're not working on a virtual environment (like Vagrant or Docker), you should probably be using a virtualenv, but that's your call. The reasons why @@ -39,12 +42,13 @@ probably figure that out before continuing. .. _requirements-apple: -Apple-tastic Complications --------------------------- +Problems with Imagemagick & PDFs +-------------------------------- -Some users have `run into problems`_ with installing ImageMagick on Apple -systems using HomeBrew. The solution appears to be to install ghostscript as -well as ImageMagick: +Some users have `run into problems`_ with getting ImageMagick to do its thing +with PDFs. Often this is the case with Apple systems using HomeBrew, but other +Linuxes have been a problem as well. The solution appears to be to install +ghostscript as well as ImageMagick: .. _run into problems: https://github.com/danielquinn/paperless/issues/25 diff --git a/requirements.txt b/requirements.txt index 96336725b..9df571328 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ python-dotenv>=0.6.2 python-gnupg>=0.3.9 pytz>=2016.10 gunicorn==19.7.1 +pdftotext>=2.0.1 # For the tests factory-boy diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 696b0ddbd..6d750c9b0 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") # The amount of threads to use for OCR OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") +# OCR all documents? 
+OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) + # If this is true, any failed attempts to OCR a PDF will result in the PDF # being indexed anyway, with whatever we could get. If it's False, the file # will simply be left in the CONSUMPTION_DIR. diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 0c5d039e1..c90c9f020 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -3,6 +3,7 @@ import os import re import subprocess from multiprocessing.pool import Pool +import pdftotext import langdetect import pyocr @@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser): THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE + OCR_ALWAYS = settings.OCR_ALWAYS def get_thumbnail(self): """ @@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser): return os.path.join(self.tempdir, "convert-0000.png") + def _is_ocred(self): + # Extract text from PDF using pdftotext + text = get_text_from_pdf(self.document_path) + + # We assume, that a PDF with at least 50 characters contains text + # (so no OCR required) + if len(text) > 50: + return True + + return False + def get_text(self): + if not self.OCR_ALWAYS and self._is_ocred(): + self.log("info", "Skipping OCR, using Text from PDF") + return get_text_from_pdf(self.document_path) images = self._get_greyscale() @@ -212,3 +228,13 @@ def image_to_string(args): except (TesseractError, OtherTesseractError): pass return ocr.image_to_string(f, lang=lang) + + +def get_text_from_pdf(pdf_file): + with open(pdf_file, "rb") as f: + try: + pdf = pdftotext.PDF(f) + except pdftotext.Error: + return False + + return "\n".join(pdf) From 3fcd1e2d7eeb17e16315f8ad361841292cc69e91 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 30 Jan 2018 20:27:40 +0000 Subject: [PATCH 12/16] Fix text formatting --- 
docs/changelog.rst | 474 ++++++++++++++++++++++++--------------------- 1 file changed, 258 insertions(+), 216 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index bd8b751a1..3cc5dfaf1 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,254 +1,296 @@ Changelog ######### -* 1.2.0 - * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ - and `Pit`_. - * `BastianPoe`_ has added the long-awaited feature to automatically skip the - OCR step when the PDF already contains text. This can be overridden by - setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or - in the environment. Note that this also means that Paperless now requires - ``libpoppler-cpp-dev`` to be installed. **You'll need to run - ``pip install -r requirements.txt`` after the usual ``git pull`` to - properly update**. +1.2.0 +===== -* 1.1.0 - * Fix for `#283`_, a redirect bug which broke interactions with - paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. - * Addition of an optional new financial year filter, courtesy of - `David Martin`_ `#256`_ - * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of - `Dan Panzarella`_ +* New Docker image, now based on Alpine, thanks to the efforts of `addadi`_ + and `Pit`_. +* `BastianPoe`_ has added the long-awaited feature to automatically skip the + OCR step when the PDF already contains text. This can be overridden by + setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or + in the environment. Note that this also means that Paperless now requires + ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run + ``pip install -r requirements.txt`` after the usual ``git pull`` to + properly update. -* 1.0.0 - * Upgrade to Django 1.11. **You'll need to run - ``pip install -r requirements.txt`` after the usual ``git pull`` to - properly update**. 
- * Replace the templatetag-based hack we had for document listing in favour of - a slightly less ugly solution in the form of another template tag with less - copypasta. - * Support for multi-word-matches for auto-tagging thanks to an excellent - patch from `ishirav`_ `#277`_. - * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of - the text and checkboxes under some resolutions `#272`_. - * Patched the Docker config to force the serving of static files. Credit for - this one goes to `dev-rke`_ via `#248`_. - * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_. - * Date fields in the admin are now expressed as HTML5 date fields thanks to - `Lukas Winkler`_'s issue `#278`_ +1.1.0 +===== -* 0.8.0 - * Paperless can now run in a subdirectory on a host (``/paperless``), rather - than always running in the root (``/``) thanks to `maphy-psd`_'s work on - `#255`_. +* Fix for `#283`_, a redirect bug which broke interactions with + paperless-desktop. Thanks to `chris-aeviator`_ for reporting it. +* Addition of an optional new financial year filter, courtesy of + `David Martin`_ `#256`_ +* Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of + `Dan Panzarella`_ -* 0.7.0 - * **Potentially breaking change**: As per `#235`_, Paperless will no longer - automatically delete documents attached to correspondents when those - correspondents are themselves deleted. This was Django's default - behaviour, but didn't make much sense in Paperless' case. Thanks to - `Thomas Brueggemann`_ and `David Martin`_ for their input on this one. - * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files - properly. Thanks to `ayounggun`_ for reporting this one and to - `Kusti Skytén`_ for posting the correct solution in the Github issue. +1.0.0 +===== -* 0.6.0 - * Abandon the shared-secret trick we were using for the POST API in favour - of BasicAuth or Django session. - * Fix the POST API so it actually works. 
`#236`_ - * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET`` - as it was being used both for the API (now replaced with a normal auth) - and form email polling. Now that we're only using it for email, this - variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value - will still work for a while, but you should change your config if you've - been using the email polling feature. Thanks to `Joshua Gilman`_ for all - the help with this feature. -* 0.5.0 - * Support for fuzzy matching in the auto-tagger & auto-correspondent systems - thanks to `Jake Gysland`_'s patch `#220`_. - * Modified the Dockerfile to prepare an export directory (`#212`_). Thanks - to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on - this one. - * Updated the import/export scripts to include support for thumbnails. Big - thanks to `CkuT`_ for finding this shortcoming and doing the work to get - it fixed in `#224`_. - * All of the following changes are thanks to `David Martin`_: - * Bumped the dependency on pyocr to 0.4.7 so new users can make use of - Tesseract 4 if they so prefer (`#226`_). - * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_) - * Amended the documentation for better handling of systemd service files (`#229`_) - * Amended the Django Admin configuration to have nice headers (`#230`_) +* Upgrade to Django 1.11. **You'll need to run + ``pip install -r requirements.txt`` after the usual ``git pull`` to + properly update**. +* Replace the templatetag-based hack we had for document listing in favour of + a slightly less ugly solution in the form of another template tag with less + copypasta. +* Support for multi-word-matches for auto-tagging thanks to an excellent + patch from `ishirav`_ `#277`_. +* Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of + the text and checkboxes under some resolutions `#272`_. +* Patched the Docker config to force the serving of static files. 
Credit for + this one goes to `dev-rke`_ via `#248`_. +* Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_. +* Date fields in the admin are now expressed as HTML5 date fields thanks to + `Lukas Winkler`_'s issue `#278`_ -* 0.4.1 - * Fix for `#206`_ wherein the pluggable parser didn't recognise files with - all-caps suffixes like ``.PDF`` +0.8.0 +===== -* 0.4.0 - * Introducing reminders. See `#199`_ for more information, but the short - explanation is that you can now attach simple notes & times to documents - which are made available via the API. Currently, the default API - (basically just the Django admin) doesn't really make use of this, but - `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would - like to make use of this feature in his project. +* Paperless can now run in a subdirectory on a host (``/paperless``), rather + than always running in the root (``/``) thanks to `maphy-psd`_'s work on + `#255`_. -* 0.3.6 - * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the - correspondent or the tags for a document. - * The ``content`` field is now optional, to allow for the edge case of a - purely graphical document. - * You can no longer add documents via the admin. This never worked in the - first place, so all I've done here is remove the link to the broken form. - * The consumer code has been heavily refactored to support a pluggable - interface. Install a paperless consumer via pip and tell paperless about - it with an environment variable, and you're good to go. Proper - documentation is on its way. +0.7.0 +===== -* 0.3.5 - * A serious facelift for the documents listing page wherein we drop the - tabular layout in favour of a tiled interface. - * Users can now configure the number of items per page. - * Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value. 
- * Moved the dotenv loading to the top of settings.py - * Fix for `#112`_: Added checks for binaries required for document - consumption. +* **Potentially breaking change**: As per `#235`_, Paperless will no longer + automatically delete documents attached to correspondents when those + correspondents are themselves deleted. This was Django's default + behaviour, but didn't make much sense in Paperless' case. Thanks to + `Thomas Brueggemann`_ and `David Martin`_ for their input on this one. +* Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files + properly. Thanks to `ayounggun`_ for reporting this one and to + `Kusti Skytén`_ for posting the correct solution in the Github issue. -* 0.3.4 - * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3. - Note that you *can* use Django Suit with Paperless, but only in a - non-profit situation as their free license prohibits for-profit use. As a - result, I can't bundle Suit with Paperless without conflicting with the - GPL. Further development will be done against the stock Django admin. - * I shrunk the thumbnails a little 'cause they were too big for me, even on - my high-DPI monitor. - * BasicAuth support for document and thumbnail downloads, as well as the Push - API thanks to @thomasbrueggemann. See `#179`_. +0.6.0 +===== -* 0.3.3 - * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw! - * Timezone, items per page, and default language are now all configurable, - also thanks to @ekw. +* Abandon the shared-secret trick we were using for the POST API in favour + of BasicAuth or Django session. +* Fix the POST API so it actually works. `#236`_ +* **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET`` + as it was being used both for the API (now replaced with a normal auth) + and form email polling. Now that we're only using it for email, this + variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. 
The old value + will still work for a while, but you should change your config if you've + been using the email polling feature. Thanks to `Joshua Gilman`_ for all + the help with this feature. -* 0.3.2 - * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the - user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need - arise. +0.5.0 +===== -* 0.3.1 - * Added a default value for ``CONVERT_BINARY`` +* Support for fuzzy matching in the auto-tagger & auto-correspondent systems + thanks to `Jake Gysland`_'s patch `#220`_. +* Modified the Dockerfile to prepare an export directory (`#212`_). Thanks + to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on + this one. +* Updated the import/export scripts to include support for thumbnails. Big + thanks to `CkuT`_ for finding this shortcoming and doing the work to get + it fixed in `#224`_. +* All of the following changes are thanks to `David Martin`_: + * Bumped the dependency on pyocr to 0.4.7 so new users can make use of + Tesseract 4 if they so prefer (`#226`_). + * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_) + * Amended the documentation for better handling of systemd service files (`#229`_) + * Amended the Django Admin configuration to have nice headers (`#230`_) -* 0.3.0 - * Updated to using django-filter 1.x - * Added some system checks so new users aren't confused by misconfigurations. - * Consumer loop time is now configurable for systems with slow writes. Just - set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default - is 10. - * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``, - ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use - ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and - ``PAPERLESS_SHARED_SECRET`` respectively instead. 
+0.4.1 +===== -* 0.2.0 +* Fix for `#206`_ wherein the pluggable parser didn't recognise files with + all-caps suffixes like ``.PDF`` - * `#150`_: The media root is now a variable you can set in - ``paperless.conf``. - * `#148`_: The database location (sqlite) is now a variable you can set in - ``paperless.conf``. - * `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch`` - URL. - * `#131`_: Document files are now automatically removed from disk when - they're deleted in Paperless. - * `#121`_: Fixed a bug where Paperless wasn't setting document creation time - based on the file naming scheme. - * `#81`_: Added a hook to run an arbitrary script after every document is - consumed. - * `#98`_: Added optional environment variables for ImageMagick so that it - doesn't explode when handling Very Large Documents or when it's just - running on a low-memory system. Thanks to `Florian Harr`_ for his help on - this one. - * `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to - `Justin Snyman`_ for the pointers in the issue queue. - * Added support for guessing the date from the file name along with the - correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull - request that I took forever to merge and to `Pit`_ for his efforts on the - regex front. - * `#94`_: Restored support for changing the created date in the UI. Thanks - to `Martin Honermeyer`_ and `Tim White`_ for working with me on this. +0.4.0 +===== -* 0.1.1 +* Introducing reminders. See `#199`_ for more information, but the short + explanation is that you can now attach simple notes & times to documents + which are made available via the API. Currently, the default API + (basically just the Django admin) doesn't really make use of this, but + `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would + like to make use of this feature in his project. 
- * Potentially **Breaking Change**: All references to "sender" in the code - have been renamed to "correspondent" to better reflect the nature of the - property (one could quite reasonably scan a document before sending it to - someone.) - * `#67`_: Rewrote the document exporter and added a new importer that allows - for full metadata retention without depending on the file name and - modification time. A big thanks to `Tikitu de Jager`_, `Pit`_, - `Florian Jung`_, and `Christopher Luu`_ for their code snippets and - contributing conversation that lead to this change. - * `#20`_: Added *unpaper* support to help in cleaning up the scanned image - before it's OCR'd. Thanks to `Pit`_ for this one. - * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI. - * `#68`_: Added support for using a proper config file at - ``/etc/paperless.conf`` and modified the systemd unit files to use it. - * Refactored the Vagrant installation process to use environment variables - rather than asking the user to modify ``settings.py``. - * `#44`_: Harmonise environment variable names with constant names. - * `#60`_: Setup logging to actually use the Python native logging framework. - * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images - to be imported but made unavailable. +0.3.6 +===== -* 0.1.0 +* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the + correspondent or the tags for a document. +* The ``content`` field is now optional, to allow for the edge case of a + purely graphical document. +* You can no longer add documents via the admin. This never worked in the + first place, so all I've done here is remove the link to the broken form. +* The consumer code has been heavily refactored to support a pluggable + interface. Install a paperless consumer via pip and tell paperless about + it with an environment variable, and you're good to go. Proper + documentation is on its way. - * Docker support! 
Big thanks to `Wayne Werner`_, `Brian Conn`_, and - `Tikitu de Jager`_ for this one, and especially to `Pit`_ - who spearheadded this effort. - * A simple REST API is in place, but it should be considered unstable. - * Cleaned up the consumer to use temporary directories instead of a single - scratch space. (Thanks `Pit`_) - * Improved the efficiency of the consumer by parsing pages more intelligently - and introducing a threaded OCR process (thanks again `Pit`_). - * `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_. - * `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by - `Pit`_. - * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) - * `#54`_: Documented the re-tagger (`zedster`_) - * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) - * Added tox with pep8 checking +0.3.5 +===== -* 0.0.6 +* A serious facelift for the documents listing page wherein we drop the + tabular layout in favour of a tiled interface. +* Users can now configure the number of items per page. +* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value. +* Moved the dotenv loading to the top of settings.py +* Fix for `#112`_: Added checks for binaries required for document + consumption. - * Added support for parallel OCR (significant work from `Pit`_) - * Sped up the language detection (significant work from `Pit`_) - * Added simple logging +0.3.4 +===== -* 0.0.5 +* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3. + Note that you *can* use Django Suit with Paperless, but only in a + non-profit situation as their free license prohibits for-profit use. As a + result, I can't bundle Suit with Paperless without conflicting with the + GPL. Further development will be done against the stock Django admin. +* I shrunk the thumbnails a little 'cause they were too big for me, even on + my high-DPI monitor. 
+* BasicAuth support for document and thumbnail downloads, as well as the Push + API thanks to @thomasbrueggemann. See `#179`_. - * Added support for image files as documents (png, jpg, gif, tiff) - * Added a crude means of HTTP POST for document imports - * Added IMAP mail support - * Added a re-tagging utility - * Documentation for the above as well as data migration +0.3.3 +===== -* 0.0.4 +* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw! +* Timezone, items per page, and default language are now all configurable, + also thanks to @ekw. - * Added automated tagging basted on keyword matching - * Cleaned up the document listing page - * Removed ``User`` and ``Group`` from the admin - * Added ``pytz`` to the list of requirements +0.3.2 +===== -* 0.0.3 +* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the + user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need + arise. - * Added basic tagging +0.3.1 +===== -* 0.0.2 +* Added a default value for ``CONVERT_BINARY`` - * Added language detection - * Added datestamps to ``document_exporter``. - * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``. +0.3.0 +===== -* 0.0.1 +* Updated to using django-filter 1.x +* Added some system checks so new users aren't confused by misconfigurations. +* Consumer loop time is now configurable for systems with slow writes. Just + set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default + is 10. +* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``, + ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use + ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and + ``PAPERLESS_SHARED_SECRET`` respectively instead. - * Initial release +0.2.0 +===== + +* `#150`_: The media root is now a variable you can set in + ``paperless.conf``. +* `#148`_: The database location (sqlite) is now a variable you can set in + ``paperless.conf``. 
+* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch`` + URL. +* `#131`_: Document files are now automatically removed from disk when + they're deleted in Paperless. +* `#121`_: Fixed a bug where Paperless wasn't setting document creation time + based on the file naming scheme. +* `#81`_: Added a hook to run an arbitrary script after every document is + consumed. +* `#98`_: Added optional environment variables for ImageMagick so that it + doesn't explode when handling Very Large Documents or when it's just + running on a low-memory system. Thanks to `Florian Harr`_ for his help on + this one. +* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to + `Justin Snyman`_ for the pointers in the issue queue. +* Added support for guessing the date from the file name along with the + correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull + request that I took forever to merge and to `Pit`_ for his efforts on the + regex front. +* `#94`_: Restored support for changing the created date in the UI. Thanks + to `Martin Honermeyer`_ and `Tim White`_ for working with me on this. + +0.1.1 +===== + +* Potentially **Breaking Change**: All references to "sender" in the code + have been renamed to "correspondent" to better reflect the nature of the + property (one could quite reasonably scan a document before sending it to + someone.) +* `#67`_: Rewrote the document exporter and added a new importer that allows + for full metadata retention without depending on the file name and + modification time. A big thanks to `Tikitu de Jager`_, `Pit`_, + `Florian Jung`_, and `Christopher Luu`_ for their code snippets and + contributing conversation that led to this change. +* `#20`_: Added *unpaper* support to help in cleaning up the scanned image + before it's OCR'd. Thanks to `Pit`_ for this one. +* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
+* `#68`_: Added support for using a proper config file at + ``/etc/paperless.conf`` and modified the systemd unit files to use it. +* Refactored the Vagrant installation process to use environment variables + rather than asking the user to modify ``settings.py``. +* `#44`_: Harmonise environment variable names with constant names. +* `#60`_: Setup logging to actually use the Python native logging framework. +* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images + to be imported but made unavailable. + +0.1.0 +===== + +* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and + `Tikitu de Jager`_ for this one, and especially to `Pit`_ + who spearheaded this effort. +* A simple REST API is in place, but it should be considered unstable. +* Cleaned up the consumer to use temporary directories instead of a single + scratch space. (Thanks `Pit`_) +* Improved the efficiency of the consumer by parsing pages more intelligently + and introducing a threaded OCR process (thanks again `Pit`_). +* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_. +* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by + `Pit`_.
+* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) +* `#54`_: Documented the re-tagger (`zedster`_) +* `#57`_: Make sure file is preserved on import failure (`darkmatter`_) +* Added tox with pep8 checking + +0.0.6 +===== + +* Added support for parallel OCR (significant work from `Pit`_) +* Sped up the language detection (significant work from `Pit`_) +* Added simple logging + +0.0.5 +===== + +* Added support for image files as documents (png, jpg, gif, tiff) +* Added a crude means of HTTP POST for document imports +* Added IMAP mail support +* Added a re-tagging utility +* Documentation for the above as well as data migration + +0.0.4 +===== + +* Added automated tagging based on keyword matching +* Cleaned up the document listing page +* Removed ``User`` and ``Group`` from the admin +* Added ``pytz`` to the list of requirements + +0.0.3 +===== + +* Added basic tagging + +0.0.2 +===== + +* Added language detection +* Added datestamps to ``document_exporter``. +* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``. + +0.0.1 +===== + +* Initial release .. _Brian Conn: https://github.com/TheConnMan ..
_Christopher Luu: https://github.com/nuudles From fd5b831979bd7bd16e262e453db310f56e1d7e5a Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Thu, 1 Feb 2018 12:18:01 +0000 Subject: [PATCH 13/16] Fix pytest to 3.3.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 96336725b..7066a9163 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ gunicorn==19.7.1 # For the tests factory-boy -pytest +pytest==3.3.2 # Newer versions break with pytest-sugar pytest-django pytest-sugar pytest-env From 88736ff867eed1d33166251535557507233c7669 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Thu, 1 Feb 2018 12:37:21 +0000 Subject: [PATCH 14/16] Version bump in anticipation of release later this week --- src/paperless/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless/version.py b/src/paperless/version.py index 37a1329f2..09ac5f672 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (1, 1, 0) +__version__ = (1, 2, 0) From 9470154df292fb3fd784aee11917b327b9929665 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Thu, 1 Feb 2018 13:02:48 +0000 Subject: [PATCH 15/16] Fiddling to get Docker Hub to behave --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 9c2f9c5f8..f60ccd073 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,3 +45,4 @@ WORKDIR /usr/src/paperless/src VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"] ENTRYPOINT ["/sbin/docker-entrypoint.sh"] CMD ["--help"] + From ce98019b49d96110c7b1da4595dfcfb0c21916f3 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 1 Feb 2018 10:08:57 -0500 Subject: [PATCH 16/16] Fixing error sentinel for pdftotext when the PDF has no text (scanned images). It was causing a crash previously. 
--- src/paperless_tesseract/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c90c9f020..43c898df5 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -235,6 +235,6 @@ def get_text_from_pdf(pdf_file): try: pdf = pdftotext.PDF(f) except pdftotext.Error: - return False + return "" return "\n".join(pdf)