mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'master' into feature/api
This commit is contained in:
		
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -68,6 +68,8 @@ db.sqlite3 | ||||
| # Other stuff that doesn't belong | ||||
| virtualenv | ||||
| .vagrant | ||||
| docker-compose.yml | ||||
| docker-compose.env | ||||
|  | ||||
| # Used for development | ||||
| scripts/import-for-development | ||||
|   | ||||
							
								
								
									
										43
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | ||||
| FROM python:3.5.1 | ||||
| MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> | ||||
|  | ||||
| # Install dependencies | ||||
| RUN apt-get update \ | ||||
|     && apt-get install -y --no-install-recommends \ | ||||
|         sudo \ | ||||
|         tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \ | ||||
|     && rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| # Install python dependencies | ||||
| RUN mkdir -p /usr/src/paperless | ||||
| WORKDIR /usr/src/paperless | ||||
| COPY requirements.txt /usr/src/paperless/ | ||||
| RUN pip install --no-cache-dir -r requirements.txt | ||||
|  | ||||
| # Copy application | ||||
| RUN mkdir -p /usr/src/paperless/src | ||||
| COPY src/ /usr/src/paperless/src/ | ||||
|  | ||||
| # Set consumption directory | ||||
| ENV PAPERLESS_CONSUME /consume | ||||
| RUN mkdir -p $PAPERLESS_CONSUME | ||||
|  | ||||
| # Migrate database | ||||
| WORKDIR /usr/src/paperless/src | ||||
| RUN mkdir /usr/src/paperless/data | ||||
| RUN ./manage.py migrate | ||||
|  | ||||
| # Create user | ||||
| RUN groupadd -g 1000 paperless \ | ||||
|     && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ | ||||
|     && chown -Rh paperless:paperless /usr/src/paperless | ||||
|  | ||||
| # Setup entrypoint | ||||
| COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh | ||||
| RUN chmod 755 /sbin/docker-entrypoint.sh | ||||
|  | ||||
| # Mount volumes | ||||
| VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"] | ||||
|  | ||||
| ENTRYPOINT ["/sbin/docker-entrypoint.sh"] | ||||
| CMD ["--help"] | ||||
							
								
								
									
										15
									
								
								docker-compose.env.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								docker-compose.env.example
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | ||||
| # Environment variables to set for Paperless | ||||
| # Commented out variables will be replaced by a default within Paperless. | ||||
|  | ||||
| # Passphrase Paperless uses to encrypt and decrypt your documents | ||||
| PAPERLESS_PASSPHRASE=CHANGE_ME | ||||
|  | ||||
| # The number of threads to use for text recognition | ||||
| # PAPERLESS_OCR_THREADS=4 | ||||
|  | ||||
| # Additional languages to install for text recognition | ||||
| # PAPERLESS_OCR_LANGUAGES=deu ita | ||||
|  | ||||
| # You can change the default user and group id to a custom one | ||||
| # USERMAP_UID=1000 | ||||
| # USERMAP_GID=1000 | ||||
							
								
								
									
										37
									
								
								docker-compose.yml.example
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								docker-compose.yml.example
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | ||||
| version: '2' | ||||
|  | ||||
| services: | ||||
|     webserver: | ||||
|         image: paperless | ||||
|         ports: | ||||
|             # You can adapt the port you want Paperless to listen on by | ||||
|             # modifying the part before the `:`. | ||||
|             - "8000:8000" | ||||
|         volumes: | ||||
|             - paperless-data:/usr/src/paperless/data | ||||
|             - paperless-media:/usr/src/paperless/media | ||||
|         env_file: docker-compose.env | ||||
|         environment: | ||||
|             - PAPERLESS_OCR_LANGUAGES= | ||||
|         command: ["runserver", "0.0.0.0:8000"] | ||||
|  | ||||
|     consumer: | ||||
|         image: paperless | ||||
|         volumes: | ||||
|             - paperless-data:/usr/src/paperless/data | ||||
|             - paperless-media:/usr/src/paperless/media | ||||
|             # You have to adapt the local path you want the consumption | ||||
|             # directory to mount to by modifying the part before the ':'. | ||||
|             - /path/to/arbitrary/place:/consume | ||||
|             # Likewise, you can add a local path to mount a directory for | ||||
|             # exporting. This is not strictly needed for paperless to | ||||
|             # function, only if you're exporting your files: uncomment | ||||
|             # it and fill in a local path if you know you're going to  | ||||
|             # want to export your documents. | ||||
|             # - /path/to/another/arbitrary/place:/export | ||||
|         env_file: docker-compose.env | ||||
|         command: ["document_consumer"] | ||||
|  | ||||
| volumes: | ||||
|     paperless-data: | ||||
|     paperless-media: | ||||
							
								
								
									
										18
									
								
								docs/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								docs/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | ||||
| FROM python:3.5.1 | ||||
| MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> | ||||
|  | ||||
| # Install Sphinx and Pygments | ||||
| RUN pip install Sphinx Pygments | ||||
|  | ||||
| # Setup directories, copy data | ||||
| RUN mkdir /build | ||||
| COPY . /build | ||||
| WORKDIR /build/docs | ||||
|  | ||||
| # Build documentation | ||||
| RUN make html | ||||
|  | ||||
| # Start webserver | ||||
| WORKDIR /build/docs/_build/html | ||||
| EXPOSE 8000/tcp | ||||
| CMD ["python3", "-m", "http.server"] | ||||
| @@ -30,6 +30,20 @@ as part of the update: | ||||
| Note that it's possible (even likely) that while ``git pull`` may update some | ||||
| files, the ``migrate`` step may not update anything.  This is totally normal. | ||||
|  | ||||
| If you are :ref:`using Docker <setup-installation-docker>`, the update process | ||||
| requires only a few additional steps: | ||||
|  | ||||
| .. code-block:: shell-session | ||||
|  | ||||
|     $ cd /path/to/project | ||||
|     $ git pull | ||||
|     $ docker build -t paperless . | ||||
|     $ docker-compose up -d | ||||
|     $ docker-compose run --rm webserver migrate | ||||
|  | ||||
| If ``git pull`` doesn't report any changes, there is no need to continue with | ||||
| the remaining steps. | ||||
|  | ||||
|  | ||||
| .. _migrating-backup: | ||||
|  | ||||
| @@ -53,6 +67,45 @@ with Django's ``dumpdata`` command, which produces JSON output. | ||||
|     $ ./manage.py document_exporter /path/to/arbitrary/place/ | ||||
|     $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json | ||||
|  | ||||
| If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags | ||||
| as JSON is almost as easy: | ||||
|  | ||||
| .. code-block:: shell-session | ||||
|  | ||||
|     $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json | ||||
|  | ||||
| To export the documents you can either use ``docker run`` directly, specifying all | ||||
| the commandline options by hand, or (more simply) mount a second volume for export. | ||||
|  | ||||
| To mount a volume for exports, follow the instructions in the | ||||
| ``docker-compose.yml.example`` file for the ``/export`` volume (making the changes | ||||
| in your own ``docker-compose.yml`` file, of course). Once you have the | ||||
| volume mounted, the command to run an export is: | ||||
|  | ||||
| .. code-block:: console | ||||
|  | ||||
|    $ docker-compose run --rm consumer document_exporter /export | ||||
|  | ||||
| If you prefer to use ``docker run`` directly, supplying the necessary commandline | ||||
| options: | ||||
|  | ||||
| .. code-block:: shell-session | ||||
|  | ||||
|    $ # Identify your containers | ||||
|    $ docker-compose ps | ||||
|            Name                       Command                State     Ports | ||||
|    ------------------------------------------------------------------------- | ||||
|    paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0 | ||||
|    paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0 | ||||
|  | ||||
|    $ # Make sure to replace your passphrase and remove or adapt the id mapping | ||||
|    $ docker run --rm \ | ||||
|        --volumes-from paperless_data_1 \ | ||||
|        --volume /path/to/arbitrary/place:/export \ | ||||
|        -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ | ||||
|        -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ | ||||
|        paperless document_exporter /export | ||||
|  | ||||
|  | ||||
| .. _migrating-restoring: | ||||
|  | ||||
| @@ -77,3 +130,25 @@ exported documents into the consumption directory and start up the consumer. | ||||
|     $ cp /path/to/exported/docs/* /path/to/consumption/dir/ | ||||
|     $ ./manage.py document_consumer | ||||
|  | ||||
| Importing your data if you are :ref:`using Docker <setup-installation-docker>` | ||||
| is almost as simple: | ||||
|  | ||||
| .. code-block:: shell-session | ||||
|  | ||||
|     $ # Stop and remove your current containers | ||||
|     $ docker-compose stop | ||||
|     $ docker-compose rm -f | ||||
|  | ||||
|     $ # Recreate them, add the superuser | ||||
|     $ docker-compose up -d | ||||
|     $ docker-compose run --rm webserver createsuperuser | ||||
|  | ||||
|     $ # Load the tags | ||||
|     $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - | ||||
|  | ||||
|     $ # Load your exported documents into the consumption directory | ||||
|     $ # (How you do this highly depends on how you have set this up) | ||||
|     $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ | ||||
|  | ||||
| After loading the documents into the consumption directory the consumer will | ||||
| immediately start consuming the documents. | ||||
|   | ||||
| @@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to: | ||||
|     $ pip install sphinx | ||||
|  | ||||
| and then cd into the ``docs`` directory and type ``make html``. | ||||
|  | ||||
| If you are using Docker, you can use the following commands to build the | ||||
| documentation and run a webserver serving it on `port 8001`_: | ||||
|  | ||||
| .. code:: bash | ||||
|  | ||||
|     $ pwd | ||||
|     /path/to/paperless | ||||
|  | ||||
|     $ docker build -t paperless:docs -f docs/Dockerfile . | ||||
|     $ docker run --rm -it -p "8001:8000" paperless:docs | ||||
|  | ||||
| .. _port 8001: http://127.0.0.1:8001 | ||||
|   | ||||
							
								
								
									
										174
									
								
								docs/setup.rst
									
									
									
									
									
								
							
							
						
						
									
										174
									
								
								docs/setup.rst
									
									
									
									
									
								
							| @@ -37,11 +37,18 @@ or just download the tarball and go that route: | ||||
| Installation & Configuration | ||||
| ---------------------------- | ||||
|  | ||||
| You can go two routes with setting up and running Paperless.  The *Vagrant* | ||||
| route is quick & easy, but means you're running a VM which comes with memory | ||||
| consumption etc.  Alternatively the standard, "bare metal" approach is a little | ||||
| more complicated. | ||||
| You can go multiple routes with setting up and running Paperless. The `Vagrant | ||||
| route`_ is quick & easy, but means you're running a VM which comes with memory | ||||
| consumption etc. We also `support Docker`_, which you can use natively under | ||||
| Linux and in a VM with `Docker Machine`_ (this guide was written for native | ||||
| Docker usage under Linux; you might have to adapt it for Docker Machine). | ||||
| Alternatively the standard, `bare metal`_ approach is a little more complicated. | ||||
|  | ||||
| .. _Vagrant route: setup-installation-vagrant_ | ||||
| .. _support Docker: setup-installation-docker_ | ||||
| .. _bare metal: setup-installation-standard_ | ||||
|  | ||||
| .. _Docker Machine: https://docs.docker.com/machine/ | ||||
|  | ||||
| .. _setup-installation-standard: | ||||
|  | ||||
| @@ -118,6 +125,157 @@ Vagrant Method | ||||
| .. _Paperless server: http://172.28.128.4:8000 | ||||
|  | ||||
|  | ||||
| .. _setup-installation-docker: | ||||
|  | ||||
| Docker Method | ||||
| ............. | ||||
|  | ||||
| 1. Install `Docker`_. | ||||
|  | ||||
|    .. caution:: | ||||
|  | ||||
|       As mentioned earlier, this guide assumes that you use Docker natively | ||||
|       under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, | ||||
|       you will have to adapt IP addresses, volume-mounting, command execution | ||||
|       and maybe more. | ||||
|  | ||||
| 2. Install `docker-compose`_. [#compose]_ | ||||
|  | ||||
|    .. caution:: | ||||
|  | ||||
|        If you want to use the included ``docker-compose.yml.example`` file, you | ||||
|        need to have at least Docker version **1.10.0** and docker-compose | ||||
|        version **1.6.0**. | ||||
|  | ||||
|        See the `Docker installation guide`_ on how to install the current | ||||
|        version of Docker for your operating system or Linux distribution of | ||||
|        choice. To get an up-to-date version of docker-compose, follow the | ||||
|        `docker-compose installation guide`_ if your package repository doesn't | ||||
|        include it. | ||||
|  | ||||
|        .. _Docker installation guide: https://docs.docker.com/engine/installation/ | ||||
|        .. _docker-compose installation guide: https://docs.docker.com/compose/install/ | ||||
|  | ||||
| 3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and | ||||
|    a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be | ||||
|    editing both these files: taking a copy ensures that you can ``git pull`` to  | ||||
|    receive updates without risking merge conflicts with your modified versions  | ||||
|    of the configuration files. | ||||
| 4. Modify ``docker-compose.yml`` to your preferences, following the instructions | ||||
|    in comments in the file. The only change that is a hard requirement is to  | ||||
|    specify where the consumption directory should mount. | ||||
| 5. Modify ``docker-compose.env`` and adapt the following environment variables: | ||||
|  | ||||
|    ``PAPERLESS_PASSPHRASE`` | ||||
|      This is the passphrase Paperless uses to encrypt/decrypt the original | ||||
|      document. | ||||
|  | ||||
|    ``PAPERLESS_OCR_THREADS`` | ||||
|      This is the number of threads the OCR process will spawn to process | ||||
|      document pages in parallel. If the variable is not set, Python determines | ||||
|      the core-count of your CPU and uses that value. | ||||
|  | ||||
|    ``PAPERLESS_OCR_LANGUAGES`` | ||||
|      If you want the OCR to recognize other languages in addition to the default | ||||
|      English, set this parameter to a space-separated list of three-letter | ||||
|      language codes as defined by `ISO 639-2/T`_. For a list of available languages -- | ||||
|      including their three letter codes -- see the `Debian packagelist`_. | ||||
|  | ||||
|    ``USERMAP_UID`` and ``USERMAP_GID`` | ||||
|      If you want to mount the consumption volume (directory ``/consume`` within | ||||
|      the containers) to a host-directory -- which you probably want to do -- | ||||
|      access rights might be an issue. The default user and group ``paperless`` | ||||
|      in the containers have an id of 1000. The containers will enforce that the | ||||
|      owning group of the consumption directory will be ``paperless`` to be able | ||||
|      to delete consumed documents. If your host-system has a group with an id of | ||||
|      1000 and you don't want this group to have access rights to the consumption | ||||
|      directory, you can use ``USERMAP_GID`` to change the id in the container | ||||
|      and thus the one of the consumption directory. Furthermore, you can change | ||||
|      the id of the default user as well using ``USERMAP_UID``. | ||||
|  | ||||
| 6. Run ``docker-compose up -d``. This will create and start the necessary | ||||
|    containers. | ||||
| 7. To be able to log in, you will need a superuser. To create it, execute the | ||||
|    following command: | ||||
|  | ||||
|    .. code-block:: shell-session | ||||
|  | ||||
|        $ docker-compose run --rm webserver createsuperuser | ||||
|  | ||||
|    This will prompt you to set a username (default ``paperless``), an optional | ||||
|    e-mail address and finally a password. | ||||
| 8. The default ``docker-compose.yml`` exports the webserver on your local port | ||||
|    8000. If you haven't adapted this, you should now be able to visit your | ||||
|    `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can log in with the | ||||
|    user and password you just created. | ||||
| 9. Add files to the consumption directory in whichever way you prefer. The | ||||
|    following are two possible options: | ||||
|  | ||||
|    1. Mount the consumption directory to a local host path by modifying your | ||||
|       ``docker-compose.yml``: | ||||
|  | ||||
|       .. code-block:: diff | ||||
|  | ||||
|          diff --git a/docker-compose.yml b/docker-compose.yml | ||||
|          --- a/docker-compose.yml | ||||
|          +++ b/docker-compose.yml | ||||
|          @@ -17,9 +18,8 @@ services: | ||||
|                   volumes: | ||||
|                       - paperless-data:/usr/src/paperless/data | ||||
|                       - paperless-media:/usr/src/paperless/media | ||||
|          -            - /consume | ||||
|          +            - /local/path/you/choose:/consume | ||||
|  | ||||
|       .. danger:: | ||||
|  | ||||
|           While the consumption container will ensure at startup that it can | ||||
|           **delete** a consumed file from a host-mounted directory, it might not | ||||
|           be able to **read** the document in the first place if the access | ||||
|           rights to the file are incorrect. | ||||
|  | ||||
|           Make sure that the documents you put into the consumption directory | ||||
|           will either be readable by everyone (``chmod o+r file.pdf``) or | ||||
|           readable by the default user or group id 1000 (or the one you have set | ||||
|           with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). | ||||
|  | ||||
|    2. Use ``docker cp`` to copy your files directly into the container: | ||||
|  | ||||
|       .. code-block:: shell-session | ||||
|  | ||||
|          $ # Identify your containers | ||||
|          $ docker-compose ps | ||||
|                  Name                       Command                State     Ports | ||||
|          ------------------------------------------------------------------------- | ||||
|          paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0 | ||||
|          paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0 | ||||
|  | ||||
|          $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume | ||||
|  | ||||
|       ``docker cp`` is a one-shot-command, just like ``cp``. This means that | ||||
|       every time you want to consume a new document, you will have to execute | ||||
|       ``docker cp`` again. You can of course automate this process, but option 1 | ||||
|       is generally the preferred one. | ||||
|  | ||||
|       .. danger:: | ||||
|  | ||||
|           ``docker cp`` will change the owning user and group of a copied file | ||||
|           to the acting user at the destination, which will be ``root``. | ||||
|  | ||||
|           You therefore need to ensure that the documents you want to copy into | ||||
|           the container are readable by everyone (``chmod o+r file.pdf``) before | ||||
|           copying them. | ||||
|  | ||||
|  | ||||
| .. _Docker: https://www.docker.com/ | ||||
| .. _docker-compose: https://docs.docker.com/compose/install/ | ||||
| .. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes | ||||
| .. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- | ||||
|  | ||||
| .. [#compose] You of course don't have to use docker-compose, but it | ||||
|    simplifies deployment immensely. If you know your way around Docker, feel | ||||
|    free to tinker around without using compose! | ||||
|  | ||||
|  | ||||
| .. _making-things-a-little-more-permanent: | ||||
|  | ||||
| Making Things a Little more Permanent | ||||
| @@ -126,5 +284,9 @@ Making Things a Little more Permanent | ||||
| Once you've tested things and are happy with the work flow, you can automate the | ||||
| process of starting the webserver and consumer automatically.  If you're running | ||||
| on a bare metal system that's using Systemd, you can use the service unit files | ||||
| in the ``scripts`` directory to set this up.  If you're on a SysV or other | ||||
| startup system (like the Vagrant box), then you're currently on your own. | ||||
| in the ``scripts`` directory to set this up.  If you're on another startup | ||||
| system or are using a Vagrant box, then you're currently on your own. If you are | ||||
| using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to | ||||
| have the containers automatically start with the Docker daemon. | ||||
|  | ||||
| .. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart | ||||
|   | ||||
| @@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly, | ||||
| database, encrypted PDFs and all, and later create it all again simply by | ||||
| running the consumer again and dumping all of these files into | ||||
| ``CONSUMPTION_DIR``. | ||||
|  | ||||
|  | ||||
| .. _utilities-retagger: | ||||
|  | ||||
| The Re-tagger | ||||
| ------------- | ||||
|  | ||||
| Say you've imported a few hundred documents and now want to introduce a tag | ||||
| and apply its matching to all of the currently-imported docs.  This problem is | ||||
| common enough that there's a tool for it. | ||||
|  | ||||
|  | ||||
| .. _utilities-retagger-howto: | ||||
|  | ||||
| How to Use It | ||||
| ............. | ||||
|  | ||||
| This too is done via the ``manage.py`` script: | ||||
|  | ||||
| .. code:: bash | ||||
|  | ||||
|     $ /path/to/paperless/src/manage.py document_retagger | ||||
|  | ||||
| That's it.  It'll loop over all of the documents in your database and attempt | ||||
| to match all of your tags to them.  If one matches, it'll be applied.  And | ||||
| don't worry, you can run this as often as you like, it won't double-tag | ||||
| a document. | ||||
|   | ||||
							
								
								
									
										74
									
								
								scripts/docker-entrypoint.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								scripts/docker-entrypoint.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,74 @@ | ||||
| #!/bin/bash | ||||
| set -e | ||||
|  | ||||
| # Source: https://github.com/sameersbn/docker-gitlab/ | ||||
| # Remap the UID/GID of the in-container `paperless` account to the values | ||||
| # given in USERMAP_UID / USERMAP_GID. Unset values fall back to the IDs the | ||||
| # account currently has; if only USERMAP_UID is set it is used as the GID too. | ||||
| map_uidgid() { | ||||
|     USERMAP_ORIG_UID=$(id -u paperless) | ||||
|     # Must be stored as USERMAP_ORIG_GID: it is both the GID fallback and the | ||||
|     # baseline for the change detection below. | ||||
|     USERMAP_ORIG_GID=$(id -g paperless) | ||||
|     USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}} | ||||
|     USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} | ||||
|     if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then | ||||
|         echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" | ||||
|         groupmod -g ${USERMAP_GID} paperless | ||||
|         # The pattern matches the *new* GID; this presumably relies on groupmod | ||||
|         # having already updated the account's primary GID in /etc/passwd. | ||||
|         sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # Give the `paperless` group access to the consumption directory and hand the | ||||
| # whole application tree over to the `paperless` user and group. | ||||
| set_permissions() { | ||||
|     local consume_dir="$PAPERLESS_CONSUME" | ||||
|     local app_dir=/usr/src/paperless | ||||
|  | ||||
|     # Consumption directory: group ownership plus group search permission | ||||
|     chgrp paperless "$consume_dir" | ||||
|     chmod g+x "$consume_dir" | ||||
|  | ||||
|     # Application directory: recursive ownership, symlinks themselves included | ||||
|     chown -Rh paperless:paperless "$app_dir" | ||||
| } | ||||
|  | ||||
| # One-time container setup: apply the requested UID/GID mapping first, then | ||||
| # fix ownership/permissions so they are set with the final IDs. | ||||
| initialize() { | ||||
|     map_uidgid | ||||
|     set_permissions | ||||
| } | ||||
|  | ||||
| # Install additional tesseract language packs. | ||||
| # | ||||
| #   $1  whitespace-separated list of language codes; each code is mapped to | ||||
| #       the package name "tesseract-ocr-<code>". | ||||
| # | ||||
| # Packages that are already installed, or that apt does not know about, are | ||||
| # skipped silently. | ||||
| install_languages() { | ||||
|     local langs="$1" | ||||
|     read -ra langs <<<"$langs" | ||||
|  | ||||
|     # Check that it is not empty | ||||
|     if [ ${#langs[@]} -eq 0 ]; then | ||||
|         return | ||||
|     fi | ||||
|  | ||||
|     # Update apt-lists | ||||
|     apt-get update | ||||
|  | ||||
|     # Loop over languages to be installed | ||||
|     for lang in "${langs[@]}"; do | ||||
|         pkg="tesseract-ocr-$lang" | ||||
|  | ||||
|         # Redirect stdout first, then point stderr at it, so that both | ||||
|         # streams of the probe commands are discarded. | ||||
|         if dpkg -s "$pkg" > /dev/null 2>&1; then | ||||
|             continue | ||||
|         fi | ||||
|  | ||||
|         if ! apt-cache show "$pkg" > /dev/null 2>&1; then | ||||
|             continue | ||||
|         fi | ||||
|  | ||||
|         # -y: the entrypoint runs unattended, nobody can answer a prompt | ||||
|         apt-get install -y "$pkg" | ||||
|     done | ||||
|  | ||||
|     # Remove apt lists | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
| } | ||||
|  | ||||
|  | ||||
| # An absolute path as first argument means "run this program as-is"; | ||||
| # anything else is treated as a manage.py sub-command. | ||||
| if [[ "$1" == "/"* ]]; then | ||||
|     exec "$@" | ||||
| fi | ||||
|  | ||||
| initialize | ||||
|  | ||||
| # Install additional languages if specified | ||||
| if [[ -n "$PAPERLESS_OCR_LANGUAGES" ]]; then | ||||
|     install_languages "$PAPERLESS_OCR_LANGUAGES" | ||||
| fi | ||||
|  | ||||
| # Run manage.py as the paperless user, keeping the environment | ||||
| exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" | ||||
|  | ||||
| @@ -1,21 +1,23 @@ | ||||
| import datetime | ||||
| import glob | ||||
| import tempfile | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import itertools | ||||
|  | ||||
| import langdetect | ||||
| import os | ||||
| import random | ||||
| import re | ||||
| import subprocess | ||||
|  | ||||
| import pyocr | ||||
| import shutil | ||||
|  | ||||
| from PIL import Image | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from django.template.defaultfilters import slugify | ||||
| from pyocr.tesseract import TesseractError | ||||
|  | ||||
| from logger.models import Log | ||||
| from paperless.db import GnuPG | ||||
| @@ -27,6 +29,12 @@ from .languages import ISO639 | ||||
| def image_to_string(args): | ||||
|     self, png, lang = args | ||||
|     with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|         if self.OCR.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = self.OCR.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except TesseractError: | ||||
|                 pass | ||||
|         return self.OCR.image_to_string(f, lang=lang) | ||||
|  | ||||
|  | ||||
| @@ -111,34 +119,41 @@ class Consumer(object): | ||||
|  | ||||
|             Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) | ||||
|  | ||||
|             pngs = self._get_greyscale(doc) | ||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||
|             pngs = self._get_greyscale(tempdir, doc) | ||||
|  | ||||
|             try: | ||||
|                 text = self._get_ocr(pngs) | ||||
|                 self._store(text, doc) | ||||
|             except OCRError: | ||||
|                 self._ignore.append(doc) | ||||
|                 Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 continue | ||||
|             else: | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 self._cleanup_doc(doc) | ||||
|  | ||||
|             self._store(text, doc) | ||||
|             self._cleanup(pngs, doc) | ||||
|  | ||||
|     def _get_greyscale(self, doc): | ||||
|     def _get_greyscale(self, tempdir, doc): | ||||
|  | ||||
|         Log.debug( | ||||
|             "Generating greyscale image from {}".format(doc), | ||||
|             Log.COMPONENT_CONSUMER | ||||
|         ) | ||||
|  | ||||
|         i = random.randint(1000000, 9999999) | ||||
|         png = os.path.join(self.SCRATCH, "{}.png".format(i)) | ||||
|         png = os.path.join(tempdir, "convert-%04d.jpg") | ||||
|  | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", doc, png | ||||
|         )).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|         pngs = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.startswith("convert"): | ||||
|                 pngs.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pngs)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _guess_language(text): | ||||
| @@ -271,11 +286,7 @@ class Consumer(object): | ||||
|     def _store(self, text, doc): | ||||
|  | ||||
|         sender, title, tags, file_type = self._guess_attributes_from_name(doc) | ||||
|         tags = list(tags) | ||||
|  | ||||
|         lower_text = text.lower() | ||||
|         relevant_tags = set( | ||||
|             [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) | ||||
|         relevant_tags = set(list(Tag.match_all(text)) + list(tags)) | ||||
|  | ||||
|         stats = os.stat(doc) | ||||
|  | ||||
| @@ -303,14 +314,15 @@ class Consumer(object): | ||||
|                 Log.debug("Encrypting", Log.COMPONENT_CONSUMER) | ||||
|                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|     def _cleanup(self, pngs, doc): | ||||
|     @staticmethod | ||||
|     def _cleanup_tempdir(d): | ||||
|         Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER) | ||||
|         shutil.rmtree(d) | ||||
|  | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(png_glob)) + [doc]: | ||||
|             Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER) | ||||
|             os.unlink(f) | ||||
|     @staticmethod | ||||
|     def _cleanup_doc(doc): | ||||
|         Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) | ||||
|         os.unlink(doc) | ||||
|  | ||||
|     def _is_ready(self, doc): | ||||
|         """ | ||||
|   | ||||
| @@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand): | ||||
|         self.verbosity = options["verbosity"] | ||||
|  | ||||
|         for document in Document.objects.all(): | ||||
|  | ||||
|             tags = Tag.objects.exclude( | ||||
|                 pk__in=document.tags.values_list("pk", flat=True)) | ||||
|             for tag in tags: | ||||
|                 if tag.matches(document.content): | ||||
|  | ||||
|             for tag in Tag.match_all(document.content, tags): | ||||
|                 print('Tagging {} with "{}"'.format(document, tag)) | ||||
|                 document.tags.add(tag) | ||||
|   | ||||
							
								
								
									
										23
									
								
								src/documents/management/commands/loaddata_stdin.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/documents/management/commands/loaddata_stdin.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| """ | ||||
| Source: | ||||
|     https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 | ||||
|  | ||||
| License: | ||||
|     MIT | ||||
|     Copyright (c) 2016 Baptiste Mispelon | ||||
| """ | ||||
| import sys | ||||
|  | ||||
| from django.core.management.commands.loaddata import Command as LoadDataCommand | ||||
|  | ||||
|  | ||||
class Command(LoadDataCommand):
    """``loaddata`` variant that accepts ``-`` to read a JSON fixture from stdin.

    Every fixture label other than ``-`` is passed through to the parent
    ``loaddata`` command unchanged.
    """

    def parse_name(self, fixture_name):
        """Return ``(name, serialization format, compression format)``.

        ``-`` is reported as a JSON fixture using the pseudo compression
        format ``stdin``. Any other name is parsed by the parent class;
        previously this method fell off the end and returned ``None`` for
        such names, which broke loading of ordinary fixtures.
        """
        # The registered "opener" ignores both arguments it receives and
        # simply hands back the process's standard input stream.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """Return the fixture candidates for ``fixture_label``.

        ``-`` short-circuits the lookup with a single placeholder entry;
        all other labels use the parent implementation.
        """
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
| @@ -86,28 +86,40 @@ class Tag(SluggedModel): | ||||
|         return "{}: \"{}\" ({})".format( | ||||
|             self.name, self.match, self.get_matching_algorithm_display()) | ||||
|  | ||||
|     @classmethod | ||||
|     def match_all(cls, text, tags=None): | ||||
|  | ||||
|         if tags is None: | ||||
|             tags = cls.objects.all() | ||||
|  | ||||
|         text = text.lower() | ||||
|         for tag in tags: | ||||
|             if tag.matches(text): | ||||
|                 yield tag | ||||
|  | ||||
|     def matches(self, text): | ||||
|  | ||||
|         # Check that match is not empty | ||||
|         if self.match.strip() == "": | ||||
|             return False | ||||
|  | ||||
|         if self.matching_algorithm == self.MATCH_ALL: | ||||
|             for word in self.match.split(" "): | ||||
|                 if word not in text: | ||||
|                 if not re.search(r"\b{}\b".format(word), text): | ||||
|                     return False | ||||
|             return True | ||||
|  | ||||
|         if self.matching_algorithm == self.MATCH_ANY: | ||||
|             for word in self.match.split(" "): | ||||
|                 if word in text: | ||||
|                 if re.search(r"\b{}\b".format(word), text): | ||||
|                     return True | ||||
|             return False | ||||
|  | ||||
|         if self.matching_algorithm == self.MATCH_LITERAL: | ||||
|             return self.match in text | ||||
|             return bool(re.search(r"\b{}\b".format(self.match), text)) | ||||
|  | ||||
|         if self.matching_algorithm == self.MATCH_REGEX: | ||||
|             return re.search(re.compile(self.match), text) | ||||
|             return bool(re.search(re.compile(self.match), text)) | ||||
|  | ||||
|         raise NotImplementedError("Unsupported matching algorithm") | ||||
|  | ||||
|   | ||||
							
								
								
									
										120
									
								
								src/documents/tests/test_tags.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										120
									
								
								src/documents/tests/test_tags.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,120 @@ | ||||
| from django.test import TestCase | ||||
|  | ||||
| from ..models import Tag | ||||
|  | ||||
|  | ||||
class TestTagMatching(TestCase):
    """Exercise ``Tag.matches()`` for each of the matching algorithms."""

    def _assert_matching(self, tag, matching=(), not_matching=()):
        """Check ``tag.matches()`` against texts that must and must not match.

        The text under test is passed as the assertion message so a failure
        identifies the offending input.
        """
        for text in matching:
            self.assertTrue(tag.matches(text), msg=text)
        for text in not_matching:
            self.assertFalse(tag.matches(text), msg=text)

    def test_match_all(self):
        """MATCH_ALL requires every word to be present as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ALL
        )
        self._assert_matching(
            t,
            matching=(
                "I have alpha, charlie, and gamma in me",
            ),
            not_matching=(
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
                "I have bravo in me",
            ),
        )

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ALL
        )
        self._assert_matching(
            t,
            matching=(
                "I have 12 34, and 56 in me",
            ),
            not_matching=(
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 120, 34, and 56 in me",
                "I have 123456 in me",
                "I have 01234567 in me",
            ),
        )

    def test_match_any(self):
        """MATCH_ANY requires at least one word to be present as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ANY
        )
        self._assert_matching(
            t,
            matching=(
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
            ),
            not_matching=(
                "I have alphas in me",
                "I have bravo in me",
            ),
        )

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ANY
        )
        self._assert_matching(
            t,
            matching=(
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 12 34, and 56 in me",
                "I have 120, 34, and 560 in me",
            ),
            not_matching=(
                "I have 120, 340, and 560 in me",
                "I have 123456 in me",
                "I have 01234567 in me",
            ),
        )

    def test_match_literal(self):
        """MATCH_LITERAL requires the whole match string to appear verbatim."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self._assert_matching(
            t,
            matching=(
                "I have 'alpha charlie gamma' in me",
            ),
            not_matching=(
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alpha, charlie, and gamma in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
                "I have bravo in me",
            ),
        )

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self._assert_matching(
            t,
            matching=(
                "I have 12 34 56 in me",
            ),
            not_matching=(
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 12 34, and 56 in me",
                "I have 120, 34, and 560 in me",
                "I have 120, 340, and 560 in me",
                "I have 123456 in me",
                "I have 01234567 in me",
            ),
        )

    def test_match_regex(self):
        """MATCH_REGEX treats the match string as a regular expression."""

        t = Tag.objects.create(
            name="Test 0",
            # Raw string: "\w" is not a valid escape in a normal literal.
            match=r"alpha\w+gamma",
            matching_algorithm=Tag.MATCH_REGEX
        )
        self._assert_matching(
            t,
            matching=(
                "I have alpha_and_gamma in me",
                "I have alphas_and_gamma in me",
            ),
            not_matching=(
                "I have alpha in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alpha,and,gamma in me",
                "I have alpha and gamma in me",
                "I have alpha, charlie, and gamma in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
            ),
        )
|  | ||||
| @@ -23,4 +23,8 @@ class Migration(migrations.Migration): | ||||
|                 ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), | ||||
|             ], | ||||
|         ), | ||||
|         migrations.AlterModelOptions( | ||||
|             name='log', | ||||
|             options={'ordering': ('-time',)}, | ||||
|         ), | ||||
|     ] | ||||
|   | ||||
| @@ -27,7 +27,10 @@ class Log(models.Model): | ||||
|     component = models.PositiveIntegerField(choices=COMPONENTS) | ||||
|  | ||||
|     class Meta(object): | ||||
|         ordering = ("time",) | ||||
|         ordering = ("-time",) | ||||
|  | ||||
|     def __str__(self): | ||||
|         return self.message | ||||
|  | ||||
|     @classmethod | ||||
|     def error(cls, message, component): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn