Merge branch 'dev'

This commit is contained in:
Jonas Winkler 2020-11-18 23:02:48 +01:00
commit 8f5809d1fc
34 changed files with 480 additions and 346 deletions

View File

@ -1,82 +0,0 @@
# Two-stage build for paperless:
#   1. "frontend" compiles the Angular UI with Node.
#   2. The final Ubuntu-based stage installs the system and Python
#      dependencies, copies the application plus the compiled UI, and starts
#      everything through supervisord.
###############################################################################
### Front end ###
###############################################################################
FROM node:current AS frontend
WORKDIR /usr/src/paperless/src-ui/
# Copy only the package manifests first so the "npm install" layer can be
# reused from the build cache until they change.
COPY src-ui/package* ./
RUN npm install
COPY src-ui .
# Production build without hashed file names or source maps, written to
# dist/paperless-ui (picked up by the COPY --from=frontend below).
RUN node_modules/.bin/ng build --prod --output-hashing none --sourceMap=false --output-path dist/paperless-ui
###############################################################################
### Back end ###
###############################################################################
FROM ubuntu:20.04
WORKDIR /usr/src/paperless/
# Pipfile and Pipfile.lock are consumed by "pipenv install" in the RUN below.
COPY Pipfile* ./
#Dependencies
# Everything happens in a single RUN (one layer): install the system
# packages, install the Python dependencies system-wide from the lock file
# (pipenv --system --deploy), then purge the build-only packages
# (build-essential, python3-pip, python3-dev), remove the apt package lists
# and create the supervisord log/run directories.
RUN apt-get update \
&& DEBIAN_FRONTEND="noninteractive" apt-get -y --no-install-recommends install \
build-essential \
curl \
ghostscript \
gnupg \
imagemagick \
libmagic-dev \
libpoppler-cpp-dev \
libpq-dev \
optipng \
python3 \
python3-dev \
python3-pip \
sudo \
tesseract-ocr \
tesseract-ocr-eng \
tesseract-ocr-deu \
tesseract-ocr-fra \
tesseract-ocr-ita \
tesseract-ocr-spa \
tzdata \
unpaper \
&& pip3 install --upgrade pipenv supervisor setuptools \
&& pipenv install --system --deploy \
&& pipenv --clear \
&& apt-get -y purge build-essential python3-pip python3-dev \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir /var/log/supervisord /var/run/supervisord
# copy scripts
# this fixes issues with imagemagick and PDF
COPY docker/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
COPY docker/gunicorn.conf.py ./
COPY docker/supervisord.conf /etc/supervisord.conf
COPY docker/docker-entrypoint.sh /sbin/docker-entrypoint.sh
# copy app
COPY src/ ./src/
# Bring in the UI compiled by the "frontend" stage above.
COPY --from=frontend /usr/src/paperless/src-ui/dist/paperless-ui/ ./src/documents/static/frontend/
# add users, setup scripts
# Creates group and user "paperless" (gid/uid 1000) with the working directory
# as home, hands the whole working directory over to that user and makes the
# entrypoint script executable.
RUN addgroup --gid 1000 paperless \
&& useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \
&& chown -R paperless:paperless . \
&& chmod 755 /sbin/docker-entrypoint.sh
WORKDIR /usr/src/paperless/src/
# Collect the Django static files as the paperless user (sudo -u), with HOME
# set to that user's home (-H) and the current environment preserved (-E).
# --clear removes previously collected files, --no-input suppresses prompts.
RUN sudo -HEu paperless python3 manage.py collectstatic --clear --no-input
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/usr/src/paperless/consume", "/usr/src/paperless/export"]
# NOTE(review): no USER instruction is set, so the entrypoint starts as root;
# presumably docker-entrypoint.sh drops privileges to "paperless" - confirm.
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
# Default command handed to the entrypoint: run supervisord with the config
# copied above.
CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisord.conf"]
LABEL maintainer="Jonas Winkler <dev@jpwinkler.de>"

View File

@ -3,6 +3,11 @@ url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[[source]]
url = "https://www.piwheels.org/simple"
verify_ssl = true
name = "piwheels"
[packages]
django = "~=3.1"
pillow = "*"

51
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d6416e6844126b09200b9839a3abdcf3c24ef5cf70052b8f134d8bc804552c17"
"sha256": "abc7e5f5a8d075d4b013ceafd06ca07f57e597f053d670f73449ba210511b114"
},
"pipfile-spec": 6,
"requires": {},
@ -10,6 +10,11 @@
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
},
{
"name": "piwheels",
"url": "https://www.piwheels.org/simple",
"verify_ssl": true
}
]
},
@ -102,6 +107,7 @@
},
"filemagic": {
"hashes": [
"sha256:b2fd77411975510e28673220c4b8868ed81b5eb5906339b6f4c233b32122d7d3",
"sha256:e684359ef40820fe406f0ebc5bf8a78f89717bdb7fed688af68082d991d6dbf3"
],
"index": "pypi",
@ -142,6 +148,7 @@
"langdetect": {
"hashes": [
"sha256:363795ea005f1243c958e953245dac5d814fabdc025c9afa91588c5fa6b2fa83",
"sha256:ae53a024643df713274c297c0795dbfb5a16b329902f8e543e7b2d7d45f699e4",
"sha256:f37495e63607865e47deed08d78f7f8e58172658216ff954b2f14671bcd87740"
],
"index": "pypi",
@ -162,6 +169,7 @@
"sha256:448ebb1b3bf64c0267d6b09a7cba26b5ae61b6d2dbabff7c91b660c7eccf2bdb",
"sha256:50e86c076611212ca62e5a59f518edafe0c0730f7d9195fec718da1a5c2bb1fc",
"sha256:5734bdc0342aba9dfc6f04920988140fb41234db42381cf7ccba64169f9fe7ac",
"sha256:5ddd1dfa2be066595c1993165b4cae84b9866b12339d0c903db7f21a094324a3",
"sha256:64324f64f90a9e4ef732be0928be853eee378fd6a01be21a0a8469c4f2682c83",
"sha256:6ae6c680f3ebf1cf7ad1d7748868b39d9f900836df774c453c11c5440bc15b36",
"sha256:6d7593a705d662be5bfe24111af14763016765f43cb6923ed86223f965f52387",
@ -189,7 +197,8 @@
},
"pathtools": {
"hashes": [
"sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0"
"sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
"sha256:d77d982475e87f32b82157a43b09f0a5ef3e66c1d8f3c7eb8d2580e783cd8202"
],
"version": "==0.1.2"
},
@ -217,6 +226,7 @@
"sha256:2fb113757a369a6cdb189f8df3226e995acfed0a8919a72416626af1a0a71140",
"sha256:4b0ef2470c4979e345e4e0cc1bbac65fda11d0d7b789dbac035e4c6ce3f98adb",
"sha256:59e903ca800c8cfd1ebe482349ec7c35687b95e98cefae213e271c8c7fffa021",
"sha256:5a3342d34289715928c914ee7f389351eb37fa4857caa9297fc7948f2ed3e53d",
"sha256:5abd653a23c35d980b332bc0431d39663b1709d64142e3652890df4c9b6970f6",
"sha256:5f9403af9c790cc18411ea398a6950ee2def2a830ad0cfe6dc9122e6d528b302",
"sha256:6b4a8fd632b4ebee28282a9fef4c341835a1aa8671e2770b6f89adc8e8c2703c",
@ -274,8 +284,10 @@
"sha256:d14b140a4439d816e3b1229a4a525df917d6ea22a0771a2a78332273fd9528a4",
"sha256:d1b4ab59e02d9008efe10ceabd0b31e79519da6fb67f7d8e8977118832d0f449",
"sha256:d5227b229005a696cc67676e24c214740efd90b148de5733419ac9aaba3773da",
"sha256:d9f3a909b59ac4a3ca9beb77716f4bce627276edb039a71d4e9ec4b7548536a0",
"sha256:e1f57aa70d3f7cc6947fd88636a481638263ba04a742b4a37dd25c373e41491a",
"sha256:e74a55f6bad0e7d3968399deb50f61f4db1926acf4a6d83beaaa7df986f48b1c",
"sha256:e7f5a465c6431c0ad8d4e69603ee3306e521a09d3c6af76a16bdb62946bdddf0",
"sha256:e82aba2188b9ba309fd8e271702bd0d0fc9148ae3150532bbb474f4590039ffb",
"sha256:ee69dad2c7155756ad114c02db06002f4cded41132cc51378e57aad79cc8e4f4",
"sha256:f5ab93a2cb2d8338b1674be43b442a7f544a0971da062a5da774ed40587f18f5"
@ -285,7 +297,8 @@
},
"pyocr": {
"hashes": [
"sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179"
"sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179",
"sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301"
],
"index": "pypi",
"version": "==0.7.2"
@ -316,7 +329,10 @@
},
"python-levenshtein": {
"hashes": [
"sha256:033a11de5e3d19ea25c9302d11224e1a1898fe5abd23c61c7c360c25195e3eb1"
"sha256:033a11de5e3d19ea25c9302d11224e1a1898fe5abd23c61c7c360c25195e3eb1",
"sha256:15e26882728c29ccdf74cfc6ac4b49fc22c08b44d152348cb0eb1ec4f3dbf9df",
"sha256:3df5e5eb144570ecf5ad38864a2393068798328c7f05e7b167a49391d36a2db1",
"sha256:7f049b3ddc4b525bd469febafb98bf5202f789b722e0e4ccbec2ffbe8c07d7b4"
],
"index": "pypi",
"version": "==0.12.0"
@ -331,6 +347,7 @@
"redis": {
"hashes": [
"sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2",
"sha256:3f1c7f166fa6c803613eec222224848a80f5e5b9c6af3aa82461506643034a7a",
"sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"
],
"index": "pypi",
@ -360,7 +377,9 @@
"sha256:749078d1eb89484db5f34b4012092ad14b327944ee7f1c4f74d6279a6e4d1884",
"sha256:7913bd25f4ab274ba37bc97ad0e21c31004224ccb02765ad984eef43e04acc6c",
"sha256:7a25fcbeae08f96a754b45bdc050e1fb94b95cab046bf56b016c25e9ab127b3e",
"sha256:80ef188c0e47a6c964eed71c55a73c245f8daf9f0a4a9d804e91275afb468ca4",
"sha256:83d6b356e116ca119db8e7c6fc2983289d87b27b3fac238cfe5dca529d884562",
"sha256:842fb985b2b99a82a2b145b6bbd588c5f5cfd83693402920fcb985d515794666",
"sha256:8b882a78c320478b12ff024e81dc7d43c1462aa4a3341c754ee65d857a521f85",
"sha256:8f6a2229e8ad946e36815f2a03386bb8353d4bde368fdf8ca5f0cb97264d3b5c",
"sha256:9801c4c1d9ae6a70aeb2128e5b4b68c45d4f0af0d1535500884d644fa9b768c6",
@ -384,6 +403,7 @@
},
"scikit-learn": {
"hashes": [
"sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
"sha256:0a127cc70990d4c15b1019680bfedc7fec6c23d14d3719fdf9b64b22d37cdeca",
"sha256:0d39748e7c9669ba648acf40fb3ce96b8a07b240db6888563a7cb76e05e0d9cc",
"sha256:1b8a391de95f6285a2f9adffb7db0892718950954b7149a70c783dc848f104ea",
@ -423,6 +443,7 @@
"sha256:9ad4fcddcbf5dc67619379782e6aeef41218a79e17979aaed01ed099876c0e62",
"sha256:a254b98dbcc744c723a838c03b74a8a34c0558c9ac5c86d5561703362231107d",
"sha256:b03c4338d6d3d299e8ca494194c0ae4f611548da59e3c038813f1a43976cb437",
"sha256:b5e9d3e4474644915809d6aa1416ff20430a3ed9ae723a5d295da5ddb24985e2",
"sha256:cc1f78ebc982cd0602c9a7615d878396bec94908db67d4ecddca864d049112f2",
"sha256:d6d25c41a009e3c6b7e757338948d0076ee1dd1770d1c09ec131f11946883c54",
"sha256:d84cadd7d7998433334c99fa55bcba0d8b4aeff0edb123b2a1dfcface538e474",
@ -468,6 +489,7 @@
},
"watchdog": {
"hashes": [
"sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8",
"sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04"
],
"index": "pypi",
@ -561,6 +583,7 @@
"sha256:29a6272fec10623fcbe158fdf9abc7a5fa032048ac1d8631f14b50fbfc10d17f",
"sha256:2b31f46bf7b31e6aa690d4c7a3d51bb262438c6dcb0d528adde446531d0d3bb7",
"sha256:2d43af2be93ffbad25dd959899b5b809618a496926146ce98ee0b23683f8c51c",
"sha256:3188a7dfd96f734a7498f37cde6598b1e9c084f1ca68bc1aa04e88db31168ab6",
"sha256:381ead10b9b9af5f64646cd27107fb27b614ee7040bb1226f9c07ba96625cbb5",
"sha256:47a11bdbd8ada9b7ee628596f9d97fbd3851bd9999d398e9436bd67376dbece7",
"sha256:4d6a42744139a7fa5b46a264874a781e8694bb32f1d76d8137b68138686f1729",
@ -586,7 +609,8 @@
"sha256:c851b35fc078389bc16b915a0a7c1d5923e12e2c5aeec58c52f4aa8085ac8237",
"sha256:cb7df71de0af56000115eafd000b867d1261f786b5eebd88a0ca6360cccfaca7",
"sha256:cedb2f9e1f990918ea061f28a0f0077a07702e3819602d3507e2ff98c8d20636",
"sha256:e8caf961e1b1a945db76f1b5fa9c91498d15f545ac0ababbe575cfab185d3bd8"
"sha256:e8caf961e1b1a945db76f1b5fa9c91498d15f545ac0ababbe575cfab185d3bd8",
"sha256:ef221855191457fffeb909d5787d1807800ab4d0111f089e6c93ee68f577634d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==5.3"
@ -608,6 +632,7 @@
},
"docopt": {
"hashes": [
"sha256:15fde8252aa9f2804171014d50d069ffbf42c7a50b7d74bcbb82bfd5700fcfc2",
"sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"
],
"version": "==0.6.2"
@ -638,11 +663,11 @@
},
"faker": {
"hashes": [
"sha256:6afc461ab3f779c9c16e299fc731d775e39ea7e8e063b3053ee359ae198a15ca",
"sha256:ce1c38823eb0f927567cde5bf2e7c8ca565c7a70316139342050ce2ca74b4026"
"sha256:4d038ba51ae5e0a956d79cadd684d856e5750bfd608b61dad1807f8f08b1da49",
"sha256:f260f0375a44cd1e1a735c9b8c9b914304f607b5eef431d20e098c7c2f5b50a6"
],
"markers": "python_version >= '3.5'",
"version": "==4.14.2"
"version": "==4.16.0"
},
"filelock": {
"hashes": [
@ -653,6 +678,7 @@
},
"idna": {
"hashes": [
"sha256:4a57a6379512ade94fa99e2fa46d3cd0f2f553040548d0e2958c6ed90ee48226",
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
],
@ -670,12 +696,14 @@
"iniconfig": {
"hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
"sha256:8647b85c03813b8680f4ae9c9db2fd7293f8591ea536a10d73d90f6eb4b10aac",
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
],
"version": "==1.1.1"
},
"jinja2": {
"hashes": [
"sha256:3f172970d5670703bd3812e8ca6459a9a7e069fa8e51b40195f83c81db191ec4",
"sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
"sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
],
@ -689,8 +717,10 @@
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
"sha256:19536834abffb3fa155017053c607cb835b2ecc6a3a2554a88043d991dffb736",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:3d61f15e39611aacd91b7e71d903787da86d9e80896e683c0103fced9add7834",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
@ -700,6 +730,7 @@
"sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:7952deddf24b85c88dab48f6ec366ac6e39d2761b5280f2f9594911e03fcd064",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
"sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
"sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
@ -795,6 +826,7 @@
},
"pytest-env": {
"hashes": [
"sha256:33b4030383a021924fe3f3ba5ca4311990d8b1d02ca77389c2be020c4500f96a",
"sha256:7e94956aef7f2764f3c147d216ce066bf6c42948bb9e293169b1b1c880a580c2"
],
"index": "pypi",
@ -802,6 +834,7 @@
},
"pytest-forked": {
"hashes": [
"sha256:2d1bfc93ab65a28324eb0a63503bfb500c2da6916efede7a24b43a04970fe63c",
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
],
@ -810,6 +843,7 @@
},
"pytest-sugar": {
"hashes": [
"sha256:67a55a83c7b2717ad607704d3fe9004bb6543b54017ef82f9c6590acc38c1aec",
"sha256:b1b2186b0a72aada6859bea2a5764145e3aaa2c1cfbb23c3a19b5f7b697563d3"
],
"index": "pypi",
@ -927,6 +961,7 @@
},
"termcolor": {
"hashes": [
"sha256:19b1225d03bfb56571484caaa8521d8ec6e2473ae1640c9f48a48dda49417706",
"sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
],
"version": "==1.1.0"

View File

@ -2,27 +2,25 @@
### Back end ###
###############################################################################
FROM ubuntu:20.04
FROM python:3.7-slim
WORKDIR /usr/src/paperless/
COPY Pipfile* ./
COPY requirements.txt ./
#Dependencies
RUN apt-get update \
&& DEBIAN_FRONTEND="noninteractive" apt-get -y --no-install-recommends install \
&& apt-get -y --no-install-recommends install \
build-essential \
curl \
ghostscript \
gnupg \
imagemagick \
libatlas-base-dev \
libmagic-dev \
libpoppler-cpp-dev \
libpq-dev \
optipng \
python3 \
python3-dev \
python3-pip \
sudo \
tesseract-ocr \
tesseract-ocr-eng \
@ -32,10 +30,9 @@ RUN apt-get update \
tesseract-ocr-spa \
tzdata \
unpaper \
&& pip3 install --upgrade pipenv supervisor setuptools \
&& pipenv install --system --deploy \
&& pipenv --clear \
&& apt-get -y purge build-essential python3-pip python3-dev \
&& pip3 install --upgrade supervisor setuptools \
&& pip install --no-cache-dir -r requirements.txt \
&& apt-get -y purge build-essential \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir /var/log/supervisord /var/run/supervisord

View File

@ -8,16 +8,40 @@ Administration
Making backups
##############
.. warning::
Multiple options exist for making backups of your paperless instance,
depending on how you installed paperless.
This section is not updated to paperless-ng yet, the exporter is a valid tool
for backups though.
Before making backups, make sure that paperless is not running.
So you're bored of this whole project, or you want to make a remote backup of
your files for whatever reason. This is easy to do, simply use the
:ref:`exporter <utilities-exporter>` to dump your documents and database out
into an arbitrary directory.
Options available to any installation of paperless:
* Use the :ref:`document exporter <utilities-exporter>`.
The document exporter exports all your documents, thumbnails and
metadata to a specific folder. You may import your documents into a
fresh instance of paperless again or store your documents in another
DMS with this export.
Options available to docker installations:
* Back up the docker volumes. These usually reside within
``/var/lib/docker/volumes`` on the host and you need to be root in order
to access them.
Paperless uses 3 volumes:
* ``paperless_media``: This is where your documents are stored.
* ``paperless_data``: This is where auxiliary data is stored. This
folder also contains the SQLite database, if you use it.
* ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
the database.
Options available to bare-metal and non-docker installations:
* Back up the entire paperless folder. This ensures that if your paperless instance
crashes at some point or your disk fails, you can simply copy the folder back
into place and it works.
When using PostgreSQL, you'll also have to back up the database.
.. _migrating-restoring:
@ -25,6 +49,8 @@ Restoring
=========
.. _administration-updating:
Updating paperless

View File

@ -128,6 +128,8 @@ consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.
.. _advanced-automatic_matching:
Automatic matching
==================
@ -175,8 +177,6 @@ then put the path to that script in ``paperless.conf`` with the variable name
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
``PAPERLESS_POST_CONSUME_SCRIPT``.
.. TODO HYPEREF TO CONFIG
.. important::
These scripts are executed in a **blocking** process, which means that if
@ -319,6 +319,6 @@ for use in filenames.
.. code::
PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title}
However, keep in mind that inside docker, if files get stored outside of the
predefined volumes, they will be lost after a restart of paperless.

View File

@ -96,6 +96,8 @@ paperless-ng 0.9.0
sqlite.
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
* ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
optimization. This is useful on less powerful devices.
* Many more small changes here and there. The usual stuff.

View File

@ -23,27 +23,35 @@ is
**Q:** *Will paperless-ng run on Raspberry Pi?*
**A:** The short answer is yes. The long answer is that certain parts of
**A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
The long answer is that certain parts of
Paperless will run very slowly, such as the tesseract OCR. On Raspberry Pi,
try to OCR documents before feeding them into paperless so that paperless can
reuse the text. The web interface should be a lot snappier, since it runs
in your browser and paperless has to do much less work to serve the data.
.. note::
Consider setting ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to false to speed up
the consumption process. This takes quite a bit of time on Raspberry Pi.
.. note::
Updating the :ref:`automatic matching algorithm <advanced-automatic_matching>`
takes quite a bit of time. However, the update mechanism checks if your
data has changed before doing the heavy lifting. If you experience the
algorithm taking too much cpu time, consider changing the schedule in the
admin interface to daily or weekly. You can also manually invoke the task
by changing the date and time of the next run to today/now.
The actual matching of the algorithm is fast and works on Raspberry Pi as
well as on any other device.
**Q:** *How do I install paperless-ng on Raspberry Pi?*
**A:** There is no docker image for ARM available. If you know how to build
that automatically, I'm all ears. For now, you have to grab the latest release
archive from the project page and build the image yourself. The release comes
with the front end already compiled, so you don't have to do this on the Pi.
You may encounter some issues during the build:
.. code:: shell-session
W: GPG error: http://ports.ubuntu.com/ubuntu-ports focal InRelease: At least one invalid signature was encountered.
E: The repository 'http://ports.ubuntu.com/ubuntu-ports focal InRelease' is not signed.
N: Updating from such a repository can't be done securely, and is therefore disabled by default.
N: See apt-secure(8) manpage for repository creation and user configuration details.
If this happens, look at `this thread <https://askubuntu.com/questions/1263284/>`_.
You will need to update docker to the latest version to fix this issue.

View File

@ -10,7 +10,7 @@
# This is required for processing scheduled tasks such as email fetching, index
# optimization and for training the automatic document matcher.
# Defaults to localhost:6379.
#PAPERLESS_REDIS="redis://localhost:6379"
#PAPERLESS_REDIS=redis://localhost:6379
###############################################################################
@ -22,15 +22,15 @@
# configuration for this is already done inside the docker-compose.env file.
#Set PAPERLESS_DBHOST and postgresql will be used instead of sqlite.
#PAPERLESS_DBHOST="localhost"
#PAPERLESS_DBHOST=localhost
#Adjust port if necessary
#PAPERLESS_DBPORT=
#name, user and pass all default to "paperless"
#PAPERLESS_DBNAME="paperless"
#PAPERLESS_DBUSER="paperless"
#PAPERLESS_DBPASS="paperless"
#PAPERLESS_DBNAME=paperless
#PAPERLESS_DBUSER=paperless
#PAPERLESS_DBPASS=paperless
###############################################################################
@ -40,23 +40,23 @@
# This is where your documents should go to be consumed. Make sure that it exists
# and that the user running the paperless service can read/write its contents
# before you start Paperless.
PAPERLESS_CONSUMPTION_DIR="../consume"
PAPERLESS_CONSUMPTION_DIR=../consume
# This is where paperless stores all its data (search index, sqlite database,
# classification model, etc).
#PAPERLESS_DATA_DIR="../data"
#PAPERLESS_DATA_DIR=../data
# This is where your documents and thumbnails are stored.
#PAPERLESS_MEDIA_ROOT="../media"
#PAPERLESS_MEDIA_ROOT=../media
# Override the default STATIC_ROOT here. This is where all static files
# created using "collectstatic" manager command are stored.
#PAPERLESS_STATICDIR="../static"
#PAPERLESS_STATICDIR=../static
# Override the STATIC_URL here. Unless you're hosting Paperless off a
# subdomain like /paperless/, you probably don't need to change this.
#PAPERLESS_STATIC_URL="/static/"
#PAPERLESS_STATIC_URL=/static/
# Specify a filename format for the document (directories are supported)
@ -69,7 +69,7 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# * {tags[INDEX]} If your tags are strings, select the tag by index
# Uniqueness of filenames is ensured, as an incrementing counter is attached
# to each filename.
#PAPERLESS_FILENAME_FORMAT=""
#PAPERLESS_FILENAME_FORMAT=
###############################################################################
#### Security ####
@ -77,10 +77,12 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# Controls whether django's debug mode is enabled. Disable this on production
# systems. Debug mode is disabled by default.
#PAPERLESS_DEBUG="false"
#PAPERLESS_DEBUG=false
# GnuPG encryption is deprecated and will be removed in future versions.
#
# Don't use it. It does not provide any security at all.
#
# Paperless can be instructed to attempt to encrypt your PDF files with GPG
# using the PAPERLESS_PASSPHRASE specified below. If however you're not
# concerned about encrypting these files (for example if you have disk
@ -93,13 +95,13 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# you've since changed it to a new one.
#
# The default is to not use encryption at all.
#PAPERLESS_PASSPHRASE="secret"
#PAPERLESS_PASSPHRASE=secret
# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
# public, you should change the key to something unique and verbose.
#PAPERLESS_SECRET_KEY="change-me"
#PAPERLESS_SECRET_KEY=change-me
# If you're planning on putting Paperless on the open internet, then you
@ -109,19 +111,19 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
#
# Just remember that this is a comma-separated list, so "example.com" is fine,
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
#PAPERLESS_ALLOWED_HOSTS=example.com,www.example.com
# If you decide to use the Paperless API in an ajax call, you need to add your
# servers to the list of allowed hosts that can do CORS calls. By default
# Paperless allows calls from localhost:8080, but if you'd like to change that,
# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
#PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
# To host paperless under a subpath url like example.com/paperless you set
# this value to /paperless. No trailing slash!
#
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
#PAPERLESS_FORCE_SCRIPT_NAME=""
#PAPERLESS_FORCE_SCRIPT_NAME=
###############################################################################
#### Software Tweaks ####
@ -158,14 +160,19 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# When the consumer detects a duplicate document, it will not touch the
# original document. This default behavior can be changed here.
#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
# Use optipng to optimize thumbnails. This usually reduces the size of
# thumbnails by about 20%, but uses considerable compute time during
# consumption.
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
# After a document is consumed, Paperless can trigger an arbitrary script if
# you like. This script will be passed a number of arguments for you to work
# with. The default is blank, which means nothing will be executed. For more
# information, take a look at the docs:
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
# By default, paperless will check the document text for document date information.
# Uncomment the line below to enable checking the document filename for date
@ -173,7 +180,7 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# https://dateparser.readthedocs.io/en/latest/#settings. The filename will be
# checked first, and if nothing is found, the document text will be checked
# as normal.
#PAPERLESS_FILENAME_DATE_ORDER="YMD"
#PAPERLESS_FILENAME_DATE_ORDER=YMD
# Sometimes devices won't create filenames which can be parsed properly
# by the filename parser (see
@ -243,7 +250,7 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
# By default Paperless does not OCR a document if the text can be retrieved from
# the document directly. Set to true to always OCR documents.
#PAPERLESS_OCR_ALWAYS="false"
#PAPERLESS_OCR_ALWAYS=false
###############################################################################
@ -271,7 +278,7 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
# Ghostscript
#PAPERLESS_GS_BINARY = /usr/bin/gs
#PAPERLESS_GS_BINARY=/usr/bin/gs
# Unpaper
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper

View File

@ -24,12 +24,17 @@ then
rm "$PAPERLESS_DIST" -r
fi
mkdir "$PAPERLESS_DIST"
mkdir "$PAPERLESS_DIST_APP"
mkdir "$PAPERLESS_DIST_APP/docker"
# setup dependencies.
cd "$PAPERLESS_ROOT"
pipenv clean
pipenv install --dev
pipenv lock --keep-outdated -r > "$PAPERLESS_DIST_APP/requirements.txt"
# test if the application works.
@ -44,10 +49,6 @@ make clean html
# copy stuff into place
mkdir "$PAPERLESS_DIST"
mkdir "$PAPERLESS_DIST_APP"
mkdir "$PAPERLESS_DIST_APP/docker"
# the application itself
cp "$PAPERLESS_ROOT/.env" \
@ -92,8 +93,6 @@ cd "$PAPERLESS_DIST_APP"
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
docker push "jonaswinkler/paperless-ng:$VERSION"
# works. package the app!
cd "$PAPERLESS_DIST"

23
scripts/push-release.sh Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash

# Pushes the docker image "jonaswinkler/paperless-ng:VERSION" to the registry.
#
# Usage: push-release.sh VERSION
#
# This script only pushes; it does not build the image. It must be run from
# inside the paperless git checkout, and dist/paperless-ng must exist below
# the repository root (the script changes into it before pushing).

# Abort on the first failing command.
set -e

VERSION=$1

if [ -z "$VERSION" ]
then
	# Diagnostics go to stderr so they never end up in captured stdout.
	echo "Need a version string." >&2
	exit 1
fi

# source root directory of paperless
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)

# output directory
PAPERLESS_DIST="$PAPERLESS_ROOT/dist"
PAPERLESS_DIST_APP="$PAPERLESS_DIST/paperless-ng"

# With "set -e" this also stops the script if the dist directory is missing.
cd "$PAPERLESS_DIST_APP"

docker push "jonaswinkler/paperless-ng:$VERSION"

View File

@ -132,6 +132,28 @@
</a>
</li>
</ul>
<h6 class="sidebar-heading d-flex justify-content-between align-items-center px-3 mt-4 mb-1 text-muted">
<span>Misc</span>
</h6>
<ul class="nav flex-column mb-2">
<li class="nav-item">
<a class="nav-link" href="https://paperless-ng.readthedocs.io/en/latest/">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#question-circle"/>
</svg>
Documentation
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/jonaswinkler/paperless-ng">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#link"/>
</svg>
Github
</a>
</li>
</ul>
</div>
</nav>

View File

@ -1,6 +1,6 @@
<div class="row pt-3 pb-2 mb-3 border-bottom align-items-center">
<div class="row pt-3 pb-1 mb-3 border-bottom align-items-center" >
<div class="col text-truncate">
<h1 class="h2 text-truncate">{{title}}</h1>
<h1 class="h2 text-truncate" style="line-height: 1.4">{{title}}</h1>
</div>
<div class="btn-toolbar col-auto">
<ng-content></ng-content>

View File

@ -1,3 +1,7 @@
/* Text colour for log entries tagged with level 10.
   NOTE(review): presumably Python's logging.DEBUG (10) - confirm against the
   component that renders the log. */
.log-entry-10 {
color: lightslategray !important;
}
/* Text colour for log entries tagged with level 30.
   NOTE(review): presumably Python's logging.WARNING (30) - confirm. */
.log-entry-30 {
color: yellow !important;
}

View File

@ -3,7 +3,6 @@ import hashlib
import logging
import os
import re
import uuid
from django.conf import settings
from django.db import transaction
@ -12,6 +11,7 @@ from django.utils import timezone
from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .signals import (
@ -24,12 +24,10 @@ class ConsumerError(Exception):
pass
class Consumer:
class Consumer(LoggingMixin):
def __init__(self):
self.logger = logging.getLogger(__name__)
self.logging_group = None
super().__init__()
self.path = None
self.filename = None
self.override_title = None
@ -74,11 +72,6 @@ class Consumer:
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def try_consume_file(self,
path,
override_filename=None,
@ -100,7 +93,7 @@ class Consumer:
# this is for grouping logging entries for this particular file
# together.
self.logging_group = uuid.uuid4()
self.renew_logging_group()
# Make sure that preconditions for consuming the file are met.

View File

@ -86,7 +86,7 @@ def generate_filename(document):
added_day=document.added.day if document.added else "none",
tags=tags,
)
except (ValueError, KeyError, IndexError) as e:
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
# Always append the primary key to guarantee uniqueness of filename

View File

@ -32,6 +32,9 @@ class UploadForm(forms.Form):
t = int(mktime(datetime.now().timetuple()))
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
# TODO: don't just append pdf. This is here for that weird regex check at the start of the consumer.
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
f.write(document)

View File

@ -1,4 +1,5 @@
import logging
import uuid
class PaperlessHandler(logging.Handler):
@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
kwargs["group"] = record.group
Log.objects.create(**kwargs)
class LoggingMixin:
    """Mixin that gives a class a ``log()`` helper with grouped log records.

    Each record emitted through ``log()`` carries the current
    ``logging_group`` in its ``extra`` data under the key ``"group"``.
    """

    # Identifier attached to every record; stays None until
    # renew_logging_group() is called or a value is assigned directly.
    logging_group = None

    def renew_logging_group(self):
        """Replace the current logging group with a fresh random UUID."""
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        """Emit ``message`` on the logger named after the concrete class.

        ``level`` is the name of a logger method (for example ``"debug"`` or
        ``"warning"``); it is looked up on the logger with ``getattr``.
        """
        owner = self.__class__
        # Logger name is "<module>.<ClassName>" of the class using the mixin.
        logger_name = "{}.{}".format(owner.__module__, owner.__name__)
        emit = getattr(logging.getLogger(logger_name), level)
        emit(message, extra={"group": self.logging_group})

View File

@ -1,7 +1,4 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import os
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion

View File

@ -20,6 +20,7 @@ from django.utils import timezone
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
# TODO: isn't there a date parsing library for this?
@ -101,17 +102,17 @@ class ParseError(Exception):
pass
class DocumentParser:
class DocumentParser(LoggingMixin):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group):
super().__init__()
self.logging_group = logging_group
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__)
self.logging_group = logging_group
def get_thumbnail(self):
"""
@ -121,16 +122,19 @@ class DocumentParser:
def optimise_thumbnail(self, in_path):
out_path = os.path.join(self.tempdir, "optipng.png")
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
self.log('debug', 'Execute: ' + " ".join(args))
self.log('debug', 'Execute: ' + " ".join(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
return out_path
return out_path
else:
return in_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
@ -222,11 +226,6 @@ class DocumentParser:
return date
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir)

View File

@ -2,11 +2,10 @@ import os
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase, APIClient
from rest_framework.test import APITestCase
from documents.models import Document, Correspondent, DocumentType, Tag

View File

@ -80,6 +80,6 @@ class TestClassifier(TestCase):
self.classifier.save_classifier()
newClassifier = DocumentClassifier()
newClassifier.reload()
self.assertFalse(newClassifier.train())
new_classifier = DocumentClassifier()
new_classifier.reload()
self.assertFalse(new_classifier.train())

View File

@ -5,8 +5,6 @@ import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings
from ..consumer import Consumer, ConsumerError
@ -504,9 +502,9 @@ class TestConsumer(TestCase):
def testOverrideFilename(self):
filename = self.get_test_file()
overrideFilename = "My Bank - Statement for November.pdf"
override_filename = "My Bank - Statement for November.pdf"
document = self.consumer.try_consume_file(filename, override_filename=overrideFilename)
document = self.consumer.try_consume_file(filename, override_filename=override_filename)
self.assertEqual(document.correspondent.name, "My Bank")
self.assertEqual(document.title, "Statement for November")

View File

@ -72,11 +72,11 @@ def binaries_check(app_configs, **kwargs):
@register()
def debug_mode_check(app_configs, **kwargs):
if settings.DEBUG:
return [Warning("DEBUG mode is enabled. Disable Debug mode. "
"This is a serious security "
"issue, since it puts security overides in place which"
"are meant to be only used during development. This"
"also means that paperless will tell anyone various"
"debugging information when something goes wrong.")]
return [Warning(
"DEBUG mode is enabled. Disable Debug mode. This is a serious "
"security issue, since it puts security overides in place which "
"are meant to be only used during development. This "
"also means that paperless will tell anyone various "
"debugging information when something goes wrong.")]
else:
return []

View File

@ -257,6 +257,14 @@ LOGGING = {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_mail": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_tesseract": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
},
}
@ -312,6 +320,8 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

View File

@ -1,18 +1,7 @@
from django.contrib import admin
from django import forms
from paperless_mail.models import MailAccount, MailRule
class MailAccountForm(forms.ModelForm):
password = forms.CharField(widget=forms.PasswordInput)
class Meta:
fields = '__all__'
model = MailAccount
class MailAccountAdmin(admin.ModelAdmin):
list_display = ("name", "imap_server", "username")
@ -20,6 +9,8 @@ class MailAccountAdmin(admin.ModelAdmin):
class MailRuleAdmin(admin.ModelAdmin):
list_filter = ("account",)
list_display = ("name", "account", "folder", "action")

View File

@ -8,6 +8,7 @@ from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
MailboxFolderSelectError
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule
@ -83,72 +84,6 @@ def make_criterias(rule):
return {**criterias, **get_rule_action(rule).get_criteria()}
def handle_mail_account(account):
if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
mailbox = MailBox(account.imap_server, account.imap_port)
else:
raise ValueError("Unknown IMAP security")
total_processed_files = 0
with mailbox as M:
try:
M.login(account.username, account.password)
except Exception:
raise MailError(
f"Error while authenticating account {account.name}")
for rule in account.rules.all():
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} does not exist "
f"in account {account.name}")
criterias = make_criterias(rule)
try:
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
post_consume_messages = []
for message in messages:
try:
processed_files = handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing post-consume "
f"actions for account {account.name}")
return total_processed_files
def get_title(message, att, rule):
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
title = message.subject
@ -189,39 +124,156 @@ def get_correspondent(message, rule):
return correspondent
def handle_message(message, rule):
if not message.attachments:
return 0
def get_mailbox(server, port, security):
    """
    Construct the mailbox client matching the given IMAP security mode.

    :param server: IMAP server passed to the mailbox constructor.
    :param port: IMAP port passed to the mailbox constructor.
    :param security: one of the ``MailAccount.IMAP_SECURITY_*`` constants.
    :return: a ``MailBoxUnencrypted`` for ``IMAP_SECURITY_NONE``, a
        ``MailBox`` with ``starttls=True`` for ``IMAP_SECURITY_STARTTLS``,
        or a plain ``MailBox`` for ``IMAP_SECURITY_SSL``.
    :raises ValueError: if ``security`` is none of the known constants.
    """
    if security == MailAccount.IMAP_SECURITY_NONE:
        return MailBoxUnencrypted(server, port)
    if security == MailAccount.IMAP_SECURITY_STARTTLS:
        return MailBox(server, port, starttls=True)
    if security == MailAccount.IMAP_SECURITY_SSL:
        return MailBox(server, port)
    raise ValueError("Unknown IMAP security")
correspondent = get_correspondent(message, rule)
tag = rule.assign_tag
doc_type = rule.assign_document_type
processed_attachments = 0
class MailAccountHandler(LoggingMixin):
for att in message.attachments:
def handle_mail_account(self, account):
title = get_title(message, att, rule)
self.renew_logging_group()
# TODO: check with parsers what files types are supported
if att.content_type == 'application/pdf':
self.log('debug', f"Processing mail account {account}")
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
with open(temp_filename, 'wb') as f:
f.write(att.payload)
total_processed_files = 0
async_task(
"documents.tasks.consume_file",
path=temp_filename,
override_filename=att.filename,
override_title=title,
override_correspondent_id=correspondent.id if correspondent else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=[tag.id] if tag else None,
task_name=f"Mail: {att.filename}"
)
with get_mailbox(account.imap_server,
account.imap_port,
account.imap_security) as M:
processed_attachments += 1
try:
M.login(account.username, account.password)
except Exception:
raise MailError(
f"Error while authenticating account {account.name}")
return processed_attachments
self.log('debug', f"Account {account}: Processing "
f"{account.rules.count()} rule(s)")
for rule in account.rules.all():
self.log(
'debug',
f"Account {account}: Processing rule {rule.name}")
self.log(
'debug',
f"Rule {account}.{rule}: Selecting folder {rule.folder}")
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} does not exist "
f"in account {account.name}")
criterias = make_criterias(rule)
self.log(
'debug',
f"Rule {account}.{rule}: Searching folder with criteria "
f"{str(AND(**criterias))}")
try:
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
post_consume_messages = []
mails_processed = 0
for message in messages:
try:
processed_files = self.handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
mails_processed += 1
self.log(
'debug',
f"Rule {account}.{rule}: Processed {mails_processed} "
f"matching mail(s)")
self.log(
'debug',
f"Rule {account}.{rule}: Running mail actions on "
f"{len(post_consume_messages)} mails")
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing post-consume "
f"actions for account {account.name}")
return total_processed_files
def handle_message(self, message, rule):
    """
    Queue every PDF attachment of ``message`` for consumption.

    Each attachment whose content type is ``application/pdf`` is written
    to a temporary file in ``settings.SCRATCH_DIR`` and handed to the
    ``documents.tasks.consume_file`` task, with title, correspondent,
    document type and tag overrides derived from ``rule``.

    :param message: the mail message; its ``attachments``, ``subject``
        and ``from_`` attributes are read.
    :param rule: the mail rule that matched this message.
    :return: the number of attachments queued (0 if the message has no
        attachments or none of them is a PDF).
    """
    if not message.attachments:
        return 0

    self.log(
        'debug',
        f"Rule {rule.account}.{rule}: "
        f"Processing mail {message.subject} from {message.from_} with "
        f"{len(message.attachments)} attachment(s)")

    correspondent = get_correspondent(message, rule)
    tag = rule.assign_tag
    doc_type = rule.assign_document_type

    processed_attachments = 0

    for att in message.attachments:

        title = get_title(message, att, rule)

        # TODO: check with parsers what file types are supported
        if att.content_type == 'application/pdf':

            os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
            # mkstemp() returns an already-open OS-level file descriptor.
            # Write through it so it is closed when the block exits;
            # discarding it and reopening the path by name would leak one
            # descriptor per attachment.
            fd, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
            with os.fdopen(fd, 'wb') as f:
                f.write(att.payload)

            self.log(
                'info',
                f"Rule {rule.account}.{rule}: "
                f"Consuming attachment {att.filename} from mail "
                f"{message.subject} from {message.from_}")

            # The temporary file is handed over to the consume task by
            # path; nothing in this method removes it.
            async_task(
                "documents.tasks.consume_file",
                path=temp_filename,
                override_filename=att.filename,
                override_title=title,
                override_correspondent_id=correspondent.id if correspondent else None,
                override_document_type_id=doc_type.id if doc_type else None,
                override_tag_ids=[tag.id] if tag else None,
                task_name=f"Mail: {att.filename}"
            )

            processed_attachments += 1

    return processed_attachments

View File

@ -1,6 +1,6 @@
from django.core.management.base import BaseCommand
from paperless_mail import mail, tasks
from paperless_mail import tasks
class Command(BaseCommand):

View File

@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-18 19:40
from django.db import migrations, models
class Migration(migrations.Migration):
    # Alters two existing fields of the paperless_mail app:
    #   * MailAccount.imap_port gains a help text (still optional/nullable)
    #   * MailRule.name is declared unique

    dependencies = [
        ('paperless_mail', '0002_auto_20201117_1334'),
    ]

    operations = [
        migrations.AlterField(
            model_name='mailaccount',
            name='imap_port',
            field=models.IntegerField(
                blank=True,
                help_text='This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections.',
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='mailrule',
            name='name',
            field=models.CharField(
                max_length=256,
                unique=True,
            ),
        ),
    ]

View File

@ -1,8 +1,5 @@
from django.db import models
# Create your models here.
from django.db import models
import documents.models as document_models
@ -22,7 +19,11 @@ class MailAccount(models.Model):
imap_server = models.CharField(max_length=256)
imap_port = models.IntegerField(blank=True, null=True)
imap_port = models.IntegerField(
blank=True,
null=True,
help_text="This is usually 143 for unencrypted and STARTTLS "
"connections, and 993 for SSL connections.")
imap_security = models.PositiveIntegerField(
choices=IMAP_SECURITY_OPTIONS,
@ -71,7 +72,7 @@ class MailRule(models.Model):
(CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
)
name = models.CharField(max_length=256)
name = models.CharField(max_length=256, unique=True)
account = models.ForeignKey(
MailAccount,

View File

@ -1,13 +1,13 @@
import logging
from paperless_mail import mail
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
total_new_documents += mail.handle_mail_account(account)
total_new_documents += MailAccountHandler().handle_mail_account(account)
if total_new_documents > 0:
return f"Added {total_new_documents} document(s)."
@ -18,6 +18,6 @@ def process_mail_accounts():
def process_mail_account(name):
    """
    Process the single mail account called ``name``.

    Logs an error instead of raising when no account with that name
    exists.

    :param name: the ``name`` of the MailAccount to process.
    """
    # Django managers/querysets provide no ``find`` method, so calling it
    # raises AttributeError. filter().first() returns None when there is
    # no match, which is what the else branch below expects.
    account = MailAccount.objects.filter(name=name).first()
    if account:
        MailAccountHandler().handle_mail_account(account)
    else:
        logging.error("Unknown mail acccount: {}".format(name))

View File

@ -7,7 +7,7 @@ from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError
from documents.models import Correspondent
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail.models import MailRule, MailAccount
@ -126,6 +126,8 @@ class TestMail(TestCase):
self.reset_bogus_mailbox()
self.mail_account_handler = MailAccountHandler()
def reset_bogus_mailbox(self):
self.bogus_mailbox.messages = []
self.bogus_mailbox.messages_spam = []
@ -145,10 +147,10 @@ class TestMail(TestCase):
me_localhost = Correspondent.objects.create(name=message2.from_)
someone_else = Correspondent.objects.create(name="someone else")
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
self.assertIsNone(get_correspondent(message, rule))
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
c = get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "someone@somewhere.com")
@ -157,7 +159,7 @@ class TestMail(TestCase):
self.assertEqual(c.name, "me@localhost.com")
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
c = get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "Someone!")
@ -165,7 +167,7 @@ class TestMail(TestCase):
self.assertIsNotNone(c)
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
c = get_correspondent(message, rule)
self.assertEqual(c, someone_else)
@ -174,14 +176,15 @@ class TestMail(TestCase):
message.subject = "the message title"
att = namedtuple('Attachment', [])
att.filename = "this_is_the_file.pdf"
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
self.assertEqual(get_title(message, att, rule), "this_is_the_file")
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_SUBJECT)
rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
self.assertEqual(get_title(message, att, rule), "the message title")
def test_handle_message(self):
message = namedtuple('MailMessage', [])
message.subject = "the message title"
message.from_ = "Myself"
att = namedtuple('Attachment', [])
att.filename = "test1.pdf"
@ -200,9 +203,10 @@ class TestMail(TestCase):
message.attachments = [att, att2, att3]
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
account = MailAccount()
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
result = handle_message(message, rule)
result = self.mail_account_handler.handle_message(message, rule)
self.assertEqual(result, 2)
@ -224,7 +228,7 @@ class TestMail(TestCase):
message.attachments = []
rule = MailRule()
result = handle_message(message, rule)
result = self.mail_account_handler.handle_message(message, rule)
self.assertFalse(m.called)
self.assertEqual(result, 0)
@ -235,11 +239,13 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_handle_mail_account_delete(self):
@ -249,7 +255,7 @@ class TestMail(TestCase):
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.messages), 1)
@ -258,11 +264,13 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_handle_mail_account_move(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
@ -272,7 +280,7 @@ class TestMail(TestCase):
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
@ -281,7 +289,7 @@ class TestMail(TestCase):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
try:
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
except MailError as e:
self.assertTrue(str(e).startswith("Error while authenticating account"))
else:
@ -291,7 +299,7 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
try:
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
except MailError as e:
self.assertTrue("uuuh does not exist" in str(e))
else:
@ -299,10 +307,10 @@ class TestMail(TestCase):
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
try:
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
except MailError as e:
self.assertTrue("Error while processing post-consume actions" in str(e))
else:
@ -311,12 +319,12 @@ class TestMail(TestCase):
def test_filters(self):
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")
rule = MailRule.objects.create(name="testrule3", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 1)
@ -326,7 +334,7 @@ class TestMail(TestCase):
rule.filter_body = "electronic"
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 2)
@ -336,7 +344,7 @@ class TestMail(TestCase):
rule.filter_body = None
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 1)
self.assertEqual(self.async_task.call_count, 4)
@ -347,6 +355,6 @@ class TestMail(TestCase):
rule.filter_subject = "Invoice"
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 5)

View File

@ -1,3 +0,0 @@
from django.shortcuts import render
# Create your views here.

View File

@ -86,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
return self._text
if not settings.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
self.log("debug", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path)
return self._text
@ -98,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser):
try:
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
@ -107,7 +107,7 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log("info", "Detected language: {} (default language)".format(guessed_language))
self.log("debug", "Detected language: {} (default language)".format(guessed_language))
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
@ -115,10 +115,10 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
else:
self.log("info", "Detected language: {}".format(guessed_language))
self.log("debug", "Detected language: {}".format(guessed_language))
ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("info", "OCR completed.")
self.log("debug", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text
@ -130,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
Greyscale images are easier for Tesseract to OCR
"""
self.log("info", "Converting document {} into greyscale images...".format(self.document_path))
self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
# Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@ -148,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@ -161,11 +161,11 @@ class RasterisedDocumentParser(DocumentParser):
guess = langdetect.detect(text)
return guess
except Exception as e:
self.log('debug', "Language detection failed with: {}".format(e))
self.log('warning', "Language detection failed with: {}".format(e))
return None
def _ocr(self, imgs, lang):
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r
@ -180,7 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
self.log('info', 'Continuing ocr with default language.')
self.log('debug', 'Continuing ocr with default language.')
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages