Compare commits
407 Commits

.gitattributes (vendored, new file) — 1 line added

@@ -0,0 +1 @@
THANKS.md merge=union

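Setting `merge=union` for THANKS.md tells Git to resolve conflicting edits to that file by keeping the lines from both branches instead of reporting a conflict, which suits a contributors list that many pull requests append to. A quick way to confirm the attribute is picked up (a minimal sketch, run from the repository root):

```sh
# Ask Git which merge driver applies to THANKS.md; with the .gitattributes
# line above in place this prints "THANKS.md: merge: union".
git check-attr merge THANKS.md
```
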
.gitignore (vendored) — 2 lines changed

@@ -42,6 +42,7 @@ htmlcov/
nosetests.xml
coverage.xml
*,cover
.pytest_cache

# Translations
*.mo
@@ -68,6 +69,7 @@ db.sqlite3
.idea

# Other stuff that doesn't belong
.virtualenv
virtualenv
.vagrant
docker-compose.yml

.travis.yml — 19 lines changed

@@ -1,18 +1,25 @@
language: python

before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng

sudo: false

matrix:
  include:
    - python: 3.4
      env: TOXENV=py34
    - python: 3.5
      env: TOXENV=py35
    - python: 3.5
      env: TOXENV=pep8
    - python: 3.6

install:
  - pip install --requirement requirements.txt
  - pip install tox
  - pip install sphinx
script:
  - cd src/
  - pytest --cov
  - pycodestyle
  - sphinx-build -b html ../docs ../docs/_build -W

script: tox -c src/tox.ini
after_success:
  - coveralls

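The configuration delegates the build to tox, with one TOXENV per supported Python version plus a pep8 environment. To run roughly the same checks locally (a sketch; it assumes the packages from requirements.txt are already installed in the current environment):

```sh
pip install tox
# Run everything defined in src/tox.ini, as the CI job does:
tox -c src/tox.ini

# Or invoke the underlying steps directly, as listed in the hunk above:
cd src/
pytest --cov
pycodestyle
sphinx-build -b html ../docs ../docs/_build -W
```
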
CODE_OF_CONDUCT.md (new file) — 46 lines added

@@ -0,0 +1,46 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* Unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at code@danielquinn.org. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/

Dockerfile — 70 lines changed

@@ -1,46 +1,46 @@
FROM python:3.5
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
FROM alpine:3.7

# Install dependencies
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        sudo \
        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
    && rm -rf /var/lib/apt/lists/*

# Install python dependencies
RUN mkdir -p /usr/src/paperless
WORKDIR /usr/src/paperless
COPY requirements.txt /usr/src/paperless/
RUN pip install --no-cache-dir -r requirements.txt
LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
    contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
    Sven Fischer <git-dev@linux4tw.de>"

# Copy application
RUN mkdir -p /usr/src/paperless/src
RUN mkdir -p /usr/src/paperless/data
RUN mkdir -p /usr/src/paperless/media
COPY requirements.txt /usr/src/paperless/
COPY src/ /usr/src/paperless/src/
COPY data/ /usr/src/paperless/data/
COPY media/ /usr/src/paperless/media/

# Set consumption directory
ENV PAPERLESS_CONSUMPTION_DIR /consume
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR

# Migrate database
WORKDIR /usr/src/paperless/src
RUN ./manage.py migrate

# Create user
RUN groupadd -g 1000 paperless \
    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
    && chown -Rh paperless:paperless /usr/src/paperless

# Setup entrypoint
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
RUN chmod 755 /sbin/docker-entrypoint.sh

# Mount volumes
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
# Set export and consumption directories
ENV PAPERLESS_EXPORT_DIR=/export \
    PAPERLESS_CONSUMPTION_DIR=/consume

# Install dependencies
RUN apk --no-cache --update add \
        python3 gnupg libmagic bash shadow curl \
        sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
    apk --no-cache add --virtual .build-dependencies \
        python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
    # Install python dependencies
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
    cd /usr/src/paperless && \
    pip3 install --no-cache-dir -r requirements.txt && \
    # Remove build dependencies
    apk del .build-dependencies && \
    # Create the consumption directory
    mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
    # Create user
    addgroup -g 1000 paperless && \
    adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
    chown -Rh paperless:paperless /usr/src/paperless && \
    mkdir -p $PAPERLESS_EXPORT_DIR && \
    # Setup entrypoint
    chmod 755 /sbin/docker-entrypoint.sh

WORKDIR /usr/src/paperless/src
# Mount volumes and set Entrypoint
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume", "/export"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["--help"]

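The Alpine-based image keeps all mutable state in the declared volumes and hands control to the entrypoint script, which receives the default `--help` command when no arguments are given. A minimal sketch of building and running it; the image tag and host paths below are placeholders, not values from the repository:

```sh
# Build the image from the repository root (the tag name is arbitrary).
docker build -t paperless .

# Run it with the declared volumes mounted from the host; with no extra
# arguments the entrypoint receives the default CMD, --help.
docker run --rm \
    -v /srv/paperless/data:/usr/src/paperless/data \
    -v /srv/paperless/media:/usr/src/paperless/media \
    -v /srv/paperless/consume:/consume \
    -v /srv/paperless/export:/export \
    paperless
```
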
Pipfile (new file) — 37 lines added

@@ -0,0 +1,37 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
django = "<2.0,>=1.11"
pillow = "*"
coveralls = "*"
dateparser = "*"
django-crispy-forms = "*"
django-extensions = "*"
django-filter = "*"
django-flat-responsive = "*"
djangorestframework = "*"
factory-boy = "*"
"flake8" = "*"
filemagic = "*"
fuzzywuzzy = {extras = ["speedup"], version = "==0.15.0"}
gunicorn = "*"
langdetect = "*"
pdftotext = "*"
pyocr = "*"
python-dateutil = "*"
python-dotenv = "*"
python-gnupg = "*"
pytz = "*"
pycodestyle = "*"
pytest = "*"
pytest-cov = "*"
pytest-django = "*"
pytest-sugar = "*"
pytest-env = "*"
pytest-xdist = "*"

[dev-packages]
ipython = "*"

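With the Pipfile (and the generated Pipfile.lock that follows) in the repository, pipenv can recreate the development environment; a brief sketch, assuming pipenv itself is not yet installed:

```sh
pip install pipenv
# Create a virtualenv and install the [packages] and [dev-packages] dependencies.
pipenv install --dev
# Run tools inside that environment, e.g. the same test invocation the CI uses:
cd src/ && pipenv run pytest --cov
```
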
594
Pipfile.lock
generated
Normal file
@@ -0,0 +1,594 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "928fbb4c8952128aef7a2ed2707ce510d31d49df96cfc5f08959698edff6e67f"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"apipkg": {
|
||||
"hashes": [
|
||||
"sha256:2e38399dbe842891fe85392601aab8f40a8f4cc5a9053c326de35a1cc0297ac6",
|
||||
"sha256:65d2aa68b28e7d31233bb2ba8eb31cda40e4671f8ac2d6b241e358c9652a74b9"
|
||||
],
|
||||
"version": "==1.4"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:1c7960ccfd6a005cd9f7ba884e6316b5e430a3f1a6c37c5f87d8b43f83b54ec9",
|
||||
"sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450"
|
||||
],
|
||||
"version": "==17.4.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
|
||||
"sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
|
||||
],
|
||||
"version": "==2018.1.18"
|
||||
},
|
||||
"chardet": {
|
||||
"hashes": [
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"coverage": {
|
||||
"hashes": [
|
||||
"sha256:03481e81d558d30d230bc12999e3edffe392d244349a90f4ef9b88425fac74ba",
|
||||
"sha256:0b136648de27201056c1869a6c0d4e23f464750fd9a9ba9750b8336a244429ed",
|
||||
"sha256:104ab3934abaf5be871a583541e8829d6c19ce7bde2923b2751e0d3ca44db60a",
|
||||
"sha256:15b111b6a0f46ee1a485414a52a7ad1d703bdf984e9ed3c288a4414d3871dcbd",
|
||||
"sha256:198626739a79b09fa0a2f06e083ffd12eb55449b5f8bfdbeed1df4910b2ca640",
|
||||
"sha256:1c383d2ef13ade2acc636556fd544dba6e14fa30755f26812f54300e401f98f2",
|
||||
"sha256:28b2191e7283f4f3568962e373b47ef7f0392993bb6660d079c62bd50fe9d162",
|
||||
"sha256:2eb564bbf7816a9d68dd3369a510be3327f1c618d2357fa6b1216994c2e3d508",
|
||||
"sha256:337ded681dd2ef9ca04ef5d93cfc87e52e09db2594c296b4a0a3662cb1b41249",
|
||||
"sha256:3a2184c6d797a125dca8367878d3b9a178b6fdd05fdc2d35d758c3006a1cd694",
|
||||
"sha256:3c79a6f7b95751cdebcd9037e4d06f8d5a9b60e4ed0cd231342aa8ad7124882a",
|
||||
"sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287",
|
||||
"sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1",
|
||||
"sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000",
|
||||
"sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1",
|
||||
"sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e",
|
||||
"sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5",
|
||||
"sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062",
|
||||
"sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba",
|
||||
"sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc",
|
||||
"sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc",
|
||||
"sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99",
|
||||
"sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653",
|
||||
"sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c",
|
||||
"sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558",
|
||||
"sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f",
|
||||
"sha256:9e112fcbe0148a6fa4f0a02e8d58e94470fc6cb82a5481618fea901699bf34c4",
|
||||
"sha256:ac4fef68da01116a5c117eba4dd46f2e06847a497de5ed1d64bb99a5fda1ef91",
|
||||
"sha256:b8815995e050764c8610dbc82641807d196927c3dbed207f0a079833ffcf588d",
|
||||
"sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9",
|
||||
"sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd",
|
||||
"sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d",
|
||||
"sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
|
||||
"sha256:e4d96c07229f58cb686120f168276e434660e4358cc9cf3b0464210b04913e77",
|
||||
"sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80",
|
||||
"sha256:f8a923a85cb099422ad5a2e345fe877bbc89a8a8b23235824a93488150e45f6e"
|
||||
],
|
||||
"version": "==4.5.1"
|
||||
},
|
||||
"coveralls": {
|
||||
"hashes": [
|
||||
"sha256:32569a43c9dbc13fa8199247580a4ab182ef439f51f65bb7f8316d377a1340e8",
|
||||
"sha256:664794748d2e5673e347ec476159a9d87f43e0d2d44950e98ed0e27b98da8346"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"dateparser": {
|
||||
"hashes": [
|
||||
"sha256:940828183c937bcec530753211b70f673c0a9aab831e43273489b310538dff86",
|
||||
"sha256:b452ef8b36cd78ae86a50721794bc674aa3994e19b570f7ba92810f4e0a2ae03"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.7.0"
|
||||
},
|
||||
"django": {
|
||||
"hashes": [
|
||||
"sha256:056fe5b9e1f8f7fed9bb392919d64f6b33b3a71cfb0f170a90ee277a6ed32bc2",
|
||||
"sha256:4d398c7b02761e234bbde490aea13ea94cb539ceeb72805b72303f348682f2eb"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.11.12"
|
||||
},
|
||||
"django-crispy-forms": {
|
||||
"hashes": [
|
||||
"sha256:5952bab971110d0b86c278132dae0aa095beee8f723e625c3d3fa28888f1675f",
|
||||
"sha256:705ededc554ad8736157c666681165fe22ead2dec0d5446d65fc9dd976a5a876"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.7.2"
|
||||
},
|
||||
"django-extensions": {
|
||||
"hashes": [
|
||||
"sha256:37a543af370ee3b0721ff50442d33c357dd083e6ea06c5b94a199283b6f9e361",
|
||||
"sha256:bc9f2946c117bb2f49e5e0633eba783787790ae810ea112fe7fd82fa64de2ff1"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.0.6"
|
||||
},
|
||||
"django-filter": {
|
||||
"hashes": [
|
||||
"sha256:ea204242ea83790e1512c9d0d8255002a652a6f4986e93cee664f28955ba0c22",
|
||||
"sha256:ec0ef1ba23ef95b1620f5d481334413700fb33f45cd76d56a63f4b0b1d76976a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"django-flat-responsive": {
|
||||
"hashes": [
|
||||
"sha256:451caa2700c541b52fb7ce2d34d3d8dee9e980cf29f5463bc8a8c6256a1a6474"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.0"
|
||||
},
|
||||
"djangorestframework": {
|
||||
"hashes": [
|
||||
"sha256:b6714c3e4b0f8d524f193c91ecf5f5450092c2145439ac2769711f7eba89a9d9",
|
||||
"sha256:c375e4f95a3a64fccac412e36fb42ba36881e52313ec021ef410b40f67cddca4"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.8.2"
|
||||
},
|
||||
"docopt": {
|
||||
"hashes": [
|
||||
"sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"
|
||||
],
|
||||
"version": "==0.6.2"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a",
|
||||
"sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83"
|
||||
],
|
||||
"version": "==1.5.0"
|
||||
},
|
||||
"factory-boy": {
|
||||
"hashes": [
|
||||
"sha256:bd5a096d0f102d79b6c78cef1c8c0b650f2e1a3ecba351c735c6d2df8dabd29c",
|
||||
"sha256:be2abc8092294e4097935a29b4e37f5b9ed3e4205e2e32df215c0315b625995e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.10.0"
|
||||
},
|
||||
"faker": {
|
||||
"hashes": [
|
||||
"sha256:226d8fa67a8cf8b4007aab721f67639f130e9cfdc53a7095a2290ebb07a65c71",
|
||||
"sha256:48fed4b4a191e2b42ad20c14115f1c6d36d338b80192075d7573f0f42d7fb321"
|
||||
],
|
||||
"version": "==0.8.13"
|
||||
},
|
||||
"filemagic": {
|
||||
"hashes": [
|
||||
"sha256:e684359ef40820fe406f0ebc5bf8a78f89717bdb7fed688af68082d991d6dbf3"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.6"
|
||||
},
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0",
|
||||
"sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.5.0"
|
||||
},
|
||||
"fuzzywuzzy": {
|
||||
"hashes": [
|
||||
"sha256:3759bc6859daa0eecef8c82b45404bdac20c23f23136cf4c18b46b426bbc418f",
|
||||
"sha256:5b36957ccf836e700f4468324fa80ba208990385392e217be077d5cd738ae602"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.15.0"
|
||||
},
|
||||
"gunicorn": {
|
||||
"hashes": [
|
||||
"sha256:75af03c99389535f218cc596c7de74df4763803f7b63eb09d77e92b3956b36c6",
|
||||
"sha256:eee1169f0ca667be05db3351a0960765620dad53f53434262ff8901b68a1b622"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==19.7.1"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
|
||||
"sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
|
||||
],
|
||||
"version": "==2.6"
|
||||
},
|
||||
"langdetect": {
|
||||
"hashes": [
|
||||
"sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.0.7"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"more-itertools": {
|
||||
"hashes": [
|
||||
"sha256:0dd8f72eeab0d2c3bd489025bb2f6a1b8342f9b198f6fc37b52d15cfa4531fea",
|
||||
"sha256:11a625025954c20145b37ff6309cd54e39ca94f72f6bb9576d1195db6fa2442e",
|
||||
"sha256:c9ce7eccdcb901a2c75d326ea134e0886abfbea5f93e91cc95de9507c0816c44"
|
||||
],
|
||||
"version": "==4.1.0"
|
||||
},
|
||||
"pdftotext": {
|
||||
"hashes": [
|
||||
"sha256:0b82a9fd255a3f2bf5c861cf9e3174d3c4223e1e441bb060c611dcb4e65c6cb8"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.0.2"
|
||||
},
|
||||
"pillow": {
|
||||
"hashes": [
|
||||
"sha256:00633bc2ec40313f4daf351855e506d296ec3c553f21b66720d0f1225ca84c6f",
|
||||
"sha256:03514478db61b034fc5d38b9bf060f994e5916776e93f02e59732a8270069c61",
|
||||
"sha256:040144ba422216aecf7577484865ade90e1a475f867301c48bf9fbd7579efd76",
|
||||
"sha256:16246261ff22368e5e32ad74d5ef40403ab6895171a7fc6d34f6c17cfc0f1943",
|
||||
"sha256:1cb38df69362af35c14d4a50123b63c7ff18ec9a6d4d5da629a6f19d05e16ba8",
|
||||
"sha256:2400e122f7b21d9801798207e424cbe1f716cee7314cd0c8963fdb6fc564b5fb",
|
||||
"sha256:2ee6364b270b56a49e8b8a51488e847ab130adc1220c171bed6818c0d4742455",
|
||||
"sha256:3b4560c3891b05022c464b09121bd507c477505a4e19d703e1027a3a7c68d896",
|
||||
"sha256:41374a6afb3f44794410dab54a0d7175e6209a5a02d407119c81083f1a4c1841",
|
||||
"sha256:438a3faf5f702c8d0f80b9f9f9b8382cfa048ca6a0d64ef71b86b563b0ee0359",
|
||||
"sha256:472a124c640bde4d5468f6991c9fa7e30b723d84ac4195a77c6ab6aea30f2b9c",
|
||||
"sha256:4d32c8e3623a61d6e29ccd024066cd1ba556555abfb4cd714155020e00107e3f",
|
||||
"sha256:4d8077fd649ac40a5c4165f2c22fa2a4ad18c668e271ecb2f9d849d1017a9313",
|
||||
"sha256:62ec7ae98357fcd46002c110bb7cad15fce532776f0cbe7ca1d44c49b837d49d",
|
||||
"sha256:6c7cab6a05351cf61e469937c49dbf3cdf5ffb3eeac71f8d22dc9be3507598d8",
|
||||
"sha256:6eca36905444c4b91fe61f1b9933a47a30480738a1dd26501ff67d94fc2bc112",
|
||||
"sha256:74e2ebfd19c16c28ad43b8a28ff73b904ed382ea4875188838541751986e8c9a",
|
||||
"sha256:7673e7473a13107059377c96c563aa36f73184c29d2926882e0a0210b779a1e7",
|
||||
"sha256:81762cf5fca9a82b53b7b2d0e6b420e0f3b06167b97678c81d00470daa622d58",
|
||||
"sha256:8554bbeb4218d9cfb1917c69e6f2d2ad0be9b18a775d2162547edf992e1f5f1f",
|
||||
"sha256:9b66e968da9c4393f5795285528bc862c7b97b91251f31a08004a3c626d18114",
|
||||
"sha256:a00edb2dec0035e98ac3ec768086f0b06dfabb4ad308592ede364ef573692f55",
|
||||
"sha256:b48401752496757e95304a46213c3155bc911ac884bed2e9b275ce1c1df3e293",
|
||||
"sha256:b6cf18f9e653a8077522bb3aa753a776b117e3e0cc872c25811cfdf1459491c2",
|
||||
"sha256:bb8adab1877e9213385cbb1adc297ed8337e01872c42a30cfaa66ff8c422779c",
|
||||
"sha256:c8a4b39ba380b57a31a4b5449a9d257b1302d8bc4799767e645dcee25725efe1",
|
||||
"sha256:cee9bc75bff455d317b6947081df0824a8f118de2786dc3d74a3503fd631f4ef",
|
||||
"sha256:d0dc1313dff48af64517cbbd85e046d6b477fbe5e9d69712801f024dcb08c62b",
|
||||
"sha256:d5bf527ed83617edd1855a5c923eeeaf68bcb9ac0ceb28e3f19b575b3a424984",
|
||||
"sha256:df5863a21f91de5ecdf7d32a32f406dd9867ebb35d41033b8bd9607a21887599",
|
||||
"sha256:e39142332541ed2884c257495504858b22c078a5d781059b07aba4c3a80d7551",
|
||||
"sha256:e52e8f675ba0b2b417fa98579e7286a41a8e23871f17f4793772f5aa884fea79",
|
||||
"sha256:e6dd55d5d94b9e36929325dd0c9ab85bfde84a5fc35947c334c32af1af668944",
|
||||
"sha256:e87cc1acbebf263f308a8494272c2d42016aa33c32bf14d209c81e1f65e11868",
|
||||
"sha256:ea0091cd4100519cedfeea2c659f52291f535ac6725e2368bcf59e874f270efa",
|
||||
"sha256:eeb247f4f4d962942b3b555530b0c63b77473c7bfe475e51c6b75b7344b49ce3",
|
||||
"sha256:f0d4433adce6075efd24fc0285135248b0b50f5a58129c7e552030e04fe45c7f",
|
||||
"sha256:f1f3bd92f8e12dc22884935a73c9f94c4d9bd0d34410c456540713d6b7832b8c",
|
||||
"sha256:f42a87cbf50e905f49f053c0b1fb86c911c730624022bf44c8857244fc4cdaca",
|
||||
"sha256:f5f302db65e2e0ae96e26670818157640d3ca83a3054c290eff3631598dcf819",
|
||||
"sha256:f7634d534662bbb08976db801ba27a112aee23e597eeaf09267b4575341e45bf",
|
||||
"sha256:fdd374c02e8bb2d6468a85be50ea66e1c4ef9e809974c30d8576728473a6ed03",
|
||||
"sha256:fe6931db24716a0845bd8c8915bd096b77c2a7043e6fc59ae9ca364fe816f08b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==5.1.0"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:714306e9b9a7b24ee4c1e3ff6463d7f652cdd30f4693121b31572e2fe1fdaea3",
|
||||
"sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff",
|
||||
"sha256:d345c8fe681115900d6da8d048ba67c25df42973bda370783cd58826442dcd7c",
|
||||
"sha256:e160a7fcf25762bb60efc7e171d4497ff1d8d2d75a3d0df7a21b76821ecbf5c5"
|
||||
],
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:29c9fab495d7528e80ba1e343b958684f4ace687327e6f789a94bf3d1915f881",
|
||||
"sha256:983f77f3331356039fdd792e9220b7b8ee1aa6bd2b25f567a963ff1de5a64f6a"
|
||||
],
|
||||
"version": "==1.5.3"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:1ec08a51c901dfe44921576ed6e4c1f5b7ecbad403f871397feedb5eb8e4fa14",
|
||||
"sha256:5ff2fbcbab997895ba9ead77e1b38b3ebc2e5c3b8a6194ef918666e4c790a00e",
|
||||
"sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766",
|
||||
"sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.3.1"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f",
|
||||
"sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805"
|
||||
],
|
||||
"version": "==1.6.0"
|
||||
},
|
||||
"pyocr": {
|
||||
"hashes": [
|
||||
"sha256:9ee8b5f38dd966ca531115fc5fe4715f7fa8961a9f14cd5109c2d938c17a2043"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.5.1"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:6266f87ab64692112e5477eba395cfedda53b1933ccd29478e671e73b420c19c",
|
||||
"sha256:fae491d1874f199537fd5872b5e1f0e74a009b979df9d53d1553fd03da1703e1"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.5.0"
|
||||
},
|
||||
"pytest-cov": {
|
||||
"hashes": [
|
||||
"sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
|
||||
"sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.5.1"
|
||||
},
|
||||
"pytest-django": {
|
||||
"hashes": [
|
||||
"sha256:534505e0261cc566279032d9d887f844235342806fd63a6925689670fa1b29d7",
|
||||
"sha256:7501942093db2250a32a4e36826edfc542347bb9b26c78ed0649cdcfd49e5789"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.2.1"
|
||||
},
|
||||
"pytest-env": {
|
||||
"hashes": [
|
||||
"sha256:7e94956aef7f2764f3c147d216ce066bf6c42948bb9e293169b1b1c880a580c2"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.6.2"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805",
|
||||
"sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08"
|
||||
],
|
||||
"version": "==0.2"
|
||||
},
|
||||
"pytest-sugar": {
|
||||
"hashes": [
|
||||
"sha256:ab8cc42faf121344a4e9b13f39a51257f26f410e416c52ea11078cdd00d98a2c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.9.1"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:be2662264b035920ba740ed6efb1c816a83c8a22253df7766d129f6a7bfdbd35",
|
||||
"sha256:e8f5744acc270b3e7d915bdb4d5f471670f049b6fbd163d4cbd52203b075d30f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.22.2"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:3220490fb9741e2342e1cf29a503394fdac874bc39568288717ee67047ff29df",
|
||||
"sha256:9d8074be4c993fbe4947878ce593052f71dac82932a677d49194d8ce9778002e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.7.2"
|
||||
},
|
||||
"python-dotenv": {
|
||||
"hashes": [
|
||||
"sha256:4965ed170bf51c347a89820e8050655e9c25db3837db6602e906b6d850fad85c",
|
||||
"sha256:509736185257111613009974e666568a1b031b028b61b500ef1ab4ee780089d5"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.8.2"
|
||||
},
|
||||
"python-gnupg": {
|
||||
"hashes": [
|
||||
"sha256:38f18712b7cfdd0d769bc88a21e90138154b9be2cbffb1e7d28bc37ee73a1c47",
|
||||
"sha256:5a54a6dd25bf78d3758dd7a1864f4efd122f9ca9402101d90e3ec4483ceafb73"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.4.2"
|
||||
},
|
||||
"python-levenshtein": {
|
||||
"hashes": [
|
||||
"sha256:033a11de5e3d19ea25c9302d11224e1a1898fe5abd23c61c7c360c25195e3eb1"
|
||||
],
|
||||
"version": "==0.12.0"
|
||||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:65ae0c8101309c45772196b21b74c46b2e5d11b6275c45d251b150d5da334555",
|
||||
"sha256:c06425302f2cf668f1bba7a0a03f3c1d34d4ebeef2c72003da308b3947c7f749"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2018.4"
|
||||
},
|
||||
"regex": {
|
||||
"hashes": [
|
||||
"sha256:1b428a296531ea1642a7da48562746309c5c06471a97bd0c02dd6a82e9cecee8",
|
||||
"sha256:27d72bb42dffb32516c28d218bb054ce128afd3e18464f30837166346758af67",
|
||||
"sha256:32cf4743debee9ea12d3626ee21eae83052763740e04086304e7a74778bf58c9",
|
||||
"sha256:32f6408dbca35040bc65f9f4ae1444d5546411fde989cb71443a182dd643305e",
|
||||
"sha256:333687d9a44738c486735955993f83bd22061a416c48f5a5f9e765e90cf1b0c9",
|
||||
"sha256:35eeccf17af3b017a54d754e160af597036435c58eceae60f1dd1364ae1250c7",
|
||||
"sha256:361a1fd703a35580a4714ec28d85e29780081a4c399a99bbfb2aee695d72aedb",
|
||||
"sha256:494bed6396a20d3aa6376bdf2d3fbb1005b8f4339558d8ac7b53256755f80303",
|
||||
"sha256:5b9c0ddd5b4afa08c9074170a2ea9b34ea296e32aeea522faaaaeeeb2fe0af2e",
|
||||
"sha256:a50532f61b23d4ab9d216a6214f359dd05c911c1a1ad20986b6738a782926c1a",
|
||||
"sha256:a9243d7b359b72c681a2c32eaa7ace8d346b7e8ce09d172a683acf6853161d9c",
|
||||
"sha256:b44624a38d07d3c954c84ad302c29f7930f4bf01443beef5589e9157b14e2a29",
|
||||
"sha256:be42a601aaaeb7a317f818490a39d153952a97c40c6e9beeb2a1103616405348",
|
||||
"sha256:eee4d94b1a626490fc8170ffd788883f8c641b576e11ba9b4a29c9f6623371e0",
|
||||
"sha256:f69d1201a4750f763971ea8364ed95ee888fc128968b39d38883a72a4d005895"
|
||||
],
|
||||
"version": "==2018.2.21"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
|
||||
"sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
|
||||
],
|
||||
"version": "==2.18.4"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
|
||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
|
||||
],
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"termcolor": {
|
||||
"hashes": [
|
||||
"sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
|
||||
],
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"text-unidecode": {
|
||||
"hashes": [
|
||||
"sha256:5a1375bb2ba7968740508ae38d92e1f889a0832913cb1c447d5e2046061a396d",
|
||||
"sha256:801e38bd550b943563660a91de8d4b6fa5df60a542be9093f7abf819f86050cc"
|
||||
],
|
||||
"version": "==1.2"
|
||||
},
|
||||
"tzlocal": {
|
||||
"hashes": [
|
||||
"sha256:4ebeb848845ac898da6519b9b31879cf13b6626f7184c496037b818e238f2c4e"
|
||||
],
|
||||
"version": "==1.5.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
|
||||
"sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
|
||||
],
|
||||
"version": "==1.22"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"backcall": {
|
||||
"hashes": [
|
||||
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
|
||||
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
|
||||
],
|
||||
"version": "==0.1.0"
|
||||
},
|
||||
"decorator": {
|
||||
"hashes": [
|
||||
"sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
|
||||
"sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
|
||||
],
|
||||
"version": "==4.3.0"
|
||||
},
|
||||
"ipython": {
|
||||
"hashes": [
|
||||
"sha256:85882f97d75122ff8cdfe129215a408085a26039527110c8d4a2b8a5e45b7639",
|
||||
"sha256:a6ac981381b3f5f604b37a293369963485200e3639fb0404fa76092383c10c41"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.3.1"
|
||||
},
|
||||
"ipython-genutils": {
|
||||
"hashes": [
|
||||
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
|
||||
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
|
||||
],
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"jedi": {
|
||||
"hashes": [
|
||||
"sha256:1972f694c6bc66a2fac8718299e2ab73011d653a6d8059790c3476d2353b99ad",
|
||||
"sha256:5861f6dc0c16e024cbb0044999f9cf8013b292c05f287df06d3d991a87a4eb89"
|
||||
],
|
||||
"version": "==0.12.0"
|
||||
},
|
||||
"parso": {
|
||||
"hashes": [
|
||||
"sha256:62bd6bf7f04ab5c817704ff513ef175328676471bdef3629d4bdd46626f75551",
|
||||
"sha256:a75a304d7090d2c67bd298091c14ef9d3d560e3c53de1c239617889f61d1d307"
|
||||
],
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"pexpect": {
|
||||
"hashes": [
|
||||
"sha256:9783f4644a3ef8528a6f20374eeb434431a650c797ca6d8df0d81e30fffdfa24",
|
||||
"sha256:9f8eb3277716a01faafaba553d629d3d60a1a624c7cf45daa600d2148c30020c"
|
||||
],
|
||||
"markers": "sys_platform != 'win32'",
|
||||
"version": "==4.5.0"
|
||||
},
|
||||
"pickleshare": {
|
||||
"hashes": [
|
||||
"sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
|
||||
"sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
|
||||
],
|
||||
"version": "==0.7.4"
|
||||
},
|
||||
"prompt-toolkit": {
|
||||
"hashes": [
|
||||
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
|
||||
"sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
|
||||
"sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
|
||||
],
|
||||
"version": "==1.0.15"
|
||||
},
|
||||
"ptyprocess": {
|
||||
"hashes": [
|
||||
"sha256:e64193f0047ad603b71f202332ab5527c5e52aa7c8b609704fc28c0dc20c4365",
|
||||
"sha256:e8c43b5eee76b2083a9badde89fd1bbce6c8942d1045146e100b7b5e014f4f1a"
|
||||
],
|
||||
"version": "==0.5.2"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
|
||||
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
|
||||
],
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"simplegeneric": {
|
||||
"hashes": [
|
||||
"sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
|
||||
],
|
||||
"version": "==0.8.1"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
|
||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
|
||||
],
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"traitlets": {
|
||||
"hashes": [
|
||||
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
||||
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
|
||||
],
|
||||
"version": "==4.3.2"
|
||||
},
|
||||
"wcwidth": {
|
||||
"hashes": [
|
||||
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
|
||||
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
|
||||
],
|
||||
"version": "==0.1.7"
|
||||
}
|
||||
}
|
||||
}
|
80
README-el.md
Normal file
@@ -0,0 +1,80 @@
|
||||
*[English](README.md)*
|
||||
|
||||
# Paperless
|
||||
|
||||
[](https://paperless.readthedocs.org/) [](https://gitter.im/danielquinn/paperless) [](https://travis-ci.org/danielquinn/paperless) [](https://coveralls.io/github/danielquinn/paperless?branch=master) [](https://github.com/danielquinn/paperless/blob/master/THANKS.md)
|
||||
|
||||
Ευρετήριο και αρχείο για όλα σας τα σκαναρισμένα έγγραφα
|
||||
|
||||
Μισώ το χαρτί. Πέρα από τα περιβαλλοντικά ζητήματα, είναι ο εφιάλτης ενός τεχνικού.
|
||||
|
||||
* Δεν υπάρχει η δυνατότητα της αναζήτησης
|
||||
* Πιάνουν πολύ χώρο
|
||||
* Τα αντίγραφα ασφαλείας σημάινουν περισσότερο χαρτί
|
||||
|
||||
Τους τελευταίους μήνες μου έχει τύχει αρκετές φορές να μην μπορώ να βρω το σωστό έγγραφο. Κάποιες φορές ανακύκλωνα το έγγραφο που χρειαζόμουν (ποιος κρατάει τους λογαριασμούς του νερού για 2 χρόνια;;;) και κάποιες φορές απλά το έχανα ... επειδή έτσι είναι τα χαρτιά. Το έκανα αυτό για να κάνω την ζωή μου πιο εύκολη
|
||||
|
||||
|
||||
## Πως δουλεύει
|
||||
|
||||
Η εφαρμογή Paperless δεν ελέγχει το scanner σας, αλλά σας βοηθάει με τα αποτελέσματα του scanner σας.
|
||||
|
||||
1. Αγοράστε ένα scanner με πρόσβαση στο δίκτυο σας. Αν χρειάζεστε έμπνευση, δείτε την σελίδα με τα [προτεινόμενα scanner](https://paperless.readthedocs.io/en/latest/scanners.html).
|
||||
2. Κάντε την ρύθμιση "scan to FTP" ή κάτι παρόμοιο. Θα μπορεί να αποθηκεύει τις σκαναρισμένες εικόνες σε έναν server χωρίς να χρειάζεται να κάνετε κάτι. Φυσικά άμα το scanner σας δεν μπορεί να αποθηκεύσει κάπου τις εικόνες σας αυτόματα μπορείτε να το κάνετε χειροκίνητα. Το Paperless δεν ενδιαφέρεται πως καταλήγουν κάπου τα αρχεία.
|
||||
3. Να έχετε τον server που τρέχει το OCR script του Paperless να έχει ευρετήριο στην τοπική βάση δεδομένων.
|
||||
4. Χρησιμοποιήστε το web frontend για να επιλέξετε βάση δεδομένων και να βρείτε αυτό που θέλετε.
|
||||
5. Κατεβάστε το PDF που θέλετε/χρειάζεστε μέσω του web interface και κάντε ότι θέλετε με αυτό. Μπορείτε ακόμη να το εκτυπώσετε και να το στείλετε, σαν να ήταν το αρχικό. Στις περισσότερες περιπτώσεις κανείς δεν θα το προσέξει ή θα νοιαστεί.
|
||||
|
||||
Αυτό είναι που θα πάρετε:
|
||||
|
||||

|
||||
|
||||
|
||||
## Documentation
|
||||
|
||||
Είναι όλα διαθέσιμα εδώ [ReadTheDocs](https://paperless.readthedocs.org/).
|
||||
|
||||
|
||||
## Απαιτήσεις
|
||||
|
||||
Όλα αυτά είναι πολύ απλά, και φιλικά προς τον χρήστη, μια συλλογή με πολύτιμα εργαλεία.
|
||||
|
||||
* [ImageMagick](http://imagemagick.org/) μετατρέπει τις εικόνες σε έγχρωμες και ασπρόμαυρες.
|
||||
* [Tesseract](https://github.com/tesseract-ocr) κάνει την αναγνώρηση των χαρακτήρων.
|
||||
* [Unpaper](https://www.flameeyes.eu/projects/unpaper) despeckles and deskews the scanned image.
|
||||
* [GNU Privacy Guard](https://gnupg.org/) χρησιμοποιείται για κρυπτογράφηση στο backend.
|
||||
* [Python 3](https://python.org/) είναι η γλώσσα του project.
|
||||
* [Pillow](https://pypi.python.org/pypi/pillowfight/) Φορτώνει την εικόνα σαν αντικείμενο στην python και μπορεί να χρησιμοποιηθεί με PyOCR
|
||||
* [PyOCR](https://github.com/jflesch/pyocr) is a slick programmatic wrapper around tesseract.
|
||||
* [Django](https://www.djangoproject.com/) το framework με το οποίο έγινε το project.
|
||||
* [Python-GNUPG](http://pythonhosted.org/python-gnupg/) Αποκρυπτογραφεί τα PDF αρχεία στη στιγμή ώστε να κατεβάζετε αποκρυπτογραφημένα αρχεία, αφήνοντας τα κρυπτογραφημένα στον δίσκο.
|
||||
|
||||
|
||||
## Σταθερότητα
|
||||
|
||||
Αυτό το project υπάρχει από το 2015 και υπάρχουν αρκετοί άνθρωποι που το χρησιμοποιούν, παρόλα αυτά βρίσκεται σε διαρκή ανάπτυξη (απλά δείτε πότε commit έχουν γίνει στο git history) οπότε μην περιμένετε να είναι 100% σταθερό. Μπορείτε να κάνετε backup την βάση δεδομένων sqlite3, τον φάκελο media και το configuration αρχείο σας ώστε να είστε ασφαλείς.
|
||||
|
||||
|
||||
## Affiliated Projects
|
||||
|
||||
Το Paperless υπάρχει εδώ και κάποιο καιρό και άνθρωποι έχουν αρχίσει να φτιάχνουν πράγματα γύρω από αυτό. Αν είσαι ένας από αυτούς τους ανθρώπους, μπορούμε να βάλουμε το project σου σε αυτήν την λίστα:
|
||||
|
||||
* [Paperless Desktop](https://github.com/thomasbrueggemann/paperless-desktop): Μια desktop εφαρμογή για εγκατάσταση του Paperless. Τρέχει σε Mac, Linux, και Windows.
|
||||
* [ansible-role-paperless](https://github.com/ovv/ansible-role-paperless): Ένας εύκολο τρόπος για να τρέχει το Paperless μέσω Ansible.
|
||||
|
||||
|
||||
## Παρόμοια Projects
|
||||
|
||||
Υπάρχει ένα άλλο ṕroject που λέγεται [Mayan EDMS](https://mayan.readthedocs.org/en/latest/) το οποίο έχει παρόμοια τεχνικά χαρακτηριστικά με το Paperless σε εντυπωσιακό βαθμό. Επίσης βασισμένο στο Django και χρησιμοποιώντας το consumer model με Tesseract και Unpaper, Mayan EDMS έχει *πολλά* περισσότερα χαρακτηριστικά και έρχεται με ένα επιδέξιο UI, αλλά είναι ακόμα σε Python 2. Μπορεί να είναι ότι το Paperless καταναλώνει λιγότερους πόρους, αλλά για να είμαι ειλικρινής, αυτό είναι μια εικασία την οποία δεν έχω επιβεβαιώσει μόνος μου. Ένα πράγμα είναι σίγουρο, το *Paperless* έχει **πολύ** καλύτερο όνομα.
|
||||
|
||||
|
||||
## Σημαντική Σημείωση
|
||||
|
||||
Τα scanner για αρχεία συνήθως χρησιμοποιούνται για ευαίσθητα αρχεία. Πράγματα όπως το ΑΜΚΑ, φορολογικά αρχεία, τιμολόγια κτλπ. Παρόλο που το Paperless κρυπτογραφεί τα αρχικά αρχεία μέσω του consumption script, το κείμενο OCR *δεν είναι* κρυπτογραφημένο και για αυτό αποθηκεύεται (πρέπει να είναι αναζητήσιμο, οπότε αν κάποιος ξέρει να το κάνει αυτό με κρυπτογραφημένα δεδομένα είμαι όλος αυτιά). Αυτό σημάνει ότι το Paperless δεν πρέπει ποτέ να τρέχει σε μη αξιόπιστο πάροχο. Για αυτό συστήνω αν θέλετε να το τρέξετε να το τρέξετε σε έναν τοπικό server σπίτι σας.
|
||||
|
||||
|
||||
## Δωρεές
|
||||
|
||||
Όπως με όλα τα δωρεάν λογισμικά, η δύναμη δεν βρίσκεται στα οικονομικά αλλά στην συλλογική προσπάθεια. Αλήθεια εκτιμώ κάθε pull request και bug report που προσφέρεται από τους χρήστες του Paperless, οπότε σας παρακαλώ συνεχίστε. Αν παρόλα αυτά, δεν μπορείτε να γράψετε κώδικα/να κάνέτε design/να γράψετε documentation, και θέλετε να συνεισφέρετε οικονομικά, δεν θα πω όχι ;-)
|
||||
|
||||
Το θέμα είναι ότι είμαι οικονομικά εντάξει, οπότε θα σας ζητήσω να δωρίσετε τα χρήματα σας εδώ [United Nations High Commissioner for Refugees](https://donate.unhcr.org/int-en/general). Κάνουν σημαντική δουλειά και χρειάζονται τα χρήματα πολύ περισσότερο από ότι εγώ.
|
README.md (new file) — 82 lines added

@@ -0,0 +1,82 @@
*[Greek](README-el.md)*

# Paperless

[](https://paperless.readthedocs.org/) [](https://gitter.im/danielquinn/paperless) [](https://travis-ci.org/danielquinn/paperless) [](https://coveralls.io/github/danielquinn/paperless?branch=master) [](https://github.com/danielquinn/paperless/blob/master/THANKS.md)

Index and archive all of your scanned paper documents

I hate paper. Environmental issues aside, it's a tech person's nightmare:

* There's no search feature
* It takes up physical space
* Backups mean more paper

In the past few months I've been bitten more than a few times by the problem of not having the right document around. Sometimes I recycled a document I needed (who keeps water bills for two years?) and other times I just lost it... because paper. I wrote this to make my life easier.

## How it Works

Paperless does not control your scanner, it only helps you deal with what your scanner produces

1. Buy a document scanner that can write to a place on your network. If you need some inspiration, have a look at the [scanner recommendations](https://paperless.readthedocs.io/en/latest/scanners.html) page.
2. Set it up to "scan to FTP" or something similar. It should be able to push scanned images to a server without you having to do anything. Of course if your scanner doesn't know how to automatically upload the file somewhere, you can always do that manually. Paperless doesn't care how the documents get into its local consumption directory.
3. Have the target server run the Paperless consumption script to OCR the file and index it into a local database.
4. Use the web frontend to sift through the database and find what you want.
5. Download the PDF you need/want via the web interface and do whatever you like with it. You can even print it and send it as if it's the original. In most cases, no one will care or notice.

Here's what you get:

![The before and after](docs/_static/screenshot.png)

## Documentation

It's all available on [ReadTheDocs](https://paperless.readthedocs.org/).

## Requirements

This is all really a quite simple, shiny, user-friendly wrapper around some very powerful tools.

* [ImageMagick](http://imagemagick.org/) converts the images between colour and greyscale.
* [Tesseract](https://github.com/tesseract-ocr) does the character recognition.
* [Unpaper](https://www.flameeyes.eu/projects/unpaper) despeckles and deskews the scanned image.
* [GNU Privacy Guard](https://gnupg.org/) is used as the encryption backend.
* [Python 3](https://python.org/) is the language of the project.
* [Pillow](https://pypi.python.org/pypi/pillowfight/) loads the image data as a python object to be used with PyOCR.
* [PyOCR](https://github.com/jflesch/pyocr) is a slick programmatic wrapper around tesseract.
* [Django](https://www.djangoproject.com/) is the framework this project is written against.
* [Python-GNUPG](http://pythonhosted.org/python-gnupg/) decrypts the PDFs on-the-fly to allow you to download unencrypted files, leaving the encrypted ones on-disk.

## Project Status

This project has been around since 2015, and there's lots of people using it. For some reason, it's really popular in Germany -- maybe someone over there can clue me in as to why?

I am no longer doing new development on Paperless as it does exactly what I need it to and have since turned my attention to my latest project, [Aletheia](https://github.com/danielquinn/aletheia). However, I'm not abandoning this project. I am happy to field pull requests and answer questions in the issue queue. If you're a developer yourself and want a new feature, float it in the issue queue and/or send me a pull request! I'm happy to add new stuff, but I just don't have the time to do that work myself.

## Affiliated Projects

Paperless has been around a while now, and people are starting to build stuff on top of it. If you're one of those people, we can add your project to this list:

* [Paperless Desktop](https://github.com/thomasbrueggemann/paperless-desktop): A desktop UI for your Paperless installation. Runs on Mac, Linux, and Windows.
* [ansible-role-paperless](https://github.com/ovv/ansible-role-paperless): An easy way to get Paperless running via Ansible.

## Similar Projects

There's another project out there called [Mayan EDMS](https://mayan.readthedocs.org/en/latest/) that has a surprising amount of technical overlap with Paperless. Also based on Django and using a consumer model with Tesseract and Unpaper, Mayan EDMS is *much* more featureful and comes with a slick UI as well, but still in Python 2. It may be that Paperless consumes fewer resources, but to be honest, this is just a guess as I haven't tested this myself. One thing's for certain though, *Paperless* is a **way** better name.

## Important Note

Document scanners are typically used to scan sensitive documents. Things like your social insurance number, tax records, invoices, etc. While Paperless encrypts the original files via the consumption script, the OCR'd text is *not* encrypted and is therefore stored in the clear (it needs to be searchable, so if someone has ideas on how to do that on encrypted data, I'm all ears). This means that Paperless should never be run on an untrusted host. Instead, I recommend that if you do want to use it, run it locally on a server in your own home.

## Donations

As with all Free software, the power is less in the finances and more in the collective efforts. I really appreciate every pull request and bug report offered up by Paperless' users, so please keep that stuff coming. If however, you're not one for coding/design/documentation, and would like to contribute financially, I won't say no ;-)

The thing is, I'm doing ok for money, so I would instead ask you to donate to the [United Nations High Commissioner for Refugees](https://donate.unhcr.org/int-en/general). They're doing important work and they need the money a lot more than I do.

140
README.rst
@@ -1,140 +0,0 @@
Paperless
#########

|Documentation|
|Chat|
|Travis|
|Dependencies|

Scan, index, and archive all of your paper documents

I hate paper. Environmental issues aside, it's a tech person's nightmare:

* There's no search feature
* It takes up physical space
* Backups mean more paper

In the past few months I've been bitten more than a few times by the problem
of not having the right document around. Sometimes I recycled a document I
needed (who keeps water bills for two years?) and other times I just lost
it... because paper. I wrote this to make my life easier.


How it Works
============

1. Buy a document scanner like `this one`_.
2. Set it up to "scan to FTP" or something similar. It should be able to push
   scanned images to a server without you having to do anything. If your
   scanner doesn't know how to automatically upload the file somewhere, you can
   always do that manually. Paperless doesn't care how the documents get into
   its local consumption directory.
3. Have the target server run the Paperless consumption script to OCR the PDF
   and index it into a local database.
4. Use the web frontend to sift through the database and find what you want.
5. Download the PDF you need/want via the web interface and do whatever you
   like with it. You can even print it and send it as if it's the original.
   In most cases, no one will care or notice.

Here's what you get:

.. image:: docs/_static/screenshot.png
    :alt: The before and after
    :target: docs/_static/screenshot.png


Stability
=========

Paperless is still under active development (just look at the git commit
history) so don't expect it to be 100% stable. I'm using it for my own
documents, but I'm crazy like that. If you use this and it breaks something,
you get to keep all the shiny pieces.


Requirements
============

This is all really quite a simple, shiny, user-friendly wrapper around some very
powerful tools.

* `ImageMagick`_ converts the images between colour and greyscale.
* `Tesseract`_ does the character recognition.
* `Unpaper`_ despeckles and deskews the scanned image.
* `GNU Privacy Guard`_ is used as the encryption backend.
* `Python 3`_ is the language of the project.

* `Pillow`_ loads the image data as a python object to be used with PyOCR.
* `PyOCR`_ is a slick programmatic wrapper around tesseract.
* `Django`_ is the framework this project is written against.
* `Python-GNUPG`_ decrypts the PDFs on-the-fly to allow you to download
  unencrypted files, leaving the encrypted ones on-disk.


Documentation
=============

It's all available on `ReadTheDocs`_.


Similar Projects
================

There's another project out there called `Mayan EDMS`_ that has a surprising
amount of technical overlap with Paperless. Also based on Django and using
a consumer model with Tesseract and unpaper, Mayan EDMS is *much* more
featureful and comes with a slick UI as well. It may be that Paperless is
better suited for low-resource environments (like a Raspberry Pi), but to be
honest, this is just a guess as I haven't tested this myself. One thing's
for certain though, *Paperless* is a **much** better name.


Important Note
==============

Document scanners are typically used to scan sensitive documents. Things like
your social insurance number, tax records, invoices, etc. While paperless
encrypts the original PDFs via the consumption script, the OCR'd text is *not*
encrypted and is therefore stored in the clear (it needs to be searchable, so
if someone has ideas on how to do that on encrypted data, I'm all ears). This
means that paperless should never be run on an untrusted host. Instead, I
recommend that if you do want to use it, run it locally on a server in your own
home.


Donations
=========

As with all Free software, the power is less in the finances and more in the
collective efforts. I really appreciate every pull request and bug report
offered up by Paperless' users, so please keep that stuff coming. If however,
you're not one for coding/design/documentation, and would like to contribute
financially, I won't say no ;-)

The thing is, I'm doing ok for money, so I would instead ask you to donate to
the `United Nations High Commissioner for Refugees`_. They're doing important
work and they need the money a lot more than I do.

.. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail
.. _ImageMagick: http://imagemagick.org/
.. _Tesseract: https://github.com/tesseract-ocr
.. _Unpaper: https://www.flameeyes.eu/projects/unpaper
.. _GNU Privacy Guard: https://gnupg.org/
.. _Python 3: https://python.org/
.. _Pillow: https://pypi.python.org/pypi/pillowfight/
.. _PyOCR: https://github.com/jflesch/pyocr
.. _Django: https://www.djangoproject.com/
.. _Python-GNUPG: http://pythonhosted.org/python-gnupg/
.. _ReadTheDocs: https://paperless.readthedocs.org/
.. _Mayan EDMS: https://mayan.readthedocs.org/en/latest/
.. _United Nations High Commissioner for Refugees: https://donate.unhcr.org/int-en/general
.. |Documentation| image:: https://readthedocs.org/projects/paperless/badge/?version=latest
    :alt: Read the documentation at https://paperless.readthedocs.org/
    :target: https://paperless.readthedocs.org/
.. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg
    :alt: Join the chat at https://gitter.im/danielquinn/paperless
    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
    :target: https://travis-ci.org/danielquinn/paperless
.. |Dependencies| image:: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500/badge.svg?style=flat-square
    :target: https://www.versioneye.com/user/projects/57b33b81d9f1b00016faa500
19
THANKS.md
Normal file
@@ -0,0 +1,19 @@
# Thanks for using Paperless!

Working on this project has been exhausting, but rewarding at the same time. It's just wonderful that so many people are using this thing, and in so many crazy ways.

This file is here for everyone to post their own stories about how they use this code. It helps me to understand who's using it and why, and maybe to give others an idea of how it might be used. It's based on a Twitter exchange between [John Glanville](https://twitter.com/hexapodium) and [Julia Evans](https://github.com/jvns) and later better defined [here](https://github.com/paulmolluzzo/thanks-md).

To contribute, simply issue a pull request that appends something like this to this file:

```
### Your Name
Some friendly message
```
5
Vagrantfile
vendored
@@ -12,4 +12,9 @@ Vagrant.configure(VAGRANT_API_VERSION) do |config|
  # Networking details
  config.vm.network "private_network", ip: "172.28.128.4"

  config.vm.provider "virtualbox" do |vb|
    # Customize the amount of memory on the VM:
    vb.memory = "1024"
  end
end
@@ -13,3 +13,25 @@ PAPERLESS_PASSPHRASE=CHANGE_ME
# You can change the default user and group id to a custom one
# USERMAP_UID=1000
# USERMAP_GID=1000

###############################################################################
#### Mail Consumption                                                      ####
###############################################################################

# These values are required if you want paperless to check a particular email
# box every 10 minutes and attempt to consume documents from there. If you
# don't define a HOST, mail checking will just be disabled.
# Don't use quotes after = or it will crash your Docker container.
# PAPERLESS_CONSUME_MAIL_HOST=
# PAPERLESS_CONSUME_MAIL_PORT=
# PAPERLESS_CONSUME_MAIL_USER=
# PAPERLESS_CONSUME_MAIL_PASS=

# Override the default IMAP inbox here. If it's not set, Paperless defaults to
# INBOX.
# PAPERLESS_CONSUME_MAIL_INBOX=INBOX

# Any email sent to the target account that does not contain this text will be
# ignored. Mail checking won't work without this.
# PAPERLESS_EMAIL_SECRET=
@@ -1,12 +1,19 @@
version: '2'
version: '2.1'

services:
  webserver:
    image: pitkley/paperless
    build: ./
    # uncomment the following line to start automatically on system boot
    # restart: always
    ports:
      # You can adapt the port you want Paperless to listen on by
      # modifying the part before the `:`.
      - "8000:8000"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000"]
      interval: 30s
      timeout: 10s
      retries: 5
    volumes:
      - data:/usr/src/paperless/data
      - media:/usr/src/paperless/media
@@ -17,16 +24,21 @@ services:
    # value with nothing.
    environment:
      - PAPERLESS_OCR_LANGUAGES=
    command: ["runserver", "0.0.0.0:8000"]
    command: ["runserver", "--insecure", "--noreload", "0.0.0.0:8000"]

  consumer:
    image: pitkley/paperless
    build: ./
    # uncomment the following line to start automatically on system boot
    # restart: always
    depends_on:
      webserver:
        condition: service_healthy
    volumes:
      - data:/usr/src/paperless/data
      - media:/usr/src/paperless/media
      # You have to adapt the local path you want the consumption
      # directory to mount to by modifying the part before the ':'.
      - /path/to/arbitrary/place:/consume
      - ./consume:/consume
      # Likewise, you can add a local path to mount a directory for
      # exporting. This is not strictly needed for paperless to
      # function, only if you're exporting your files: uncomment
@@ -1,126 +1,360 @@
Changelog
#########

* 0.3.2
  * Fix for #172: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the user
    to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
    arise.
1.4.0
=====

* 0.3.1
  * Added a default value for ``CONVERT_BINARY``
* `Quentin Dawans`_ has refactored the document consumer to allow for some
  command-line options. Notably, you can now direct it to consume from a
  particular ``--directory``, limit the ``--loop-time``, set the time between
  mail server checks with ``--mail-delta`` or just run it as a one-off with
  ``--one-shot``. See `#305`_ & `#313`_ for more information.
* Refactor the use of travis/tox/pytest/coverage into two files:
  ``.travis.yml`` and ``setup.cfg``.
* Start generating requirements.txt from a Pipfile. I'll probably switch over
  to just using pipenv in the future.
* Allow for an alternative FreeBSD-friendly location for ``paperless.conf``.
  Thanks to `Martin Arendtsen`_ who provided this (`#322`_).
* Document consumption events are now logged in the Django admin events log.
  Thanks to `CkuT`_ for doing the legwork on this one and to `Quentin Dawans`_
  & `David Martin`_ for helping to coordinate & work out how the feature would
  be developed.
* `erikarvstedt`_ contributed a pull request (`#328`_) to add ``--noreload``
  to the default server start process. This helps reduce the load imposed
  by the running webservice.
* Through some discussion on `#253`_ and `#323`_, we've removed a few of the
  hardcoded URL values to make it easier for people to host Paperless on a
  subdirectory. Thanks to `Quentin Dawans`_ and `Kyle Lucy`_ for helping to
  work this out.
* The clickable area for documents on the listing page has been increased to a
  more predictable space thanks to a glorious hack from `erikarvstedt`_ in
  `#344`_.
* `Strubbl`_ noticed an annoying bug in the bash script wrapping the Docker
  entrypoint and fixed it with some very creative Bash skills: `#352`_.
* You can now use the search field to find documents by tag thanks to
  `thinkjk`_'s *first ever issue*: `#354`_.
* Inotify is now being used to detect additions to the consume directory thanks
  to some excellent work from `erikarvstedt`_ on `#351`_.

* 0.3.0
  * Updated to using django-filter 1.x
  * Added some system checks so new users aren't confused by misconfigurations.
  * Consumer loop time is now configurable for systems with slow writes. Just
    set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
    is 10.
  * As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
    ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
    ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
    ``PAPERLESS_SHARED_SECRET`` respectively instead.
1.3.0
=====

* 0.2.0
  * You can now run Paperless without a login, though you'll still have to create
    at least one user. This is thanks to a pull-request from `matthewmoto`_:
    `#295`_. Note that logins are still required by default, and that you need
    to disable them by setting ``PAPERLESS_DISABLE_LOGIN="true"`` in your
    environment or in ``/etc/paperless.conf``.
  * Fix for `#303`_ where sketchily-formatted documents could cause the consumer
    to break and insert half-records into the database breaking all sorts of
    things. We now capture the return codes of both ``convert`` and ``unpaper``
    and fail-out nicely.
  * Fix for additional date types thanks to input from `Isaac`_ and code from
    `BastianPoe`_ (`#301`_).
  * Fix for running migrations in the Docker container (`#299`_). Thanks to
    `Georgi Todorov`_ for the fix (`#300`_) and to `Pit`_ for the review.
  * Fix for Docker cases where the issuing user is not UID 1000. This was a
    collaborative fix between `Jeffrey Portman`_ and `Pit`_ in `#311`_ and
    `#312`_ to fix `#306`_.
  * Patch the historical migrations to support MySQL's um, *interesting* way of
    handling indexes (`#308`_). Thanks to `Simon Taddiken`_ for reporting the
    problem and helping me find where to fix it.

* `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
* `#148`_: The database location (sqlite) is now a variable you can set in
  ``paperless.conf``.
* `#146`_: Fixed a bug that allowed unauthorised access to the `/fetch` URL.
* `#131`_: Document files are now automatically removed from disk when
  they're deleted in Paperless.
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
  based on the file naming scheme.
* `#81`_: Added a hook to run an arbitrary script after every document is
  consumed.
* `#98`_: Added optional environment variables for ImageMagick so that it
  doesn't explode when handling Very Large Documents or when it's just
  running on a low-memory system. Thanks to `Florian Harr`_ for his help on
  this one.
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
  `Justin Snyman`_ for the pointers in the issue queue.
* Added support for guessing the date from the file name along with the
  correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
  request that I took forever to merge and to `Pit`_ for his efforts on the
  regex front.
* `#94`_: Restored support for changing the created date in the UI. Thanks
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
1.2.0
=====

* 0.1.1
  * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
    and `Pit`_. This new image is dramatically smaller than the Debian-based
    one, and it also has `a new home on Docker Hub`_. A proper thank-you to
    `Pit`_ for hosting the image on his Docker account all this time, but after
    some discussion, we decided the image needed a more *official-looking* home.
  * `BastianPoe`_ has added the long-awaited feature to automatically skip the
    OCR step when the PDF already contains text. This can be overridden by
    setting ``PAPERLESS_OCR_ALWAYS=YES`` either in your ``paperless.conf`` or
    in the environment. Note that this also means that Paperless now requires
    ``libpoppler-cpp-dev`` to be installed. **Important**: You'll need to run
    ``pip install -r requirements.txt`` after the usual ``git pull`` to
    properly update.
  * `BastianPoe`_ has also contributed a monumental amount of work (`#291`_) to
    solving `#158`_: setting the document creation date based on finding a date
    in the document text.

* Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
  property (one could quite reasonably scan a document before sending it to
  someone.)
* `#67`_: Rewrote the document exporter and added a new importer that allows
  for full metadata retention without depending on the file name and
  modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
  `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
  contributing conversation that led to this change.
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
  before it's OCR'd. Thanks to `Pit`_ for this one.
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
* `#68`_: Added support for using a proper config file at
  ``/etc/paperless.conf`` and modified the systemd unit files to use it.
* Refactored the Vagrant installation process to use environment variables
  rather than asking the user to modify ``settings.py``.
* `#44`_: Harmonise environment variable names with constant names.
* `#60`_: Setup logging to actually use the Python native logging framework.
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
  to be imported but made unavailable.
1.1.0
=====

* 0.1.0
  * Fix for `#283`_, a redirect bug which broke interactions with
    paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
  * Addition of an optional new financial year filter, courtesy of
    `David Martin`_ `#256`_
  * Fixed a typo in how thumbnails were named in exports `#285`_, courtesy of
    `Dan Panzarella`_

* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
  who spearheaded this effort.
* A simple REST API is in place, but it should be considered unstable.
* Cleaned up the consumer to use temporary directories instead of a single
  scratch space. (Thanks `Pit`_)
* Improved the efficiency of the consumer by parsing pages more intelligently
  and introducing a threaded OCR process (thanks again `Pit`_).
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
  `Pit`_.
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
* `#54`_: Documented the re-tagger (`zedster`_)
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
* Added tox with pep8 checking
1.0.0
=====

* 0.0.6
  * Upgrade to Django 1.11. **You'll need to run
    ``pip install -r requirements.txt`` after the usual ``git pull`` to
    properly update**.
  * Replace the templatetag-based hack we had for document listing in favour of
    a slightly less ugly solution in the form of another template tag with less
    copypasta.
  * Support for multi-word-matches for auto-tagging thanks to an excellent
    patch from `ishirav`_ `#277`_.
  * Fixed a CSS bug reported by `Stefan Hagen`_ that caused an overlapping of
    the text and checkboxes under some resolutions `#272`_.
  * Patched the Docker config to force the serving of static files. Credit for
    this one goes to `dev-rke`_ via `#248`_.
  * Fix file permissions during Docker start up thanks to `Pit`_ on `#268`_.
  * Date fields in the admin are now expressed as HTML5 date fields thanks to
    `Lukas Winkler`_'s issue `#278`_.

* Added support for parallel OCR (significant work from `Pit`_)
* Sped up the language detection (significant work from `Pit`_)
* Added simple logging
0.8.0
=====

* 0.0.5
  * Paperless can now run in a subdirectory on a host (``/paperless``), rather
    than always running in the root (``/``) thanks to `maphy-psd`_'s work on
    `#255`_.

* Added support for image files as documents (png, jpg, gif, tiff)
* Added a crude means of HTTP POST for document imports
* Added IMAP mail support
* Added a re-tagging utility
* Documentation for the above as well as data migration
0.7.0
=====

* 0.0.4
  * **Potentially breaking change**: As per `#235`_, Paperless will no longer
    automatically delete documents attached to correspondents when those
    correspondents are themselves deleted. This was Django's default
    behaviour, but didn't make much sense in Paperless' case. Thanks to
    `Thomas Brueggemann`_ and `David Martin`_ for their input on this one.
  * Fix for `#232`_ wherein Paperless wasn't recognising ``.tif`` files
    properly. Thanks to `ayounggun`_ for reporting this one and to
    `Kusti Skytén`_ for posting the correct solution in the Github issue.

* Added automated tagging based on keyword matching
* Cleaned up the document listing page
* Removed ``User`` and ``Group`` from the admin
* Added ``pytz`` to the list of requirements
0.6.0
=====

* 0.0.3
  * Abandon the shared-secret trick we were using for the POST API in favour
    of BasicAuth or Django session.
  * Fix the POST API so it actually works. `#236`_
  * **Breaking change**: We've dropped the use of ``PAPERLESS_SHARED_SECRET``
    as it was being used both for the API (now replaced with a normal auth)
    and for email polling. Now that we're only using it for email, this
    variable has been renamed to ``PAPERLESS_EMAIL_SECRET``. The old value
    will still work for a while, but you should change your config if you've
    been using the email polling feature. Thanks to `Joshua Gilman`_ for all
    the help with this feature.

* Added basic tagging
0.5.0
=====

* 0.0.2
  * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
    thanks to `Jake Gysland`_'s patch `#220`_.
  * Modified the Dockerfile to prepare an export directory (`#212`_). Thanks
    to combined efforts from `Pit`_ and `Strubbl`_ in working out the kinks on
    this one.
  * Updated the import/export scripts to include support for thumbnails. Big
    thanks to `CkuT`_ for finding this shortcoming and doing the work to get
    it fixed in `#224`_.
  * All of the following changes are thanks to `David Martin`_:
    * Bumped the dependency on pyocr to 0.4.7 so new users can make use of
      Tesseract 4 if they so prefer (`#226`_).
    * Fixed a number of issues with the automated mail handler (`#227`_, `#228`_)
    * Amended the documentation for better handling of systemd service files (`#229`_)
    * Amended the Django Admin configuration to have nice headers (`#230`_)

* Added language detection
* Added datestamps to ``document_exporter``.
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
0.4.1
=====

* 0.0.1
  * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
    all-caps suffixes like ``.PDF``

* Initial release
0.4.0
=====

* Introducing reminders. See `#199`_ for more information, but the short
  explanation is that you can now attach simple notes & times to documents
  which are made available via the API. Currently, the default API
  (basically just the Django admin) doesn't really make use of this, but
  `Thomas Brueggemann`_ over at `Paperless Desktop`_ has said that he would
  like to make use of this feature in his project.

0.3.6
=====

* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
  correspondent or the tags for a document.
* The ``content`` field is now optional, to allow for the edge case of a
  purely graphical document.
* You can no longer add documents via the admin. This never worked in the
  first place, so all I've done here is remove the link to the broken form.
* The consumer code has been heavily refactored to support a pluggable
  interface. Install a paperless consumer via pip and tell paperless about
  it with an environment variable, and you're good to go. Proper
  documentation is on its way.

0.3.5
=====

* A serious facelift for the documents listing page wherein we drop the
  tabular layout in favour of a tiled interface.
* Users can now configure the number of items per page.
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
* Moved the dotenv loading to the top of settings.py
* Fix for `#112`_: Added checks for binaries required for document
  consumption.

0.3.4
=====

* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
  Note that you *can* use Django Suit with Paperless, but only in a
  non-profit situation as their free license prohibits for-profit use. As a
  result, I can't bundle Suit with Paperless without conflicting with the
  GPL. Further development will be done against the stock Django admin.
* I shrunk the thumbnails a little 'cause they were too big for me, even on
  my high-DPI monitor.
* BasicAuth support for document and thumbnail downloads, as well as the Push
  API thanks to @thomasbrueggemann. See `#179`_.

0.3.3
=====

* Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
* Timezone, items per page, and default language are now all configurable,
  also thanks to @ekw.

0.3.2
=====

* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
  user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
  arise.

0.3.1
=====

* Added a default value for ``CONVERT_BINARY``

0.3.0
=====

* Updated to using django-filter 1.x
* Added some system checks so new users aren't confused by misconfigurations.
* Consumer loop time is now configurable for systems with slow writes. Just
  set ``PAPERLESS_CONSUMER_LOOP_TIME`` to a number of seconds. The default
  is 10.
* As per `#44`_, we've removed support for ``PAPERLESS_CONVERT``,
  ``PAPERLESS_CONSUME``, and ``PAPERLESS_SECRET``. Please use
  ``PAPERLESS_CONVERT_BINARY``, ``PAPERLESS_CONSUMPTION_DIR``, and
  ``PAPERLESS_SHARED_SECRET`` respectively instead.

0.2.0
=====

* `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
* `#148`_: The database location (sqlite) is now a variable you can set in
  ``paperless.conf``.
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
  URL.
* `#131`_: Document files are now automatically removed from disk when
  they're deleted in Paperless.
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
  based on the file naming scheme.
* `#81`_: Added a hook to run an arbitrary script after every document is
  consumed.
* `#98`_: Added optional environment variables for ImageMagick so that it
  doesn't explode when handling Very Large Documents or when it's just
  running on a low-memory system. Thanks to `Florian Harr`_ for his help on
  this one.
* `#89`_ Ported the auto-tagging code to correspondents as well. Thanks to
  `Justin Snyman`_ for the pointers in the issue queue.
* Added support for guessing the date from the file name along with the
  correspondent, title, and tags. Thanks to `Tikitu de Jager`_ for his pull
  request that I took forever to merge and to `Pit`_ for his efforts on the
  regex front.
* `#94`_: Restored support for changing the created date in the UI. Thanks
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.

0.1.1
=====

* Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
  property (one could quite reasonably scan a document before sending it to
  someone.)
* `#67`_: Rewrote the document exporter and added a new importer that allows
  for full metadata retention without depending on the file name and
  modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
  `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
  contributing conversation that led to this change.
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
  before it's OCR'd. Thanks to `Pit`_ for this one.
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
* `#68`_: Added support for using a proper config file at
  ``/etc/paperless.conf`` and modified the systemd unit files to use it.
* Refactored the Vagrant installation process to use environment variables
  rather than asking the user to modify ``settings.py``.
* `#44`_: Harmonise environment variable names with constant names.
* `#60`_: Setup logging to actually use the Python native logging framework.
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
  to be imported but made unavailable.

0.1.0
=====

* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
  who spearheaded this effort.
* A simple REST API is in place, but it should be considered unstable.
* Cleaned up the consumer to use temporary directories instead of a single
  scratch space. (Thanks `Pit`_)
* Improved the efficiency of the consumer by parsing pages more intelligently
  and introducing a threaded OCR process (thanks again `Pit`_).
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
  `Pit`_.
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
* `#54`_: Documented the re-tagger (`zedster`_)
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
* Added tox with pep8 checking

0.0.6
=====

* Added support for parallel OCR (significant work from `Pit`_)
* Sped up the language detection (significant work from `Pit`_)
* Added simple logging

0.0.5
=====

* Added support for image files as documents (png, jpg, gif, tiff)
* Added a crude means of HTTP POST for document imports
* Added IMAP mail support
* Added a re-tagging utility
* Documentation for the above as well as data migration

0.0.4
=====

* Added automated tagging based on keyword matching
* Cleaned up the document listing page
* Removed ``User`` and ``Group`` from the admin
* Added ``pytz`` to the list of requirements

0.0.3
=====

* Added basic tagging

0.0.2
=====

* Added language detection
* Added datestamps to ``document_exporter``.
* Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.

0.0.1
=====

* Initial release

.. _Brian Conn: https://github.com/TheConnMan
.. _Christopher Luu: https://github.com/nuudles
@@ -135,6 +369,34 @@ Changelog
.. _Tim White: https://github.com/timwhite
.. _Florian Harr: https://github.com/evils
.. _Justin Snyman: https://github.com/stringlytyped
.. _Thomas Brueggemann: https://github.com/thomasbrueggemann
.. _Jake Gysland: https://github.com/jgysland
.. _Strubbl: https://github.com/strubbl
.. _CkuT: https://github.com/CkuT
.. _David Martin: https://github.com/ddddavidmartin
.. _Paperless Desktop: https://github.com/thomasbrueggemann/paperless-desktop
.. _Joshua Gilman: https://github.com/jmgilman
.. _ayounggun: https://github.com/ayounggun
.. _Kusti Skytén: https://github.com/kskyten
.. _maphy-psd: https://github.com/maphy-psd
.. _ishirav: https://github.com/ishirav
.. _Stefan Hagen: https://github.com/xkpd3
.. _dev-rke: https://github.com/dev-rke
.. _Lukas Winkler: https://github.com/Findus23
.. _chris-aeviator: https://github.com/chris-aeviator
.. _Dan Panzarella: https://github.com/pzl
.. _addadi: https://github.com/addadi
.. _BastianPoe: https://github.com/BastianPoe
.. _matthewmoto: https://github.com/matthewmoto
.. _Isaac: https://github.com/isaacsando
.. _Georgi Todorov: https://github.com/TeraHz
.. _Jeffrey Portman: https://github.com/ChromoX
.. _Simon Taddiken: https://github.com/skuzzle
.. _Quentin Dawans: https://github.com/ovv
.. _Martin Arendtsen: https://github.com/Arendtsen
.. _erikarvstedt: https://github.com/erikarvstedt
.. _Kyle Lucy: https://github.com/kmlucy
.. _thinkjk: https://github.com/thinkjk

.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -152,8 +414,59 @@ Changelog
.. _#89: https://github.com/danielquinn/paperless/issues/89
.. _#94: https://github.com/danielquinn/paperless/issues/94
.. _#98: https://github.com/danielquinn/paperless/issues/98
.. _#112: https://github.com/danielquinn/paperless/issues/112
.. _#121: https://github.com/danielquinn/paperless/issues/121
.. _#131: https://github.com/danielquinn/paperless/issues/131
.. _#146: https://github.com/danielquinn/paperless/issues/146
.. _#148: https://github.com/danielquinn/paperless/pull/148
.. _#150: https://github.com/danielquinn/paperless/pull/150
.. _#158: https://github.com/danielquinn/paperless/issues/158
.. _#171: https://github.com/danielquinn/paperless/issues/171
.. _#172: https://github.com/danielquinn/paperless/issues/172
.. _#179: https://github.com/danielquinn/paperless/pull/179
.. _#199: https://github.com/danielquinn/paperless/issues/199
.. _#200: https://github.com/danielquinn/paperless/issues/200
.. _#206: https://github.com/danielquinn/paperless/issues/206
.. _#212: https://github.com/danielquinn/paperless/pull/212
.. _#220: https://github.com/danielquinn/paperless/pull/220
.. _#224: https://github.com/danielquinn/paperless/pull/224
.. _#226: https://github.com/danielquinn/paperless/pull/226
.. _#227: https://github.com/danielquinn/paperless/pull/227
.. _#228: https://github.com/danielquinn/paperless/pull/228
.. _#229: https://github.com/danielquinn/paperless/pull/229
.. _#230: https://github.com/danielquinn/paperless/pull/230
.. _#232: https://github.com/danielquinn/paperless/issues/232
.. _#235: https://github.com/danielquinn/paperless/issues/235
.. _#236: https://github.com/danielquinn/paperless/issues/236
.. _#255: https://github.com/danielquinn/paperless/pull/255
.. _#268: https://github.com/danielquinn/paperless/pull/268
.. _#277: https://github.com/danielquinn/paperless/pull/277
.. _#272: https://github.com/danielquinn/paperless/issues/272
.. _#248: https://github.com/danielquinn/paperless/issues/248
.. _#278: https://github.com/danielquinn/paperless/issues/248
.. _#283: https://github.com/danielquinn/paperless/issues/283
.. _#256: https://github.com/danielquinn/paperless/pull/256
.. _#285: https://github.com/danielquinn/paperless/pull/285
.. _#291: https://github.com/danielquinn/paperless/pull/291
.. _#295: https://github.com/danielquinn/paperless/pull/295
.. _#299: https://github.com/danielquinn/paperless/issues/299
.. _#300: https://github.com/danielquinn/paperless/pull/300
.. _#301: https://github.com/danielquinn/paperless/issues/301
.. _#303: https://github.com/danielquinn/paperless/issues/303
.. _#305: https://github.com/danielquinn/paperless/issues/305
.. _#306: https://github.com/danielquinn/paperless/issues/306
.. _#308: https://github.com/danielquinn/paperless/issues/308
.. _#311: https://github.com/danielquinn/paperless/pull/311
.. _#312: https://github.com/danielquinn/paperless/pull/312
.. _#313: https://github.com/danielquinn/paperless/pull/313
.. _#322: https://github.com/danielquinn/paperless/pull/322
.. _#328: https://github.com/danielquinn/paperless/pull/328
.. _#253: https://github.com/danielquinn/paperless/issues/253
.. _#323: https://github.com/danielquinn/paperless/issues/323
.. _#344: https://github.com/danielquinn/paperless/pull/344
.. _#351: https://github.com/danielquinn/paperless/pull/351
.. _#352: https://github.com/danielquinn/paperless/pull/352
.. _#354: https://github.com/danielquinn/paperless/issues/354

.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
@@ -40,7 +40,7 @@ extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.pngmath',
    'sphinx.ext.imgmath',
    'sphinx.ext.viewcode',
]
@@ -121,18 +121,21 @@ So, with all that in mind, here's what you do to get it running:
1. Setup a new email account somewhere, or if you're feeling daring, create a
   folder in an existing email box and note the path to that folder.
2. In ``settings.py`` set all of the appropriate values in ``MAIL_CONSUMPTION``.
2. In ``/etc/paperless.conf`` set all of the appropriate values in
   ``PATHS AND FOLDERS`` and ``SECURITY``.
   If you decided to use a subfolder of an existing account, then make sure you
   set ``INBOX`` accordingly here. You also have to set the
   ``UPLOAD_SHARED_SECRET`` to something you can remember 'cause you'll have to
   include that in every email you send.
   set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
   the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
   have to include that in every email you send.
3. Restart the :ref:`consumer <utilities-consumer>`. The consumer will check
   the configured email account every 10 minutes for something new and pull down
   whatever it finds.
   the configured email account at startup and from then on every 10 minutes
   for something new and pulls down whatever it finds.
4. Send yourself an email! Note that the subject is treated as the file name,
   so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
   get what you expect. Also, you must include the aforementioned secret
   string in every email so the fetcher knows that it's safe to import.
   Note that Paperless only allows the email title to consist of safe characters
   to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
5. After a few minutes, the consumer will poll your mailbox, pull down the
   message, and place the attachment in the consumption directory with the
   appropriate name. A few minutes later, the consumer will import it like any
@@ -144,46 +147,83 @@ So, with all that in mind, here's what you do to get it running:
HTTP POST
=========

You can also submit a document via HTTP POST. It doesn't do tags yet, and the
URL schema isn't concrete, but it's a start.

To push your document to Paperless, send an HTTP POST to the server with the
following name/value pairs:
You can also submit a document via HTTP POST, so long as you do so after
authenticating. To push your document to Paperless, send an HTTP POST to the
server with the following name/value pairs:

* ``correspondent``: The name of the document's correspondent. Note that there
  are restrictions on what characters you can use here. Specifically,
  alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else it
  alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
  out. You also can't use the sequence ` - ` (space, dash, space).
* ``title``: The title of the document. The rules for characters are the same
  here as for the correspondent.
* ``signature``: For security reasons, we have the correspondent send a
  signature using a "shared secret" method to make sure that random strangers
  don't start uploading stuff to your server. The means of generating this
  signature is defined below.
* ``document``: The file you're uploading
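If you're scripting against this endpoint, it can help to check the
``correspondent`` and ``title`` values against the character rules above before
sending anything. The snippet below is only an illustrative sketch of those
rules; the helper name is made up and is not part of Paperless itself:

.. code:: python

    import re

    # Allowed: letters, digits, "-", ",", ".", "'" and spaces -- but never the
    # sequence " - " (space, dash, space), which Paperless treats specially.
    _ALLOWED = re.compile(r"^[a-zA-Z0-9,.' -]+$")

    def looks_safe(value):
        """Rough client-side check mirroring the rules listed above."""
        return bool(_ALLOWED.match(value)) and " - " not in value

    assert looks_safe("Test Correspondent")
    assert not looks_safe("Bad - Title")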
Specify ``enctype="multipart/form-data"``, and then POST your file with::

    Content-Disposition: form-data; name="document"; filename="whatever.pdf"

An example of this in HTML is a typical form:

.. _consumption-http-signature:
.. code:: html

Generating the Signature
------------------------
    <form method="post" enctype="multipart/form-data">
        <input type="text" name="correspondent" value="My Correspondent" />
        <input type="text" name="title" value="My Title" />
        <input type="file" name="document" />
        <input type="submit" name="go" value="Do the thing" />
    </form>

Generating a signature based on a shared secret is pretty simple: define a secret,
and store it on the server and the client. Then use that secret, along with
the text you want to verify, to generate a string that you can use for
verification.

In the case of Paperless, you configure the server with the secret by setting
``UPLOAD_SHARED_SECRET``. Then on your client, you generate your signature by
concatenating the correspondent, title, and the secret, and then using sha256
to generate a hexdigest.

If you're using Python, this is what that looks like:
But a potentially more useful way to do this would be in Python. Here we use
the requests library to handle basic authentication and to send the POST data
to the URL.

.. code:: python

    import os

    from hashlib import sha256
    signature = sha256(correspondent + title + secret).hexdigest()

    import requests
    from requests.auth import HTTPBasicAuth

    # You authenticate via BasicAuth or with a session id.
    # We use BasicAuth here
    username = "my-username"
    password = "my-super-secret-password"

    # Where you have Paperless installed and listening
    url = "http://localhost:8000/push"

    # Document metadata
    correspondent = "Test Correspondent"
    title = "Test Title"

    # The local file you want to push
    path = "/path/to/some/directory/my-document.pdf"


    with open(path, "rb") as f:

        response = requests.post(
            url=url,
            data={"title": title, "correspondent": correspondent},
            files={"document": (os.path.basename(path), f, "application/pdf")},
            auth=HTTPBasicAuth(username, password),
            allow_redirects=False
        )

    if response.status_code == 202:

        # Everything worked out ok
        print("Upload successful")

    else:

        # If you don't get a 202, it's probably because your credentials
        # are wrong or something. This will give you a rough idea of what
        # happened.

        print("We got HTTP status code: {}".format(response.status_code))
        for k, v in response.headers.items():
            print("{}: {}".format(k, v))
|
112
docs/extending.rst
Normal file
@@ -0,0 +1,112 @@
.. _extending:

Extending Paperless
===================

For the most part, Paperless is monolithic, so extending it is often best
managed by way of modifying the code directly and issuing a pull request on
`GitHub`_. However, over time the project has been evolving to be a little
more "pluggable" so that users can write their own stuff that talks to it.

.. _GitHub: https://github.com/danielquinn/paperless


.. _extending-parsers:

Parsers
-------

You can leverage Paperless' consumption model to have it consume files *other*
than ones handled by default like ``.pdf``, ``.jpg``, and ``.tiff``. To do so,
you simply follow Django's convention of creating a new app, with a few key
requirements.


.. _extending-parsers-parserspy:

parsers.py
..........

In this file, you create a class that extends
``documents.parsers.DocumentParser`` and go about implementing the three
required methods:

* ``get_thumbnail()``: Returns the path to a file we can use as a thumbnail for
  this document.
* ``get_text()``: Returns the text from the document and only the text.
* ``get_date()``: If possible, this returns the date of the document, otherwise
  it should return ``None``.
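To make that concrete, a skeleton parser for plain-text files might look
roughly like the one below. This is only a sketch: the module and class names
(``my_module``, ``TextDocumentParser``) are invented for the example, and we
assume the base class exposes the path of the file being consumed as
``self.document_path`` -- check the bundled ``paperless_tesseract`` parser
(linked at the end of this section) for the exact attributes.

.. code:: python

    # my_module/parsers.py -- illustrative only
    from documents.parsers import DocumentParser


    class TextDocumentParser(DocumentParser):
        """A minimal example parser that handles plain-text files."""

        def get_thumbnail(self):
            # Return a path to an image representing this document.  A real
            # parser would render one; here we just point at a placeholder
            # image shipped with the (hypothetical) module.
            return "/path/to/my_module/placeholder.png"

        def get_text(self):
            # The raw text, and only the text.
            with open(self.document_path, encoding="utf-8") as f:
                return f.read()

        def get_date(self):
            # Return a datetime if one can be worked out, otherwise None.
            return None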

.. _extending-parsers-signalspy:

signals.py
..........

At consumption time, Paperless emits a ``document_consumer_declaration``
signal which your module has to react to in order to let the consumer know
whether or not it's capable of handling a particular file. Think of it like
this:

1. Consumer finds a file in the consumption directory.
2. It asks all the available parsers: *"Hey, can you handle this file?"*
3. Each parser responds with either ``None`` meaning they can't handle the
   file, or a dictionary in the following format:

.. code:: python

    {
        "parser": <the class name>,
        "weight": <an integer>
    }

The consumer compares the ``weight`` values from all respondents and uses the
class with the highest value to consume the document. The default parser,
``RasterisedDocumentParser`` has a weight of ``0``.
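What the handler looks like in practice is best copied from the bundled
`signals.py`_ linked at the end of this section; the sketch below is only a
rough illustration of the idea (the module name, file extensions, and weight
are all invented), showing a handler that returns a test function which in
turn answers with the dictionary above or ``None``:

.. code:: python

    # my_module/signals.py -- a conceptual sketch, not the canonical wiring
    from .parsers import TextDocumentParser


    def consumer_declaration(sender, **kwargs):

        def test(doc):
            # "Can you handle this file?"  Answer with the declaration dict
            # or None.  The consumer picks the highest-weighted respondent.
            if doc.lower().endswith((".txt", ".md")):
                return {
                    "parser": TextDocumentParser,
                    "weight": 10,
                }
            return None

        return test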

.. _extending-parsers-appspy:

apps.py
.......

This is a standard Django file, but you'll need to add some code to it to
connect your parser to the ``document_consumer_declaration`` signal.
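As a sketch -- again using the invented ``my_module`` names, and with the
import path modelled on the bundled `apps.py`_ (double-check it against that
file) -- the wiring looks something like this:

.. code:: python

    # my_module/apps.py -- illustrative only
    from django.apps import AppConfig


    class MyModuleConfig(AppConfig):

        name = "my_module"

        def ready(self):
            # Import late so Django is fully loaded before we touch signals.
            from documents.signals import document_consumer_declaration
            from .signals import consumer_declaration

            document_consumer_declaration.connect(consumer_declaration)

            AppConfig.ready(self)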

.. _extending-parsers-finally:

Finally
.......

The last step is to update ``settings.py`` to include your new module.
Eventually, this will be dynamic, but at the moment, you have to edit the
``INSTALLED_APPS`` section manually. Simply add the path to your AppConfig to
the list like this:

.. code:: python

    INSTALLED_APPS = [
        ...
        "my_module.apps.MyModuleConfig",
        ...
    ]

Order doesn't matter, but generally it's a good idea to place your module lower
in the list so that you don't end up accidentally overriding project defaults
somewhere.


.. _extending-parsers-example:

An Example
..........

The core Paperless functionality is based on this design, so if you want to see
what a parser module should look like, have a look at `parsers.py`_,
`signals.py`_, and `apps.py`_ in the `paperless_tesseract`_ module.

.. _parsers.py: https://github.com/danielquinn/paperless/blob/master/src/paperless_tesseract/parsers.py
.. _signals.py: https://github.com/danielquinn/paperless/blob/master/src/paperless_tesseract/signals.py
.. _apps.py: https://github.com/danielquinn/paperless/blob/master/src/paperless_tesseract/apps.py
.. _paperless_tesseract: https://github.com/danielquinn/paperless/blob/master/src/paperless_tesseract/
|
@@ -80,6 +80,12 @@ text and matching algorithm. From the help info there:
  uses a regex to match the PDF. If you don't know what a regex is, you
  probably don't want this option.

When using the "any" or "all" matching algorithms, you can search for terms that
consist of multiple words by enclosing them in double quotes. For example, defining
a match text of ``"Bank of America" BofA`` using the "any" algorithm will match
documents that contain either "Bank of America" or "BofA", but will not match
documents containing "Bank of South America".
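To see why, here's a rough, self-contained illustration of that rule. This is
*not* Paperless' actual matching code, just a sketch of the behaviour described
above:

.. code:: python

    import re
    import shlex

    def matches_any(match_text, document_text):
        # Quoted phrases stay together; bare words are separate terms.
        # Every term must match on a word boundary.
        terms = shlex.split(match_text.lower())
        return any(
            re.search(r"\b{}\b".format(re.escape(t)), document_text.lower())
            for t in terms
        )

    assert matches_any('"bank of america" bofa', "a letter from Bank of America")
    assert matches_any('"bank of america" bofa', "BofA sent us a statement")
    assert not matches_any('"bank of america" bofa', "news from Bank of South America")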

Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.
@@ -3,7 +3,11 @@
Paperless
=========

Scan, index, and archive all of your paper documents. Say goodbye to paper.
Paperless is a simple Django application running in two parts:
a :ref:`consumer <utilities-consumer>` (the thing that does the indexing) and
the :ref:`webserver <utilities-webserver>` (the part that lets you search &
download already-indexed documents). If you want to learn more about its
functions, keep on reading after the installation section.


.. _index-why-this-exists:
@@ -12,13 +16,15 @@ Why This Exists
===============

Paper is a nightmare. Environmental issues aside, there's no excuse for it in
the 21st century. It takes up space, collects dust, doesn't support any form of
a search feature, indexing is tedious, it's heavy and prone to damage & loss.
the 21st century. It takes up space, collects dust, doesn't support any form
of a search feature, indexing is tedious, it's heavy and prone to damage &
loss.

I wrote this to make "going paperless" easier. I do not have to worry about
finding stuff again. I feed documents right from the post box into the scanner
and then shred them. Perhaps you might find it useful too.


I wrote this to make "going paperless" easier. I wanted to be able to feed
documents right from the post box into the scanner and then shred them so I
never have to worry about finding stuff again. Perhaps you might find it useful
too.


Contents
@@ -34,5 +40,7 @@ Contents
   utilities
   guesswork
   migrating
   extending
   troubleshooting
   scanners
   changelog
@@ -94,7 +94,7 @@ You may want to take a look at the ``paperless.conf.example`` file to see if
there's anything new in there compared to what you've got in ``/etc``.

If you are :ref:`using Docker <setup-installation-docker>` the update process
requires only one additional step:
is similar:

.. code-block:: shell-session

@@ -102,7 +102,6 @@ requires only one additional step:
    $ git pull
    $ docker build -t paperless .
    $ docker-compose up -d
    $ docker-compose run --rm webserver migrate

If ``git pull`` doesn't report any changes, there is no need to continue with
the remaining steps.
@@ -4,31 +4,34 @@ Requirements
============

You need a Linux machine or Unix-like setup (theoretically an Apple machine
should work) that has the following software installed on it:
should work) that has the following software installed:

* `Python3`_ (with development libraries, pip and virtualenv)
* `GNU Privacy Guard`_
* `Tesseract`_, plus its language files matching your document base.
* `Imagemagick`_ version 6.7.5 or higher
* `unpaper`_
* `libpoppler-cpp-dev`_ PDF rendering library

.. _Python3: https://python.org/
.. _GNU Privacy Guard: https://gnupg.org
.. _Tesseract: https://github.com/tesseract-ocr
.. _Imagemagick: http://imagemagick.org/
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
.. _libpoppler-cpp-dev: https://poppler.freedesktop.org/

Notably, you should confirm how you access your Python3 installation. Many
Linux distributions will install Python3 in parallel to Python2, using the names
``python3`` and ``python`` respectively. The same goes for ``pip3`` and
``pip``. Using Python2 will likely break things, so make sure that you're using
the right version.
Linux distributions will install Python3 in parallel to Python2, using the
names ``python3`` and ``python`` respectively. The same goes for ``pip3`` and
``pip``. Running Paperless with Python2 will likely break things, so make sure
that you're using the right version.
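If you're ever unsure which interpreter a given ``python`` command is handing
you, asking it directly settles the question; run this with whichever command
you intend to use for Paperless:

.. code:: python

    import sys

    print(sys.version)
    assert sys.version_info.major >= 3, "this is Python 2 -- use python3/pip3"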

For the purposes of simplicity, ``python`` and ``pip`` is used everywhere to
refer to their Python 3 versions.
refer to their Python3 versions.

In addition to the above, there are a number of Python requirements, all of
which are listed in a file called ``requirements.txt`` in the project root.
which are listed in a file called ``requirements.txt`` in the project root
directory.

If you're not working on a virtual environment (like Vagrant or Docker), you
should probably be using a virtualenv, but that's your call. The reasons why
@@ -39,12 +42,13 @@ probably figure that out before continuing.

.. _requirements-apple:

Apple-tastic Complications
--------------------------
Problems with Imagemagick & PDFs
--------------------------------

Some users have `run into problems`_ with installing ImageMagick on Apple
systems using HomeBrew. The solution appears to be to install ghostscript as
well as ImageMagick:
Some users have `run into problems`_ with getting ImageMagick to do its thing
with PDFs. Often this is the case with Apple systems using HomeBrew, but other
Linuxes have been a problem as well. The solution appears to be to install
ghostscript as well as ImageMagick:

.. _run into problems: https://github.com/danielquinn/paperless/issues/25

@@ -67,7 +71,7 @@ dependencies is easy:

    $ pip install --user --requirement /path/to/paperless/requirements.txt

This should download and install all of the requirements into
This will download and install all of the requirements into
``${HOME}/.local``. Remember that your distribution may be using ``pip3`` as
mentioned above.

@@ -86,8 +90,8 @@ enter it, and install the requirements using the ``requirements.txt`` file:

    $ . /path/to/arbitrary/directory/bin/activate
    $ pip install --requirement /path/to/paperless/requirements.txt

Now you're ready to go. Just remember to enter your virtualenv whenever you
want to use Paperless.
Now you're ready to go. Just remember to enter (activate) your virtualenv
whenever you want to use Paperless.

.. _requirements-documentation:
@@ -95,7 +99,7 @@ want to use Paperless.
Documentation
-------------

As generation of the documentation is not required for use of Paperless,
As generation of the documentation is not required for the use of Paperless,
dependencies for this process are not included in ``requirements.txt``. If
you'd like to generate your own docs locally, you'll need to:
||||
|
29
docs/scanners.rst
Normal file
@@ -0,0 +1,29 @@
|
||||
.. _scanners:
|
||||
|
||||
Scanner Recommendations
|
||||
=======================
|
||||
|
||||
As Paperless operates by watching a folder for new files, it doesn't care what
scanner you use, but sometimes finding a scanner that will write to an FTP,
NFS, or SMB server can be difficult. This page is here to help you find one
that works right for you based on recommendations from other Paperless users.
|
||||
|
||||
+---------+----------------+-----+-----+-----+----------------+
| Brand   | Model          | Supports        | Recommended By |
+---------+----------------+-----+-----+-----+----------------+
|         |                | FTP | NFS | SMB |                |
+=========+================+=====+=====+=====+================+
| Brother | `ADS-1500W`_   | yes | no  | yes | `danielquinn`_ |
+---------+----------------+-----+-----+-----+----------------+
| Brother | `MFC-J6930DW`_ | yes |     |     | `ayounggun`_   |
+---------+----------------+-----+-----+-----+----------------+
| Fujitsu | `ix500`_       | yes |     | yes | `eonist`_      |
+---------+----------------+-----+-----+-----+----------------+
|
||||
|
||||
.. _ADS-1500W: https://www.brother.ca/en/p/ads1500w
|
||||
.. _MFC-J6930DW: https://www.brother.ca/en/p/MFCJ6930DW
|
||||
.. _ix500: http://www.fujitsu.com/us/products/computing/peripheral/scanners/scansnap/ix500/
|
||||
|
||||
.. _danielquinn: https://github.com/danielquinn
|
||||
.. _ayounggun: https://github.com/ayounggun
|
||||
.. _eonist: https://github.com/eonist
|
184
docs/setup.rst
@@ -4,7 +4,7 @@ Setup
|
||||
=====
|
||||
|
||||
Paperless isn't a very complicated app, but there are a few components, so some
|
||||
basic documentation is in order. If you go follow along in this document and
|
||||
basic documentation is in order. If you follow along in this document and
|
||||
still have trouble, please open an `issue on GitHub`_ so I can fill in the
|
||||
gaps.
|
||||
|
||||
@@ -28,6 +28,7 @@ or just download the tarball and go that route:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd to the directory where you want to run Paperless
|
||||
$ wget https://github.com/danielquinn/paperless/archive/master.zip
|
||||
$ unzip master.zip
|
||||
$ cd paperless-master
|
||||
@@ -43,7 +44,9 @@ route`_ is quick & easy, but means you're running a VM which comes with memory
|
||||
consumption etc. We also `support Docker`_, which you can use natively under
|
||||
Linux and in a VM with `Docker Machine`_ (this guide was written for native
|
||||
Docker usage under Linux; you might have to adapt it for Docker Machine.)
|
||||
Alternatively the standard, `bare metal`_ approach is a little more
|
||||
Not to forget the virtualenv option: it is similar to `bare metal`_, with the
exception that you have to activate the virtualenv first.
|
||||
Last but not least, the standard `bare metal`_ approach is a little more
|
||||
complicated, but worth it because it makes it easier should you want to
|
||||
contribute some code back.
|
||||
|
||||
@@ -59,9 +62,11 @@ Standard (Bare Metal)
|
||||
.....................
|
||||
|
||||
1. Install the requirements as per the :ref:`requirements <requirements>` page.
|
||||
2. Change to the ``src`` directory in this repo.
|
||||
3. Copy ``paperless.conf.example`` to ``/etc/paperless.conf`` and open it in
|
||||
your favourite editor. Set the values for:
|
||||
2. Within the extracted ``master.zip``, change to the ``src`` directory.
3. Copy ``../paperless.conf.example`` to ``/etc/paperless.conf`` (the virtual
environment setup also looks for it there) and open it in your favourite
editor. Because this file contains passwords, it should only be readable by
the ``root`` and ``paperless`` users (see the permissions sketch after this
list). Set the values for:
|
||||
|
||||
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
|
||||
dumped to be consumed by Paperless.
|
||||
@@ -70,18 +75,19 @@ Standard (Bare Metal)
|
||||
* ``PAPERLESS_OCR_THREADS``: this is the number of threads the OCR process
|
||||
will spawn to process document pages in parallel.
|
||||
|
||||
4. Initialise the database with ``./manage.py migrate``.
|
||||
4. Initialise the SQLite database with ``./manage.py migrate``.
|
||||
5. Create a user for your Paperless instance with
|
||||
``./manage.py createsuperuser``. Follow the prompts to create your user.
|
||||
6. Start the webserver with ``./manage.py runserver <IP>:<PORT>``.
|
||||
If no specifc IP or port are given, the default is ``127.0.0.1:8000``.
|
||||
You should now be able to visit your (empty) `Paperless webserver`_ at
|
||||
``127.0.0.1:8000`` (or whatever you chose). You can login with the
|
||||
user/pass you created in #5.
|
||||
If no specific IP or port is given, the default is ``127.0.0.1:8000``,
also known as http://localhost:8000/.
You should now be able to visit your (empty) `Paperless webserver`_ at that
address, or at whatever address and port you chose. You can log in with the
user/pass you created in #5.
|
||||
7. In a separate window, change to the ``src`` directory in this repo again,
|
||||
but this time, you should start the consumer script with
|
||||
``./manage.py document_consumer``.
|
||||
8. Scan something. Put it in the ``CONSUMPTION_DIR``.
|
||||
8. Scan something or put a file into the ``CONSUMPTION_DIR``.
|
||||
9. Wait a few minutes.
|
||||
10. Visit the document list on your webserver, and it should be there, indexed
|
||||
and downloadable.
|
||||
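
As mentioned in step 3 above, ``/etc/paperless.conf`` contains passwords, so
you may want to restrict who can read it. A minimal sketch (owner and group
are assumptions; adjust them to whichever users run Paperless on your system):

.. code:: bash

# Owner/group are assumptions -- adjust to your setup.
$ sudo chown root:paperless /etc/paperless.conf
$ sudo chmod 640 /etc/paperless.conf
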
@@ -89,48 +95,6 @@ Standard (Bare Metal)
|
||||
.. _Paperless webserver: http://127.0.0.1:8000
|
||||
|
||||
|
||||
.. _setup-installation-vagrant:
|
||||
|
||||
Vagrant Method
|
||||
..............
|
||||
|
||||
1. Install `Vagrant`_. How you do that is really between you and your OS.
|
||||
2. Run ``vagrant up``. An instance will start up for you. When it's ready and
|
||||
provisioned...
|
||||
3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
|
||||
``/etc/paperless.conf`` and set the values for:
|
||||
|
||||
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
|
||||
dumped to be consumed by Paperless.
|
||||
* ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
|
||||
encrypt/decrypt the original document.
|
||||
* ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming
|
||||
documents from mail or via the API. If you don't use either, leaving it
|
||||
blank is just fine.
|
||||
|
||||
4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This
|
||||
updates the environment to make use of the changes you made to the config
|
||||
file.
|
||||
5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
|
||||
6. Still inside your vagrant box, create a user for your Paperless instance
|
||||
with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
|
||||
create your user.
|
||||
7. Start the webserver with
|
||||
``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be
|
||||
able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``.
|
||||
You can login with the user/pass you created in #6.
|
||||
8. In a separate window, run ``vagrant ssh`` again, but this time once inside
|
||||
your vagrant instance, you should start the consumer script with
|
||||
``/opt/paperless/src/manage.py document_consumer``.
|
||||
9. Scan something. Put it in the ``CONSUMPTION_DIR``.
|
||||
10. Wait a few minutes
|
||||
11. Visit the document list on your webserver, and it should be there, indexed
|
||||
and downloadable.
|
||||
|
||||
.. _Vagrant: https://vagrantup.com/
|
||||
.. _Paperless server: http://172.28.128.4:8000
|
||||
|
||||
|
||||
.. _setup-installation-docker:
|
||||
|
||||
Docker Method
|
||||
@@ -169,7 +133,8 @@ Docker Method
|
||||
modified versions of the configuration files.
|
||||
4. Modify ``docker-compose.yml`` to your preferences, following the
|
||||
instructions in comments in the file. The only change that is a hard
|
||||
requirement is to specify where the consumption directory should mount.
|
||||
requirement is to specify where the consumption directory should
|
||||
mount. [#dockercomposeyml]_
|
||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||
|
||||
``PAPERLESS_PASSPHRASE``
|
||||
@@ -186,7 +151,7 @@ Docker Method
|
||||
default English, set this parameter to a space separated list of
|
||||
three-letter language-codes after `ISO 639-2/T`_. For a list of available
|
||||
languages -- including their three letter codes -- see the
|
||||
`Debian packagelist`_.
|
||||
`Alpine packagelist`_.
|
||||
|
||||
``USERMAP_UID`` and ``USERMAP_GID``
|
||||
If you want to mount the consumption volume (directory ``/consume`` within
|
||||
@@ -276,12 +241,60 @@ Docker Method
|
||||
.. _Docker: https://www.docker.com/
|
||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
|
||||
.. _Alpine packagelist: https://pkgs.alpinelinux.org/packages?name=tesseract-ocr-data*&arch=x86_64
|
||||
|
||||
.. [#compose] You of course don't have to use docker-compose, but it
|
||||
simplifies deployment immensely. If you know your way around Docker, feel
|
||||
free to tinker around without using compose!
|
||||
|
||||
.. [#dockercomposeyml] If you're upgrading your docker-compose images from
version 1.1.0 or earlier, you might need to change the
``image: pitkley/paperless`` directive in the ``docker-compose.yml`` file to
``build: ./`` in both the ``webserver`` and ``consumer`` sections, as per the
newer ``docker-compose.yml.example`` file.
|
||||
|
||||
|
||||
.. _setup-installation-vagrant:
|
||||
|
||||
Vagrant Method
|
||||
..............
|
||||
|
||||
1. Install `Vagrant`_. How you do that is really between you and your OS.
|
||||
2. Run ``vagrant up``. An instance will start up for you. When it's ready and
|
||||
provisioned...
|
||||
3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
|
||||
``/etc/paperless.conf`` and set the values for:
|
||||
|
||||
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
|
||||
dumped to be consumed by Paperless.
|
||||
* ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
|
||||
encrypt/decrypt the original document.
|
||||
* ``PAPERLESS_EMAIL_SECRET``: this is the "magic word" used when consuming
|
||||
documents from mail or via the API. If you don't use either, leaving it
|
||||
blank is just fine.
|
||||
|
||||
4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This
|
||||
updates the environment to make use of the changes you made to the config
|
||||
file.
|
||||
5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
|
||||
6. Still inside your vagrant box, create a user for your Paperless instance
|
||||
with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
|
||||
create your user.
|
||||
7. Start the webserver with
|
||||
``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be
|
||||
able to visit your (empty) `Paperless server`_ at ``172.28.128.4:8000``.
|
||||
You can login with the user/pass you created in #6.
|
||||
8. In a separate window, run ``vagrant ssh`` again, but this time once inside
|
||||
your vagrant instance, you should start the consumer script with
|
||||
``/opt/paperless/src/manage.py document_consumer``.
|
||||
9. Scan something. Put it in the ``CONSUMPTION_DIR``.
|
||||
10. Wait a few minutes.
|
||||
11. Visit the document list on your webserver, and it should be there, indexed
|
||||
and downloadable.
|
||||
|
||||
.. _Vagrant: https://vagrantup.com/
|
||||
.. _Paperless server: http://172.28.128.4:8000
|
||||
|
||||
|
||||
.. _setup-permanent:
|
||||
|
||||
@@ -299,17 +312,21 @@ Standard (Bare Metal, Systemd)
|
||||
|
||||
If you're running on a bare metal system that's using Systemd, you can use the
|
||||
service unit files in the ``scripts`` directory to set this up. You'll need to
|
||||
create a user called ``paperless`` and setup Paperless to be in a place that
|
||||
this new user can read and write to. Be sure to edit the service scripts to point
|
||||
to the proper location of your paperless install, referencing the appropriate Python
|
||||
binary. For example: ``ExecStart=/path/to/python3 /path/to/paperless/src/manage.py document_consumer``.
|
||||
If you don't want to make a new user, you can change the ``Group`` and ``User`` variables
|
||||
accordingly.
|
||||
create a user called ``paperless`` (without login, if not already done in #5)
and set up Paperless in a place that this new user can read and write to.
Be sure to edit the service scripts to point to the proper location of your
paperless install, referencing the appropriate Python binary. For example:
``ExecStart=/path/to/python3 /path/to/paperless/src/manage.py document_consumer``.
If you don't want to make a new user, you can change the ``Group`` and ``User``
variables accordingly.
|
||||
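
If you do create the ``paperless`` user, that might look something like this
(the exact command and flags are an assumption and vary by distribution):

.. code:: bash

# Flags are illustrative; use your distribution's preferred tool.
$ sudo useradd --system --no-create-home --shell /usr/sbin/nologin paperless
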
|
||||
Then, you can just tell Systemd as ``root`` (or using ``sudo``) to enable the two ``.service`` files::
|
||||
Then, as ``root`` (or using ``sudo``) you can just copy the ``.service`` files
|
||||
to the Systemd directory and tell it to enable the two services::
|
||||
|
||||
# systemctl enable /path/to/paperless/scripts/paperless-consumer.service
|
||||
# systemctl enable /path/to/paperless/scripts/paperless-webserver.service
|
||||
# cp /path/to/paperless/scripts/paperless-consumer.service /etc/systemd/system/
|
||||
# cp /path/to/paperless/scripts/paperless-webserver.service /etc/systemd/system/
|
||||
# systemctl enable paperless-consumer
|
||||
# systemctl enable paperless-webserver
|
||||
# systemctl start paperless-consumer
|
||||
# systemctl start paperless-webserver
|
||||
|
||||
@@ -335,7 +352,7 @@ after restarting your system:
|
||||
respawn limit 10 5
|
||||
|
||||
script
|
||||
exec /srv/paperless/src/manage.py runserver 0.0.0.0:80
|
||||
exec /srv/paperless/src/manage.py runserver --noreload 0.0.0.0:80
|
||||
end script
|
||||
|
||||
Note that you'll need to replace ``/srv/paperless/src/manage.py`` with the
|
||||
@@ -344,7 +361,7 @@ after restarting your system:
|
||||
If you are using a network interface other than ``eth0``, you will have to
|
||||
change ``IFACE=eth0``. For example, if you are connected via WiFi, you will
|
||||
likely need to replace ``eth0`` above with ``wlan0``. To see all interfaces,
|
||||
run ``ifconfig``.
|
||||
run ``ifconfig -a``.
|
||||
|
||||
Save the file.
|
||||
|
||||
@@ -384,7 +401,10 @@ Using a Real Webserver
|
||||
The default is to use Django's development server, as that's easy and does the
|
||||
job well enough on a home network. However, if you want to do things right,
|
||||
it's probably a good idea to use a webserver capable of handling more than one
|
||||
thread.
|
||||
thread. You will also have to let the webserver serve the static files (CSS,
|
||||
JavaScript) from the directory configured in ``PAPERLESS_STATICDIR``. For that,
|
||||
you need to run ``./manage.py collectstatic`` in the ``src`` directory. The
|
||||
default static files directory is ``../static``.
|
||||
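
For example (the paths are only illustrative):

.. code:: bash

$ cd /path/to/paperless/src
$ ./manage.py collectstatic
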
|
||||
Apache
|
||||
~~~~~~
|
||||
@@ -550,7 +570,8 @@ your gunicorn instance. This should do the trick:
|
||||
Vagrant
|
||||
.......
|
||||
|
||||
You may use the Ubuntu explanation above. Replace ``(local-filesystems and net-device-up IFACE=eth0)`` with ``vagrant-mounted``.
|
||||
You may use the Ubuntu explanation above. Replace
|
||||
``(local-filesystems and net-device-up IFACE=eth0)`` with ``vagrant-mounted``.
|
||||
|
||||
.. _setup-permanent-docker:
|
||||
|
||||
@@ -562,3 +583,28 @@ If you're using Docker, you can set a restart-policy_ in the
|
||||
Docker daemon.
|
||||
|
||||
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
|
||||
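
With an already-running container, that might look something like this (the
container name is an assumption; check ``docker ps`` for the name
docker-compose gave yours):

.. code:: bash

# Container name assumed; adjust to your own.
$ docker update --restart=unless-stopped paperless_webserver_1
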
|
||||
|
||||
.. _setup-subdirectory:
|
||||
|
||||
Hosting Paperless in a Subdirectory
|
||||
-----------------------------------
|
||||
|
||||
Paperless was designed to run off the root of the hosting domain
(i.e. ``https://example.com/``), but with a few changes you can configure it
to run in a subdirectory on your server
(i.e. ``https://example.com/paperless/``).
|
||||
|
||||
Thanks to the efforts of `maphy-psd`_ on `Github`_, running Paperless in a
|
||||
subdirectory is now as easy as setting a config variable. Simply set
|
||||
``PAPERLESS_FORCE_SCRIPT_NAME`` in your environment or
|
||||
``/etc/paperless.conf`` to the path you want Paperless hosted at, configure
|
||||
Nginx/Apache for your needs and you're done. So, if you want Paperless to live
|
||||
at ``https://example.com/arbitrary/path/to/paperless`` then you just set
|
||||
``PAPERLESS_FORCE_SCRIPT_NAME`` to ``/arbitrary/path/to/paperless``. Note the
|
||||
leading ``/`` there.
|
||||
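
In ``/etc/paperless.conf`` that would look something like this (the path here
is just the example from above)::

PAPERLESS_FORCE_SCRIPT_NAME="/arbitrary/path/to/paperless"
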
|
||||
As to how to configure Nginx or Apache for this, that's on you :-)
|
||||
|
||||
.. _maphy-psd: https://github.com/maphy-psd
|
||||
.. _Github: https://github.com/danielquinn/paperless/pull/255
|
||||
|
@@ -33,8 +33,11 @@ The webserver is started via the ``manage.py`` script:
|
||||
By default, the server runs on localhost, port 8000, but you can change this
|
||||
with a few arguments, run ``manage.py --help`` for more information.
|
||||
|
||||
Note that this command runs continuously, so exiting it will mean your webserver
|
||||
disappears. If you want to run this full-time (which is kind of the point)
|
||||
Add the option ``--noreload`` to reduce resource usage. Otherwise, the server
|
||||
continuously polls all source files for changes to auto-reload them.
|
||||
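
For example (the path and address are illustrative):

.. code-block:: shell-session

$ /path/to/paperless/src/manage.py runserver --noreload 0.0.0.0:8000
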
|
||||
Note that when you exit this command, your webserver will disappear.
If you want to run it full-time (which is kind of the point)
|
||||
you'll need to have it start in the background -- something you'll need to
|
||||
figure out for your own system. To get you started though, there are Systemd
|
||||
service files in the ``scripts`` directory.
|
||||
@@ -46,17 +49,18 @@ The Consumer
|
||||
------------
|
||||
|
||||
The consumer script runs in an infinite loop, constantly looking at a directory
|
||||
for PDF files to parse and index. The process is pretty straightforward:
|
||||
for documents to parse and index. The process is pretty straightforward:
|
||||
|
||||
1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not,
|
||||
wait 10 seconds and try again.
|
||||
2. Parse the PDF with Tesseract
|
||||
1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
|
||||
If not, wait 10 seconds and try again. On Linux, new documents are detected
|
||||
instantly via inotify, so there's no waiting involved.
|
||||
2. Parse the document with Tesseract
|
||||
3. Create a new record in the database with the OCR'd text
|
||||
4. Attempt to automatically assign document attributes by doing some guesswork.
|
||||
Read up on the :ref:`guesswork documentation<guesswork>` for more
|
||||
information about this process.
|
||||
5. Encrypt the PDF and store it in the ``media`` directory under
|
||||
``documents/pdf``.
|
||||
5. Encrypt the document and store it in the ``media`` directory under
|
||||
``documents/originals``.
|
||||
6. Go to #1.
|
||||
|
||||
|
||||
@@ -71,8 +75,8 @@ The consumer is started via the ``manage.py`` script:
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_consumer
|
||||
|
||||
This starts the service that will run in a loop, consuming PDF files as they
|
||||
appear in ``CONSUMPTION_DIR``.
|
||||
This starts the service that will consume documents as they appear in
|
||||
``CONSUMPTION_DIR``.
|
||||
|
||||
Note that this command runs continuously, so exiting it will mean your consumer
|
||||
disappears. If you want to run this full-time (which is kind of the point)
|
||||
@@ -80,6 +84,13 @@ you'll need to have it start in the background -- something you'll need to
|
||||
figure out for your own system. To get you started though, there are Systemd
|
||||
service files in the ``scripts`` directory.
|
||||
|
||||
Some command line arguments are available to customize the behavior of the
|
||||
consumer. By default it will use ``/etc/paperless.conf`` values. Display the
|
||||
help with:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_consumer --help
|
||||
|
||||
.. _utilities-exporter:
|
||||
|
||||
@@ -87,8 +98,8 @@ The Exporter
|
||||
------------
|
||||
|
||||
Tired of fiddling with Paperless, or just want to do something stupid and are
|
||||
afraid of accidentally damaging your files? You can export all of your PDFs
|
||||
into neatly named, dated, and unencrypted.
|
||||
afraid of accidentally damaging your files? You can export all of your
|
||||
documents into neatly named, dated, and unencrypted files.
|
||||
|
||||
|
||||
.. _utilities-exporter-howto:
|
||||
@@ -102,10 +113,10 @@ This too is done via the ``manage.py`` script:
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
|
||||
|
||||
This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
|
||||
to do with as you please. The files are accompanied with a special file,
|
||||
``manifest.json`` which can be used to
|
||||
:ref:`import the files <utilities-importer>` at a later date if you wish.
|
||||
This will dump all of your unencrypted documents into ``/path/to/somewhere``
|
||||
for you to do with as you please. The files are accompanied with a special
|
||||
file, ``manifest.json`` which can be used to :ref:`import the files
|
||||
<utilities-importer>` at a later date if you wish.
|
||||
|
||||
|
||||
.. _utilities-exporter-howto-docker:
|
||||
|
@@ -1,11 +1,43 @@
|
||||
# Sample paperless.conf
|
||||
# Copy this file to /etc/paperless.conf and modify it to suit your needs.
|
||||
# As this file contains passwords it should only be readable by the user
|
||||
# running paperless.
|
||||
|
||||
|
||||
###############################################################################
|
||||
#### Paths & Folders ####
|
||||
###############################################################################
|
||||
|
||||
# This is where your documents should go to be consumed. Make sure that it exists
|
||||
# and that the user running the paperless service can read/write its contents
|
||||
# before you start Paperless.
|
||||
PAPERLESS_CONSUMPTION_DIR=""
|
||||
|
||||
|
||||
# You can specify where you want the SQLite database to be stored instead of
|
||||
# the default location of /data/ within the install directory.
|
||||
#PAPERLESS_DBDIR=/path/to/database/file
|
||||
|
||||
|
||||
# Override the default MEDIA_ROOT here. This is where all files are stored.
|
||||
# The default location is /media/documents/ within the install folder.
|
||||
#PAPERLESS_MEDIADIR=/path/to/media
|
||||
|
||||
|
||||
# Override the default STATIC_ROOT here. This is where all static files
|
||||
# created using "collectstatic" manager command are stored.
|
||||
#PAPERLESS_STATICDIR=""
|
||||
|
||||
|
||||
# Override the MEDIA_URL here. Unless you're hosting Paperless under a subpath
|
||||
# like /paperless/, you probably don't need to change this.
|
||||
#PAPERLESS_MEDIA_URL="/media/"
|
||||
|
||||
# Override the STATIC_URL here. Unless you're hosting Paperless under a
# subpath like /paperless/, you probably don't need to change this.
|
||||
#PAPERLESS_STATIC_URL="/static/"
|
||||
|
||||
|
||||
# These values are required if you want paperless to check a particular email
|
||||
# box every 10 minutes and attempt to consume documents from there. If you
|
||||
# don't define a HOST, mail checking will just be disabled.
|
||||
@@ -14,6 +46,19 @@ PAPERLESS_CONSUME_MAIL_PORT=""
|
||||
PAPERLESS_CONSUME_MAIL_USER=""
|
||||
PAPERLESS_CONSUME_MAIL_PASS=""
|
||||
|
||||
# Override the default IMAP inbox here. If not set Paperless defaults to
|
||||
# "INBOX".
|
||||
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
|
||||
|
||||
# Any email sent to the target account that does not contain this text will be
|
||||
# ignored.
|
||||
PAPERLESS_EMAIL_SECRET=""
|
||||
|
||||
|
||||
###############################################################################
|
||||
#### Security ####
|
||||
###############################################################################
|
||||
|
||||
# You must have a passphrase in order for Paperless to work at all. If you set
|
||||
# this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte
|
||||
# file.
|
||||
@@ -28,20 +73,48 @@ PAPERLESS_CONSUME_MAIL_PASS=""
|
||||
# you've since changed it to a new one.
|
||||
PAPERLESS_PASSPHRASE="secret"
|
||||
|
||||
# If you intend to consume documents either via HTTP POST or by email, you must
|
||||
# have a shared secret here.
|
||||
PAPERLESS_SHARED_SECRET=""
|
||||
|
||||
# The secret key has a default that should be fine so long as you're hosting
|
||||
# Paperless on a closed network. However, if you're putting this anywhere
|
||||
# public, you should change the key to something unique and verbose.
|
||||
#PAPERLESS_SECRET_KEY="change-me"
|
||||
|
||||
|
||||
# If you're planning on putting Paperless on the open internet, then you
|
||||
# really should set this value to the domain name you're using. Failing to do
|
||||
# so leaves you open to HTTP host header attacks:
|
||||
# https://docs.djangoproject.com/en/1.10/topics/security/#host-headers-virtual-hosting
|
||||
#
|
||||
# Just remember that this is a comma-separated list, so "example.com" is fine,
|
||||
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
|
||||
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
|
||||
|
||||
# To host paperless under a subpath URL like example.com/paperless, set
|
||||
# this value to /paperless. No trailing slash!
|
||||
#
|
||||
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
|
||||
#PAPERLESS_FORCE_SCRIPT_NAME=""
|
||||
|
||||
# If you are using alternative authentication means or are just using paperless
|
||||
# as a single user on a small private network, this option allows you to disable
|
||||
# user authentication if you set it to "true".
|
||||
#PAPERLESS_DISABLE_LOGIN="false"
|
||||
|
||||
###############################################################################
|
||||
#### Software Tweaks ####
|
||||
###############################################################################
|
||||
|
||||
# After a document is consumed, Paperless can trigger an arbitrary script if
|
||||
# you like. This script will be passed a number of arguments for you to work
|
||||
# with. The default is blank, which means nothing will be executed. For more
|
||||
# information, take a look at the docs: http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
|
||||
# information, take a look at the docs:
|
||||
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
|
||||
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
|
||||
|
||||
|
||||
#
|
||||
# The following values use sensible defaults for modern systems, but if you're
|
||||
# running Paperless on a low-resource machine (like a Raspberry Pi), modifying
|
||||
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
|
||||
# some of these values may be necessary.
|
||||
#
|
||||
|
||||
@@ -51,8 +124,15 @@ PAPERLESS_SHARED_SECRET=""
|
||||
# an integer:
|
||||
#PAPERLESS_OCR_THREADS=1
|
||||
|
||||
|
||||
# Customize the default language that tesseract will attempt to use when
|
||||
# parsing documents. It should be a 3-letter language code consistent with ISO
|
||||
# 639: https://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
|
||||
|
||||
# On smaller systems, or even in the case of Very Large Documents, the consumer
|
||||
# may explode, complaining about how it's "unable to extent pixel cache". In
|
||||
# may explode, complaining about how it's "unable to extend pixel cache". In
|
||||
# such cases, try setting this to a reasonably low value, like 32000000. The
|
||||
# default is to use whatever is necessary to do everything without writing to
|
||||
# disk, and units are in megabytes.
|
||||
@@ -61,16 +141,6 @@ PAPERLESS_SHARED_SECRET=""
|
||||
# the web for "MAGICK_MEMORY_LIMIT".
|
||||
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||
|
||||
# By default the conversion density setting for documents is 300DPI, in some
|
||||
# cases it has proven useful to configure a lesser value.
|
||||
# This setting has a high impact on the physical size of tmp page files,
|
||||
# the speed of document conversion, and can affect the accuracy of OCR
|
||||
# results. Individual results can vary and this setting should be tested
|
||||
# thoroughly against the documents you are importing to see if it has any
|
||||
# impacts either negative or positive. Testing on limited document sets has
|
||||
# shown a setting of 200 can cut the size of tmp files by 1/3, and speed up
|
||||
# conversion by up to 4x with little impact to OCR accuracy.
|
||||
#PAPERLESS_CONVERT_DENSITY=300
|
||||
|
||||
# Similar to the memory limit, if you've got a small system and your OS mounts
|
||||
# /tmp as tmpfs, you should set this to a path that's on a physical disk, like
|
||||
@@ -81,22 +151,45 @@ PAPERLESS_SHARED_SECRET=""
|
||||
# the web for "MAGICK_TMPDIR".
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
|
||||
# You can specify where you want the SQLite database to be stored instead of
|
||||
# the default location
|
||||
#PAPERLESS_DBDIR=/path/to/database/file
|
||||
|
||||
# Override the default MEDIA_ROOT here. This is where all files are stored.
|
||||
#PAPERLESS_MEDIADIR=/path/to/media
|
||||
# By default the conversion density setting for documents is 300DPI, in some
|
||||
# cases it has proven useful to configure a lesser value.
|
||||
# This setting has a high impact on the physical size of tmp page files,
|
||||
# the speed of document conversion, and can affect the accuracy of OCR
|
||||
# results. Individual results can vary and this setting should be tested
|
||||
# thoroughly against the documents you are importing to see if it has any
|
||||
# impacts either negative or positive.
|
||||
# Testing on limited document sets has shown a setting of 200 can cut the
|
||||
# size of tmp files by 1/3, and speed up conversion by up to 4x
|
||||
# with little impact to OCR accuracy.
|
||||
#PAPERLESS_CONVERT_DENSITY=300
|
||||
|
||||
|
||||
# (This setting is ignored on Linux where inotify is used instead of a
|
||||
# polling loop.)
|
||||
# The number of seconds that Paperless will wait between checking
|
||||
# PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory
|
||||
# very slowly, you may want to use a higher value than the default (10).
|
||||
# PAPERLESS_CONSUMER_LOOP_TIME=10
|
||||
# rarely, you may want to use a higher value than the default (10).
|
||||
#PAPERLESS_CONSUMER_LOOP_TIME=10
|
||||
|
||||
# If you're planning on putting Paperless on the open internet, then you
|
||||
# really should set this value to the domain name you're using. Failing to do
|
||||
# so leaves you open to XSS attacks.
|
||||
# Just remember that this is a comma-separated list, so "example.com" is fine,
|
||||
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
|
||||
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
|
||||
|
||||
###############################################################################
|
||||
#### Interface ####
|
||||
###############################################################################
|
||||
|
||||
# Override the default UTC time zone here.
|
||||
# See https://docs.djangoproject.com/en/1.10/ref/settings/#std:setting-TIME_ZONE
|
||||
# for details on how to set it.
|
||||
#PAPERLESS_TIME_ZONE=UTC
|
||||
|
||||
|
||||
# If set, Paperless will show document filters per financial year.
|
||||
# The dates must be in the format "mm-dd", for example "07-15" for July 15.
|
||||
#PAPERLESS_FINANCIAL_YEAR_START="mm-dd"
|
||||
#PAPERLESS_FINANCIAL_YEAR_END="mm-dd"
|
||||
|
||||
|
||||
# The number of items on each page in the web UI. This value must be a
|
||||
# positive integer, but if you don't define one in paperless.conf, a default of
|
||||
# 100 will be used.
|
||||
#PAPERLESS_LIST_PER_PAGE=100
|
||||
|
@@ -1,22 +1,52 @@
|
||||
Django==1.10.4
|
||||
Pillow>=3.1.1
|
||||
django-crispy-forms>=1.6.0
|
||||
django-extensions>=1.6.1
|
||||
django-filter>=1.0
|
||||
djangorestframework>=3.4.4
|
||||
filemagic>=1.6
|
||||
langdetect>=1.0.5
|
||||
pyocr>=0.3.1
|
||||
python-dateutil>=2.4.2
|
||||
python-dotenv>=0.3.0
|
||||
python-gnupg>=0.3.8
|
||||
pytz>=2015.7
|
||||
gunicorn==19.6.0
|
||||
|
||||
# For the tests
|
||||
pytest
|
||||
pytest-django
|
||||
pytest-sugar
|
||||
pep8
|
||||
flake8
|
||||
tox
|
||||
apipkg==1.4
|
||||
attrs==18.1.0
|
||||
certifi==2018.4.16
|
||||
chardet==3.0.4
|
||||
coverage==4.5.1
|
||||
coveralls==1.3.0
|
||||
dateparser==0.7.0
|
||||
django-crispy-forms==1.7.2
|
||||
django-extensions==2.0.7
|
||||
django-filter==1.1.0
|
||||
django-flat-responsive==2.0
|
||||
django==1.11.13
|
||||
djangorestframework==3.8.2
|
||||
docopt==0.6.2
|
||||
execnet==1.5.0
|
||||
factory-boy==2.11.1
|
||||
faker==0.8.15
|
||||
filemagic==1.6
|
||||
flake8==3.5.0
|
||||
fuzzywuzzy==0.15.0
|
||||
gunicorn==19.8.1
|
||||
idna==2.6
|
||||
inotify_simple==1.1.7; sys_platform == 'linux'
|
||||
langdetect==1.0.7
|
||||
mccabe==0.6.1
|
||||
more-itertools==4.1.0
|
||||
pdftotext==2.0.2
|
||||
pillow==5.1.0
|
||||
pluggy==0.6.0
|
||||
py==1.5.3
|
||||
pycodestyle==2.3.1
|
||||
pyflakes==1.6.0
|
||||
pyocr==0.5.1
|
||||
pytest-cov==2.5.1
|
||||
pytest-django==3.2.1
|
||||
pytest-env==0.6.2
|
||||
pytest-forked==0.2
|
||||
pytest-sugar==0.9.1
|
||||
pytest-xdist==1.22.2
|
||||
pytest==3.5.1
|
||||
python-dateutil==2.7.3
|
||||
python-dotenv==0.8.2
|
||||
python-gnupg==0.4.2
|
||||
python-levenshtein==0.12.0
|
||||
pytz==2018.4
|
||||
regex==2018.2.21
|
||||
requests==2.18.4
|
||||
six==1.11.0
|
||||
termcolor==1.1.0
|
||||
text-unidecode==1.2
|
||||
tzlocal==1.5.1
|
||||
urllib3==1.22
|
||||
|
@@ -4,44 +4,61 @@ set -e
|
||||
# Source: https://github.com/sameersbn/docker-gitlab/
|
||||
map_uidgid() {
|
||||
USERMAP_ORIG_UID=$(id -u paperless)
|
||||
USERMAP_ORIG_UID=$(id -g paperless)
|
||||
USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
|
||||
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||
if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then
|
||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
|
||||
groupmod -g ${USERMAP_GID} paperless
|
||||
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
|
||||
USERMAP_ORIG_GID=$(id -g paperless)
|
||||
USERMAP_NEW_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||
USERMAP_NEW_GID=${USERMAP_GID:-${USERMAP_ORIG_GID:-$USERMAP_NEW_UID}}
|
||||
if [[ ${USERMAP_NEW_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_NEW_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_NEW_UID:$USERMAP_NEW_GID"
|
||||
usermod -u "${USERMAP_NEW_UID}" paperless
|
||||
groupmod -g "${USERMAP_NEW_GID}" paperless
|
||||
fi
|
||||
}
|
||||
|
||||
set_permissions() {
|
||||
# Set permissions for consumption directory
|
||||
chgrp paperless "$PAPERLESS_CONSUMPTION_DIR" || {
|
||||
echo "Changing group of consumption directory:"
|
||||
echo " $PAPERLESS_CONSUMPTION_DIR"
|
||||
echo "failed."
|
||||
echo ""
|
||||
echo "Either try to set it on your host-mounted directory"
|
||||
echo "directly, or make sure that the directory has \`o+x\`"
|
||||
echo "permissions and the files in it at least \`o+r\`."
|
||||
} >&2
|
||||
chmod g+x "$PAPERLESS_CONSUMPTION_DIR" || {
|
||||
echo "Changing group permissions of consumption directory:"
|
||||
echo " $PAPERLESS_CONSUMPTION_DIR"
|
||||
echo "failed."
|
||||
echo ""
|
||||
echo "Either try to set it on your host-mounted directory"
|
||||
echo "directly, or make sure that the directory has \`o+x\`"
|
||||
echo "permissions and the files in it at least \`o+r\`."
|
||||
} >&2
|
||||
|
||||
# Set permissions for consumption and export directory
|
||||
for dir in PAPERLESS_CONSUMPTION_DIR PAPERLESS_EXPORT_DIR; do
|
||||
# Extract the name of the current directory from $dir for the error message
|
||||
cur_dir_name=$(echo "$dir" | awk -F'_' '{ print tolower($2); }')
|
||||
chgrp paperless "${!dir}" || {
|
||||
echo "Changing group of ${cur_dir_name} directory:"
|
||||
echo " ${!dir}"
|
||||
echo "failed."
|
||||
echo ""
|
||||
echo "Either try to set it on your host-mounted directory"
|
||||
echo "directly, or make sure that the directory has \`g+wx\`"
|
||||
echo "permissions and the files in it at least \`o+r\`."
|
||||
} >&2
|
||||
chmod g+wx "${!dir}" || {
|
||||
echo "Changing group permissions of ${cur_dir_name} directory:"
|
||||
echo " ${!dir}"
|
||||
echo "failed."
|
||||
echo ""
|
||||
echo "Either try to set it on your host-mounted directory"
|
||||
echo "directly, or make sure that the directory has \`g+wx\`"
|
||||
echo "permissions and the files in it at least \`o+r\`."
|
||||
} >&2
|
||||
done
|
||||
# Set permissions for application directory
|
||||
chown -Rh paperless:paperless /usr/src/paperless
|
||||
}
|
||||
|
||||
migrations() {
|
||||
# A simple lock file in case other containers use this startup
|
||||
LOCKFILE="/usr/src/paperless/data/db.sqlite3.migration"
|
||||
|
||||
# check for and create lock file in one command
|
||||
if (set -o noclobber; echo "$$" > "${LOCKFILE}") 2> /dev/null
|
||||
then
|
||||
trap 'rm -f "${LOCKFILE}"; exit $?' INT TERM EXIT
|
||||
sudo -HEu paperless "/usr/src/paperless/src/manage.py" "migrate"
|
||||
rm ${LOCKFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
initialize() {
|
||||
map_uidgid
|
||||
set_permissions
|
||||
migrations
|
||||
}
|
||||
|
||||
install_languages() {
|
||||
@@ -53,25 +70,24 @@ install_languages() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Update apt-lists
|
||||
apt-get update
|
||||
|
||||
# Loop over languages to be installed
|
||||
for lang in "${langs[@]}"; do
|
||||
pkg="tesseract-ocr-$lang"
|
||||
if dpkg -s "$pkg" 2>&1 > /dev/null; then
|
||||
pkg="tesseract-ocr-data-$lang"
|
||||
|
||||
# English is installed by default
|
||||
if [ "$lang" == "eng" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
if ! apt-cache show "$pkg" 2>&1 > /dev/null; then
|
||||
if apk info -e "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
if ! apk info "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
|
||||
apt-get install "$pkg"
|
||||
apk --no-cache --update add "$pkg"
|
||||
done
|
||||
|
||||
# Remove apt lists
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,7 +4,7 @@ Description=Paperless webserver
|
||||
[Service]
|
||||
User=paperless
|
||||
Group=paperless
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver --noreload 0.0.0.0:8000
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@@ -1,3 +1,6 @@
|
||||
from datetime import datetime
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.models import User, Group
|
||||
from django.core.urlresolvers import reverse
|
||||
@@ -31,34 +34,125 @@ class MonthListFilter(admin.SimpleListFilter):
|
||||
return queryset.filter(created__year=year, created__month=month)
|
||||
|
||||
|
||||
class CorrespondentAdmin(admin.ModelAdmin):
|
||||
class FinancialYearFilter(admin.SimpleListFilter):
|
||||
|
||||
title = "Financial Year"
|
||||
parameter_name = "fy"
|
||||
_fy_wraps = None
|
||||
|
||||
def _fy_start(self, year):
|
||||
"""Return date of the start of financial year for the given year."""
|
||||
fy_start = "{}-{}".format(str(year), settings.FY_START)
|
||||
return datetime.strptime(fy_start, "%Y-%m-%d").date()
|
||||
|
||||
def _fy_end(self, year):
|
||||
"""Return date of the end of financial year for the given year."""
|
||||
fy_end = "{}-{}".format(str(year), settings.FY_END)
|
||||
return datetime.strptime(fy_end, "%Y-%m-%d").date()
|
||||
|
||||
def _fy_does_wrap(self):
|
||||
"""Return whether the financial year spans across two years."""
|
||||
if self._fy_wraps is None:
|
||||
start = "{}".format(settings.FY_START)
|
||||
start = datetime.strptime(start, "%m-%d").date()
|
||||
end = "{}".format(settings.FY_END)
|
||||
end = datetime.strptime(end, "%m-%d").date()
|
||||
self._fy_wraps = end < start
|
||||
|
||||
return self._fy_wraps
|
||||
|
||||
def _determine_fy(self, date):
|
||||
"""Return a (query, display) financial year tuple of the given date."""
|
||||
if self._fy_does_wrap():
|
||||
fy_start = self._fy_start(date.year)
|
||||
|
||||
if date.date() >= fy_start:
|
||||
query = "{}-{}".format(date.year, date.year + 1)
|
||||
else:
|
||||
query = "{}-{}".format(date.year - 1, date.year)
|
||||
|
||||
# To keep it simple we use the same string for both
|
||||
# query parameter and the display.
|
||||
return (query, query)
|
||||
|
||||
else:
|
||||
query = "{0}-{0}".format(date.year)
|
||||
display = "{}".format(date.year)
|
||||
return (query, display)
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
if not settings.FY_START or not settings.FY_END:
|
||||
return None
|
||||
|
||||
r = []
|
||||
for document in Document.objects.all():
|
||||
r.append(self._determine_fy(document.created))
|
||||
|
||||
return sorted(set(r), key=lambda x: x[0], reverse=True)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
if not self.value() or not settings.FY_START or not settings.FY_END:
|
||||
return None
|
||||
|
||||
start, end = self.value().split("-")
|
||||
return queryset.filter(created__gte=self._fy_start(start),
|
||||
created__lte=self._fy_end(end))
|
||||
|
||||
|
||||
class CommonAdmin(admin.ModelAdmin):
|
||||
list_per_page = settings.PAPERLESS_LIST_PER_PAGE
|
||||
|
||||
|
||||
class CorrespondentAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "match", "matching_algorithm")
|
||||
list_filter = ("matching_algorithm",)
|
||||
list_editable = ("match", "matching_algorithm")
|
||||
|
||||
|
||||
class TagAdmin(admin.ModelAdmin):
|
||||
class TagAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "colour", "match", "matching_algorithm")
|
||||
list_filter = ("colour", "matching_algorithm")
|
||||
list_editable = ("colour", "match", "matching_algorithm")
|
||||
|
||||
|
||||
class DocumentAdmin(admin.ModelAdmin):
|
||||
class DocumentAdmin(CommonAdmin):
|
||||
|
||||
class Media:
|
||||
css = {
|
||||
"all": ("paperless.css",)
|
||||
}
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content")
|
||||
list_display = ("created", "correspondent", "title", "tags_", "document")
|
||||
list_filter = ("tags", "correspondent", MonthListFilter)
|
||||
list_per_page = 25
|
||||
search_fields = ("correspondent__name", "title", "content", "tags__name")
|
||||
readonly_fields = ("added",)
|
||||
list_display = ("title", "created", "added", "thumbnail", "correspondent",
|
||||
"tags_")
|
||||
list_filter = ("tags", "correspondent", FinancialYearFilter,
|
||||
MonthListFilter)
|
||||
|
||||
ordering = ["-created", "correspondent"]
|
||||
|
||||
def has_add_permission(self, request):
|
||||
return False
|
||||
|
||||
def created_(self, obj):
|
||||
return obj.created.date().strftime("%Y-%m-%d")
|
||||
created_.short_description = "Created"
|
||||
|
||||
def thumbnail(self, obj):
|
||||
return self._html_tag(
|
||||
"a",
|
||||
self._html_tag(
|
||||
"img",
|
||||
src=reverse("fetch", kwargs={"kind": "thumb", "pk": obj.pk}),
|
||||
width=180,
|
||||
alt="Thumbnail of {}".format(obj.file_name),
|
||||
title=obj.file_name
|
||||
),
|
||||
href=obj.download_url
|
||||
)
|
||||
thumbnail.allow_tags = True
|
||||
|
||||
def tags_(self, obj):
|
||||
r = ""
|
||||
@@ -108,7 +202,7 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
return "<{} {}/>".format(kind, " ".join(attributes))
|
||||
|
||||
|
||||
class LogAdmin(admin.ModelAdmin):
|
||||
class LogAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("created", "message", "level",)
|
||||
list_filter = ("level", "created",)
|
||||
|
@@ -15,13 +15,15 @@ class DocumentsConfig(AppConfig):
|
||||
set_tags,
|
||||
run_pre_consume_script,
|
||||
run_post_consume_script,
|
||||
cleanup_document_deletion
|
||||
cleanup_document_deletion,
|
||||
set_log_entry
|
||||
)
|
||||
|
||||
document_consumption_started.connect(run_pre_consume_script)
|
||||
|
||||
document_consumption_finished.connect(set_tags)
|
||||
document_consumption_finished.connect(set_correspondent)
|
||||
document_consumption_finished.connect(set_log_entry)
|
||||
document_consumption_finished.connect(run_post_consume_script)
|
||||
|
||||
post_delete.connect(cleanup_document_deletion)
|
||||
|
@@ -1,42 +1,30 @@
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import shutil
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import datetime
|
||||
import tempfile
|
||||
import itertools
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
|
||||
import pyocr
|
||||
import langdetect
|
||||
from PIL import Image
|
||||
from operator import itemgetter
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from paperless.db import GnuPG
|
||||
from pyocr.tesseract import TesseractError
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from .models import Tag, Document, FileInfo
|
||||
from .models import Document, FileInfo, Tag
|
||||
from .parsers import ParseError
|
||||
from .signals import (
|
||||
document_consumption_started,
|
||||
document_consumption_finished
|
||||
document_consumer_declaration,
|
||||
document_consumption_finished,
|
||||
document_consumption_started
|
||||
)
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ConsumerError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Consumer(object):
|
||||
class Consumer:
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale pnm
|
||||
@@ -46,249 +34,167 @@ class Consumer(object):
|
||||
5. Delete the document and image(s)
|
||||
"""
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
CONSUME = settings.CONSUMPTION_DIR
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||
# Files are considered ready for consumption if they have been unmodified
|
||||
# for this duration
|
||||
FILES_MIN_UNMODIFIED_DURATION = 0.5
|
||||
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, consume=settings.CONSUMPTION_DIR,
|
||||
scratch=settings.SCRATCH_DIR):
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
|
||||
try:
|
||||
os.makedirs(self.SCRATCH)
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
self.stats = {}
|
||||
self._ignore = []
|
||||
self.consume = consume
|
||||
self.scratch = scratch
|
||||
|
||||
if not self.CONSUME:
|
||||
os.makedirs(self.scratch, exist_ok=True)
|
||||
|
||||
if not self.consume:
|
||||
raise ConsumerError(
|
||||
"The CONSUMPTION_DIR settings variable does not appear to be "
|
||||
"set."
|
||||
)
|
||||
|
||||
if not os.path.exists(self.CONSUME):
|
||||
if not os.path.exists(self.consume):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(self.CONSUME))
|
||||
"Consumption directory {} does not exist".format(self.consume))
|
||||
|
||||
self.parsers = []
|
||||
for response in document_consumer_declaration.send(self):
|
||||
self.parsers.append(response[1])
|
||||
|
||||
if not self.parsers:
|
||||
raise ConsumerError(
|
||||
"No parsers could be found, not even the default. "
|
||||
"This is a problem."
|
||||
)
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
|
||||
def consume(self):
|
||||
def consume_new_files(self):
|
||||
"""
|
||||
Find non-ignored files in consumption dir and consume them if they have
|
||||
been unmodified for FILES_MIN_UNMODIFIED_DURATION.
|
||||
"""
|
||||
ignored_files = []
|
||||
files = []
|
||||
for entry in os.scandir(self.consume):
|
||||
if entry.is_file():
|
||||
file = (entry.path, entry.stat().st_mtime)
|
||||
if file in self._ignore:
|
||||
ignored_files.append(file)
|
||||
else:
|
||||
files.append(file)
|
||||
|
||||
for doc in os.listdir(self.CONSUME):
|
||||
if not files:
|
||||
return
|
||||
|
||||
doc = os.path.join(self.CONSUME, doc)
|
||||
# Set _ignore to only include files that still exist.
|
||||
# This keeps it from growing indefinitely.
|
||||
self._ignore[:] = ignored_files
|
||||
|
||||
if not os.path.isfile(doc):
|
||||
continue
|
||||
files_old_to_new = sorted(files, key=itemgetter(1))
|
||||
|
||||
if not re.match(FileInfo.REGEXES["title"], doc):
|
||||
continue
|
||||
time.sleep(self.FILES_MIN_UNMODIFIED_DURATION)
|
||||
|
||||
if doc in self._ignore:
|
||||
continue
|
||||
for file, mtime in files_old_to_new:
|
||||
if mtime == os.path.getmtime(file):
|
||||
# File has not been modified and can be consumed
|
||||
if not self.try_consume_file(file):
|
||||
self._ignore.append((file, mtime))
|
||||
|
||||
if not self._is_ready(doc):
|
||||
continue
|
||||
def try_consume_file(self, file):
|
||||
"Return True if file was consumed"
|
||||
|
||||
if self._is_duplicate(doc):
|
||||
self.log(
|
||||
"info",
|
||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
||||
)
|
||||
self._ignore.append(doc)
|
||||
continue
|
||||
if not re.match(FileInfo.REGEXES["title"], file):
|
||||
return False
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
doc = file
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
if self._is_duplicate(doc):
|
||||
self.log(
|
||||
"info",
|
||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
||||
)
|
||||
return False
|
||||
|
||||
document_consumption_started.send(
|
||||
parser_class = self._get_parser_class(doc)
|
||||
if not parser_class:
|
||||
self.log(
|
||||
"error", "No parsers could be found for {}".format(doc))
|
||||
return False
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
|
||||
document_consumption_started.send(
|
||||
sender=self.__class__,
|
||||
filename=doc,
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
parsed_document = parser_class(doc)
|
||||
|
||||
try:
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
doc,
|
||||
thumbnail,
|
||||
date
|
||||
)
|
||||
except ParseError as e:
|
||||
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
parsed_document.cleanup()
|
||||
return False
|
||||
else:
|
||||
parsed_document.cleanup()
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
filename=doc,
|
||||
document=document,
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
return True
|
||||
|
||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
imgs = self._get_greyscale(tempdir, doc)
|
||||
thumbnail = self._get_thumbnail(tempdir, doc)
|
||||
|
||||
try:
|
||||
|
||||
document = self._store(self._get_ocr(imgs), doc, thumbnail)
|
||||
|
||||
except OCRError as e:
|
||||
|
||||
self._ignore.append(doc)
|
||||
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
||||
self._cleanup_tempdir(tempdir)
|
||||
|
||||
continue
|
||||
|
||||
else:
|
||||
|
||||
self._cleanup_tempdir(tempdir)
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
def _get_greyscale(self, tempdir, doc):
|
||||
def _get_parser_class(self, doc):
|
||||
"""
|
||||
Greyscale images are easier for Tesseract to OCR
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
||||
options = []
|
||||
for parser in self.parsers:
|
||||
result = parser(doc)
|
||||
if result:
|
||||
options.append(result)
|
||||
|
||||
# Convert PDF to multiple PNMs
|
||||
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-density", str(self.DENSITY),
|
||||
"-depth", "8",
|
||||
"-type", "grayscale",
|
||||
doc, pnm,
|
||||
)
|
||||
|
||||
# Get a list of converted images
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
# Run unpaper in parallel on converted images
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
||||
|
||||
# Return list of converted images, processed with unpaper
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".unpaper.pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||
|
||||
def _get_thumbnail(self, tempdir, doc):
|
||||
"""
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
|
||||
self.log("info", "Generating the thumbnail")
|
||||
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-scale", "500x5000",
|
||||
"-alpha", "remove",
|
||||
doc, os.path.join(tempdir, "convert-%04d.png")
|
||||
)
|
||||
|
||||
return os.path.join(tempdir, "convert-0000.png")
|
||||
|
||||
def _guess_language(self, text):
|
||||
try:
|
||||
guess = langdetect.detect(text)
|
||||
self.log("debug", "Language detected: {}".format(guess))
|
||||
return guess
|
||||
except Exception as e:
|
||||
self.log("warning", "Language detection error: {}".format(e))
|
||||
|
||||
def _get_ocr(self, imgs):
    """
    Attempts to do the best job possible OCR'ing the document based on
    simple language detection trial & error.
    """

    if not imgs:
        raise OCRError("No images found")

    self.log("info", "OCRing the document")

    # Since the division gets rounded down by int, this calculation works
    # for every edge-case, i.e. 1
    middle = int(len(imgs) / 2)
    raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

    guessed_language = self._guess_language(raw_text)

    if not guessed_language or guessed_language not in ISO639:
        self.log("warning", "Language detection failed!")
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "As FORGIVING_OCR is enabled, we're going to make the "
                "best with what we have."
            )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        raise OCRError("Language detection failed")

    if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
        raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
        return raw_text

    try:
        return self._ocr(imgs, ISO639[guessed_language])
    except pyocr.pyocr.tesseract.TesseractError:
        if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(
                    guessed_language
                )
            )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
raise OCRError(
|
||||
"The guessed language is not available in this instance of "
|
||||
"Tesseract."
|
||||
self.log(
|
||||
"info",
|
||||
"Parsers available: {}".format(
|
||||
", ".join([str(o["parser"].__name__) for o in options])
|
||||
)
|
||||
)
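As an aside, the logic above boils down to: OCR only the middle page in the default language, guess the language of that text, and only then decide how to OCR the rest. A minimal standalone sketch of that idea, assuming langdetect is installed and with an invented two-entry ISO639 map standing in for the real langdetect-to-Tesseract code mapping:

# Sketch only, not part of the diff.
import langdetect

ISO639 = {"en": "eng", "de": "deu"}   # made-up subset of the real mapping
DEFAULT_OCR_LANGUAGE = "eng"          # stands in for the configured default


def guess_ocr_language(page_texts):
    # Only the middle page is sampled first; len() // 2 is safe even for a
    # single-page document (it yields index 0).
    middle = len(page_texts) // 2
    try:
        guess = langdetect.detect(page_texts[middle])   # e.g. "de"
    except Exception:
        return DEFAULT_OCR_LANGUAGE
    return ISO639.get(guess, DEFAULT_OCR_LANGUAGE)      # e.g. "deu"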
def _assemble_ocr_sections(self, imgs, middle, text):
|
||||
"""
|
||||
Given a `middle` value and the text that middle page represents, we OCR
|
||||
the remainder of the document and return the whole thing.
|
||||
"""
|
||||
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
if not options:
|
||||
return None
|
||||
|
||||
def _ocr(self, imgs, lang):
|
||||
"""
|
||||
Performs a single OCR attempt.
|
||||
"""
|
||||
# Return the parser with the highest weight.
|
||||
return sorted(
|
||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
if not imgs:
|
||||
return ""
|
||||
|
||||
self.log("info", "Parsing for {}".format(lang))
|
||||
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return strip_excess_whitespace(r)
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
def _store(self, text, doc, thumbnail, date):
|
||||
|
||||
file_info = FileInfo.from_path(doc)
|
||||
|
||||
@@ -296,7 +202,7 @@ class Consumer(object):
|
||||
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
created = file_info.created or timezone.make_aware(
|
||||
created = file_info.created or date or timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
|
||||
with open(doc, "rb") as f:
|
||||
@@ -332,70 +238,12 @@ class Consumer(object):
|
||||
|
||||
return document
|
||||
|
||||
def _cleanup_tempdir(self, d):
    self.log("debug", "Deleting directory {}".format(d))
    shutil.rmtree(d)

def _cleanup_doc(self, doc):
    self.log("debug", "Deleting document {}".format(doc))
    os.unlink(doc)

def _is_ready(self, doc):
    """
    Detect whether `doc` is ready to consume or if it's still being written
    to by the uploader.
    """

    t = os.stat(doc).st_mtime

    if self.stats.get(doc) == t:
        del(self.stats[doc])
        return True

    self.stats[doc] = t

    return False
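_is_ready() treats a file as ready once its mtime is unchanged between two consecutive passes of the consumer loop. The same idea as a tiny standalone helper (file name and poll interval are illustrative):

import os
import time


def wait_until_stable(path, interval=1.0):
    """Block until `path` stops changing between two polls."""
    last = None
    while True:
        mtime = os.stat(path).st_mtime
        if mtime == last:
            return  # two identical mtimes in a row: the upload looks finished
        last = mtime
        time.sleep(interval)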

@staticmethod
def _is_duplicate(doc):
    with open(doc, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    return Document.objects.filter(checksum=checksum).exists()


def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace
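For reference, this is how strip_excess_whitespace() behaves on inputs like the ones exercised by the test suite further down; a sanity check, not part of the change:

print(strip_excess_whitespace("simple    string"))
# -> "simple string"
print(strip_excess_whitespace("simple newline\n   testing string"))
# -> "simple newline\ntesting string"
print(strip_excess_whitespace("trailing whitespace   "))
# -> "trailing whitespace"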


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)


def run_unpaper(args):
    unpaper, pnm = args
    subprocess.Popen(
        (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()


def run_convert(*args):

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    subprocess.Popen(args, env=environment).wait()
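MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR are ImageMagick's own resource variables, so run_convert only needs to copy the environment and inject them before shelling out. A hedged usage sketch mirroring the greyscale conversion earlier in this file (binary name and paths are placeholders):

# Hypothetical invocation; the real calls live in _get_greyscale()/_get_thumbnail().
run_convert(
    "convert",
    "-density", "300",
    "-depth", "8",
    "-type", "grayscale",
    "/tmp/example.pdf", "/tmp/paperless-scratch/convert-%04d.pnm",
)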
@@ -8,7 +8,7 @@ class CorrespondentFilterSet(FilterSet):
|
||||
class Meta(object):
|
||||
model = Correspondent
|
||||
fields = {
|
||||
'name': [
|
||||
"name": [
|
||||
"startswith", "endswith", "contains",
|
||||
"istartswith", "iendswith", "icontains"
|
||||
],
|
||||
@@ -21,7 +21,7 @@ class TagFilterSet(FilterSet):
|
||||
class Meta(object):
|
||||
model = Tag
|
||||
fields = {
|
||||
'name': [
|
||||
"name": [
|
||||
"startswith", "endswith", "contains",
|
||||
"istartswith", "iendswith", "icontains"
|
||||
],
|
||||
|
@@ -2,7 +2,6 @@ import magic
|
||||
import os
|
||||
|
||||
from datetime import datetime
|
||||
from hashlib import sha256
|
||||
from time import mktime
|
||||
|
||||
from django import forms
|
||||
@@ -14,7 +13,6 @@ from .consumer import Consumer
|
||||
|
||||
class UploadForm(forms.Form):
|
||||
|
||||
SECRET = settings.SHARED_SECRET
|
||||
TYPE_LOOKUP = {
|
||||
"application/pdf": Document.TYPE_PDF,
|
||||
"image/png": Document.TYPE_PNG,
|
||||
@@ -32,10 +30,9 @@ class UploadForm(forms.Form):
|
||||
required=False
|
||||
)
|
||||
document = forms.FileField()
|
||||
signature = forms.CharField(max_length=256)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
forms.Form.__init__(*args, **kwargs)
|
||||
forms.Form.__init__(self, *args, **kwargs)
|
||||
self._file_type = None
|
||||
|
||||
def clean_correspondent(self):
|
||||
@@ -82,17 +79,6 @@ class UploadForm(forms.Form):
|
||||
|
||||
return document
|
||||
|
||||
def clean(self):
|
||||
|
||||
corresp = self.clened_data.get("correspondent")
|
||||
title = self.cleaned_data.get("title")
|
||||
signature = self.cleaned_data.get("signature")
|
||||
|
||||
if sha256(corresp + title + self.SECRET).hexdigest() == signature:
|
||||
return self.cleaned_data
|
||||
|
||||
raise forms.ValidationError("The signature provided did not validate")
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Since the consumer already does a lot of work, it's easier just to save
|
||||
@@ -100,13 +86,13 @@ class UploadForm(forms.Form):
|
||||
form do that as well. Think of it as a poor-man's queue server.
|
||||
"""
|
||||
|
||||
correspondent = self.clened_data.get("correspondent")
|
||||
correspondent = self.cleaned_data.get("correspondent")
|
||||
title = self.cleaned_data.get("title")
|
||||
document = self.cleaned_data.get("document")
|
||||
|
||||
t = int(mktime(datetime.now()))
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
file_name = os.path.join(
|
||||
Consumer.CONSUME,
|
||||
settings.CONSUMPTION_DIR,
|
||||
"{} - {}.{}".format(correspondent, title, self._file_type)
|
||||
)
|
||||
|
||||
|
@@ -13,7 +13,6 @@ from dateutil import parser
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from .consumer import Consumer
|
||||
from .models import Correspondent
|
||||
|
||||
|
||||
@@ -21,7 +20,7 @@ class MailFetcherError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class InvalidMessageError(Exception):
|
||||
class InvalidMessageError(MailFetcherError):
|
||||
pass
|
||||
|
||||
|
||||
@@ -43,7 +42,7 @@ class Message(Loggable):
|
||||
and n attachments, and that we don't care about the message body.
|
||||
"""
|
||||
|
||||
SECRET = settings.SHARED_SECRET
|
||||
SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
|
||||
|
||||
def __init__(self, data, group=None):
|
||||
"""
|
||||
@@ -76,6 +75,9 @@ class Message(Loggable):
|
||||
continue
|
||||
|
||||
dispositions = content_disposition.strip().split(";")
|
||||
if len(dispositions) < 2:
|
||||
continue
|
||||
|
||||
if not dispositions[0].lower() == "attachment" and \
|
||||
"filename" not in dispositions[1].lower():
|
||||
continue
|
||||
@@ -148,20 +150,23 @@ class Attachment(object):
|
||||
|
||||
class MailFetcher(Loggable):
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, consume=settings.CONSUMPTION_DIR):
|
||||
|
||||
Loggable.__init__(self)
|
||||
|
||||
self._connection = None
|
||||
self._host = settings.MAIL_CONSUMPTION["HOST"]
|
||||
self._port = settings.MAIL_CONSUMPTION["PORT"]
|
||||
self._username = settings.MAIL_CONSUMPTION["USERNAME"]
|
||||
self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
|
||||
self._inbox = settings.MAIL_CONSUMPTION["INBOX"]
|
||||
self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
|
||||
self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
|
||||
self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
|
||||
self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
|
||||
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
|
||||
|
||||
self._enabled = bool(self._host)
|
||||
if self._enabled and Message.SECRET is None:
|
||||
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
|
||||
|
||||
self.last_checked = datetime.datetime.now()
|
||||
self.last_checked = time.time()
|
||||
self.consume = consume
|
||||
|
||||
def pull(self):
|
||||
"""
|
||||
@@ -182,12 +187,12 @@ class MailFetcher(Loggable):
|
||||
self.log("info", 'Storing email: "{}"'.format(message.subject))
|
||||
|
||||
t = int(time.mktime(message.time.timetuple()))
|
||||
file_name = os.path.join(Consumer.CONSUME, message.file_name)
|
||||
file_name = os.path.join(self.consume, message.file_name)
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(message.attachment.data)
|
||||
os.utime(file_name, times=(t, t))
|
||||
|
||||
self.last_checked = datetime.datetime.now()
|
||||
self.last_checked = time.time()
|
||||
|
||||
def _get_messages(self):
|
||||
|
||||
@@ -205,7 +210,7 @@ class MailFetcher(Loggable):
|
||||
self._connection.close()
|
||||
self._connection.logout()
|
||||
|
||||
except Exception as e:
|
||||
except MailFetcherError as e:
|
||||
self.log("error", str(e))
|
||||
|
||||
return r
|
||||
@@ -219,7 +224,7 @@ class MailFetcher(Loggable):
|
||||
if not login[0] == "OK":
|
||||
raise MailFetcherError("Can't log into mail: {}".format(login[1]))
|
||||
|
||||
inbox = self._connection.select("INBOX")
|
||||
inbox = self._connection.select(self._inbox)
|
||||
if not inbox[0] == "OK":
|
||||
raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
|
||||
|
||||
|
@@ -1,6 +1,7 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
from django.conf import settings
|
||||
@@ -9,6 +10,11 @@ from django.core.management.base import BaseCommand, CommandError
|
||||
from ...consumer import Consumer, ConsumerError
|
||||
from ...mail import MailFetcher, MailFetcherError
|
||||
|
||||
try:
|
||||
from inotify_simple import INotify, flags
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
"""
|
||||
@@ -16,9 +22,6 @@ class Command(BaseCommand):
|
||||
consumption directory, and fetch any mail available.
|
||||
"""
|
||||
|
||||
LOOP_TIME = settings.CONSUMER_LOOP_TIME
|
||||
MAIL_DELTA = datetime.timedelta(minutes=10)
|
||||
|
||||
ORIGINAL_DOCS = os.path.join(settings.MEDIA_ROOT, "documents", "originals")
|
||||
THUMB_DOCS = os.path.join(settings.MEDIA_ROOT, "documents", "thumbnails")
|
||||
|
||||
@@ -28,44 +31,117 @@ class Command(BaseCommand):
|
||||
|
||||
self.file_consumer = None
|
||||
self.mail_fetcher = None
|
||||
self.first_iteration = True
|
||||
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
"directory",
|
||||
default=settings.CONSUMPTION_DIR,
|
||||
nargs="?",
|
||||
help="The consumption directory."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--loop-time",
|
||||
default=settings.CONSUMER_LOOP_TIME,
|
||||
type=int,
|
||||
help="Wait time between each loop (in seconds)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mail-delta",
|
||||
default=10,
|
||||
type=int,
|
||||
help="Wait time between each mail fetch (in minutes)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--oneshot",
|
||||
action="store_true",
|
||||
help="Run only once."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-inotify",
|
||||
action="store_true",
|
||||
help="Don't use inotify, even if it's available."
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
directory = options["directory"]
|
||||
loop_time = options["loop_time"]
|
||||
mail_delta = options["mail_delta"] * 60
|
||||
use_inotify = (not options["no_inotify"]
|
||||
and "inotify_simple" in sys.modules)
|
||||
|
||||
try:
|
||||
self.file_consumer = Consumer()
|
||||
self.mail_fetcher = MailFetcher()
|
||||
self.file_consumer = Consumer(consume=directory)
|
||||
self.mail_fetcher = MailFetcher(consume=directory)
|
||||
except (ConsumerError, MailFetcherError) as e:
|
||||
raise CommandError(e)
|
||||
|
||||
for path in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except FileExistsError:
|
||||
pass
|
||||
for d in (self.ORIGINAL_DOCS, self.THUMB_DOCS):
|
||||
os.makedirs(d, exist_ok=True)
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
"Starting document consumer at {}".format(settings.CONSUMPTION_DIR)
|
||||
"Starting document consumer at {}{}".format(
|
||||
directory,
|
||||
" with inotify" if use_inotify else ""
|
||||
)
|
||||
)
|
||||
|
||||
try:
|
||||
while True:
|
||||
self.loop()
|
||||
time.sleep(self.LOOP_TIME)
|
||||
if self.verbosity > 1:
|
||||
print(".")
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting")
|
||||
if options["oneshot"]:
|
||||
self.loop_step(mail_delta)
|
||||
else:
|
||||
try:
|
||||
if use_inotify:
|
||||
self.loop_inotify(mail_delta)
|
||||
else:
|
||||
self.loop(loop_time, mail_delta)
|
||||
except KeyboardInterrupt:
|
||||
print("Exiting")
|
||||
|
||||
def loop(self):
|
||||
def loop(self, loop_time, mail_delta):
|
||||
while True:
|
||||
start_time = time.time()
|
||||
if self.verbosity > 1:
|
||||
print(".", int(start_time))
|
||||
self.loop_step(mail_delta, start_time)
|
||||
# Sleep until the start of the next loop step
|
||||
time.sleep(max(0, start_time + loop_time - time.time()))
|
||||
|
||||
# Consume whatever files we can
|
||||
self.file_consumer.consume()
|
||||
def loop_step(self, mail_delta, time_now=None):
|
||||
|
||||
# Occasionally fetch mail and store it to be consumed on the next loop
|
||||
delta = self.mail_fetcher.last_checked + self.MAIL_DELTA
|
||||
if delta < datetime.datetime.now():
|
||||
# We fetch email when we first start up so that it is not necessary to
|
||||
# wait for 10 minutes after making changes to the config file.
|
||||
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||
if self.first_iteration or time_now > next_mail_time:
|
||||
self.first_iteration = False
|
||||
self.mail_fetcher.pull()
|
||||
|
||||
self.file_consumer.consume_new_files()
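The reworked loop sleeps only for the remainder of each tick, and loop_step() fetches mail on the very first pass so a config change does not have to wait out a full mail delta. A stripped-down sketch of that scheduling, with dummy callables standing in for the consumer and the mail fetcher:

import time


def run(loop_time, mail_delta, consume, fetch_mail):
    last_mail = 0.0
    first = True
    while True:
        start = time.time()
        if first or start > last_mail + mail_delta:
            first = False
            fetch_mail()
            last_mail = start
        consume()
        # Sleep only for whatever is left of this tick.
        time.sleep(max(0, start + loop_time - time.time()))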
def loop_inotify(self, mail_delta):
|
||||
directory = self.file_consumer.consume
|
||||
inotify = INotify()
|
||||
inotify.add_watch(directory, flags.CLOSE_WRITE | flags.MOVED_TO)
|
||||
|
||||
# Run initial mail fetch and consume all currently existing documents
|
||||
self.loop_step(mail_delta)
|
||||
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||
|
||||
while True:
|
||||
# Consume documents until next_mail_time
|
||||
while True:
|
||||
delta = next_mail_time - time.time()
|
||||
if delta > 0:
|
||||
for event in inotify.read(timeout=delta):
|
||||
file = os.path.join(directory, event.name)
|
||||
if os.path.isfile(file):
|
||||
self.file_consumer.try_consume_file(file)
|
||||
else:
|
||||
break
|
||||
|
||||
self.mail_fetcher.pull()
|
||||
next_mail_time = self.mail_fetcher.last_checked + mail_delta
|
||||
|
src/documents/management/commands/document_correspondents.py (new file, 82 lines)
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.models import Correspondent, Document
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
||||
help = """
|
||||
Using the current set of correspondent rules, apply said rules to all
|
||||
documents in the database, effectively allowing you to back-tag all
|
||||
previously indexed documents with correspondent created (or modified)
|
||||
after their initial import.
|
||||
""".replace(" ", "")
|
||||
|
||||
TOO_MANY_CONTINUE = (
|
||||
"Detected {} potential correspondents for {}, so we've opted for {}")
|
||||
TOO_MANY_SKIP = (
|
||||
"Detected {} potential correspondents for {}, so we're skipping it")
|
||||
CHANGE_MESSAGE = (
|
||||
'Document {}: "{}" was given the correspondent id {}: "{}"')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
"--use-first",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="By default this command won't try to assign a correspondent "
|
||||
"if more than one matches the document. Use this flag if "
|
||||
"you'd rather it just pick the first one it finds."
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.filter(correspondent__isnull=True):
|
||||
|
||||
potential_correspondents = list(
|
||||
Correspondent.match_all(document.content))
|
||||
|
||||
if not potential_correspondents:
|
||||
continue
|
||||
|
||||
potential_count = len(potential_correspondents)
|
||||
correspondent = potential_correspondents[0]
|
||||
|
||||
if potential_count > 1:
|
||||
if not options["use_first"]:
|
||||
print(
|
||||
self.TOO_MANY_SKIP.format(potential_count, document),
|
||||
file=sys.stderr
|
||||
)
|
||||
continue
|
||||
print(
|
||||
self.TOO_MANY_CONTINUE.format(
|
||||
potential_count,
|
||||
document,
|
||||
correspondent
|
||||
),
|
||||
file=sys.stderr
|
||||
)
|
||||
|
||||
document.correspondent = correspondent
|
||||
document.save(update_fields=("correspondent",))
|
||||
|
||||
print(
|
||||
self.CHANGE_MESSAGE.format(
|
||||
document.pk,
|
||||
document.title,
|
||||
correspondent.pk,
|
||||
correspondent.name
|
||||
),
|
||||
file=sys.stderr
|
||||
)
|
@@ -10,6 +10,7 @@ from documents.models import Document, Correspondent, Tag
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
@@ -61,15 +62,24 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
target = os.path.join(self.target, document.file_name)
|
||||
document_dict["__exported_file_name__"] = target
|
||||
file_target = os.path.join(self.target, document.file_name)
|
||||
|
||||
print("Exporting: {}".format(target))
|
||||
thumbnail_name = document.file_name + "-thumbnail.png"
|
||||
thumbnail_target = os.path.join(self.target, thumbnail_name)
|
||||
|
||||
with open(target, "wb") as f:
|
||||
document_dict[EXPORTER_FILE_NAME] = document.file_name
|
||||
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
|
||||
|
||||
print("Exporting: {}".format(file_target))
|
||||
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
with open(file_target, "wb") as f:
|
||||
f.write(GnuPG.decrypted(document.source_file))
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
os.utime(target, times=(t, t))
|
||||
os.utime(file_target, times=(t, t))
|
||||
|
||||
with open(thumbnail_target, "wb") as f:
|
||||
f.write(GnuPG.decrypted(document.thumbnail_file))
|
||||
os.utime(thumbnail_target, times=(t, t))
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", Correspondent.objects.all()))
|
||||
|
@@ -10,6 +10,8 @@ from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
||||
@@ -70,13 +72,13 @@ class Command(Renderable, BaseCommand):
|
||||
if not record["model"] == "documents.document":
|
||||
continue
|
||||
|
||||
if "__exported_file_name__" not in record:
|
||||
if EXPORTER_FILE_NAME not in record:
|
||||
raise CommandError(
|
||||
'The manifest file contains a record which does not '
|
||||
'refer to an actual document file.'
|
||||
)
|
||||
|
||||
doc_file = record["__exported_file_name__"]
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
if not os.path.exists(os.path.join(self.source, doc_file)):
|
||||
raise CommandError(
|
||||
'The manifest file refers to "{}" which does not '
|
||||
@@ -90,10 +92,21 @@ class Command(Renderable, BaseCommand):
|
||||
if not record["model"] == "documents.document":
|
||||
continue
|
||||
|
||||
doc_file = record["__exported_file_name__"]
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
|
||||
document = Document.objects.get(pk=record["pk"])
|
||||
with open(doc_file, "rb") as unencrypted:
|
||||
|
||||
document_path = os.path.join(self.source, doc_file)
|
||||
thumbnail_path = os.path.join(self.source, thumb_file)
|
||||
|
||||
with open(document_path, "rb") as unencrypted:
|
||||
with open(document.source_path, "wb") as encrypted:
|
||||
print("Encrypting {} and saving it to {}".format(
|
||||
doc_file, document.source_path))
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
with open(thumbnail_path, "rb") as unencrypted:
|
||||
with open(document.thumbnail_path, "wb") as encrypted:
|
||||
print("Encrypting {} and saving it to {}".format(
|
||||
thumb_file, document.thumbnail_path))
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
@@ -50,7 +50,7 @@ class GroupConcat(models.Aggregate):
|
||||
|
||||
def _get_template(self, separator):
|
||||
if self.engine == self.ENGINE_MYSQL:
|
||||
return "%(function)s(%(expressions)s, SEPARATOR '{}')".format(
|
||||
return "%(function)s(%(expressions)s SEPARATOR '{}')".format(
|
||||
separator)
|
||||
return "%(function)s(%(expressions)s, '{}')".format(separator)
|
||||
|
||||
|
@@ -3,6 +3,7 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -19,7 +20,7 @@ class Migration(migrations.Migration):
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('sender', models.CharField(blank=True, db_index=True, max_length=128)),
|
||||
('title', models.CharField(blank=True, db_index=True, max_length=128)),
|
||||
('content', models.TextField(db_index=True)),
|
||||
('content', models.TextField(db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]))),
|
||||
('created', models.DateTimeField(auto_now_add=True)),
|
||||
('modified', models.DateTimeField(auto_now=True)),
|
||||
],
|
||||
|
@@ -47,7 +47,11 @@ class Migration(migrations.Migration):
|
||||
],
|
||||
),
|
||||
migrations.RunPython(move_sender_strings_to_sender_model),
|
||||
migrations.AlterField(
|
||||
migrations.RemoveField(
|
||||
model_name='document',
|
||||
name='sender',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='sender',
|
||||
field=models.ForeignKey(blank=True, on_delete=django.db.models.deletion.CASCADE, to='documents.Sender'),
|
||||
|
@@ -38,6 +38,9 @@ class GnuPG(object):
|
||||
|
||||
def move_documents_and_create_thumbnails(apps, schema_editor):
|
||||
|
||||
os.makedirs(os.path.join(settings.MEDIA_ROOT, "documents", "originals"), exist_ok=True)
|
||||
os.makedirs(os.path.join(settings.MEDIA_ROOT, "documents", "thumbnails"), exist_ok=True)
|
||||
|
||||
documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))
|
||||
|
||||
if set(documents) == {"originals", "thumbnails"}:
|
||||
|
src/documents/migrations/0016_auto_20170325_1558.py (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.5 on 2017-03-25 15:58
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0015_add_insensitive_to_match'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='content',
|
||||
field=models.TextField(blank=True, db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]), help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
|
||||
),
|
||||
]
|
src/documents/migrations/0017_auto_20170512_0507.py (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.5 on 2017-05-12 05:07
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0016_auto_20170325_1558'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='correspondent',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='matching_algorithm',
|
||||
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
|
||||
),
|
||||
]
|
src/documents/migrations/0018_auto_20170715_1712.py (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.5 on 2017-07-15 17:12
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0017_auto_20170512_0507'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='correspondent',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.Correspondent'),
|
||||
),
|
||||
]
|
src/documents/migrations/0019_add_consumer_user.py (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.5 on 2017-07-15 17:12
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def forwards_func(apps, schema_editor):
|
||||
User.objects.create(username="consumer")
|
||||
|
||||
|
||||
def reverse_func(apps, schema_editor):
|
||||
User.objects.get(username="consumer").delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0018_auto_20170715_1712'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(forwards_func, reverse_func),
|
||||
]
|
src/documents/migrations/0020_document_added.py (new file, 27 lines)
@@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
def set_added_time_to_created_time(apps, schema_editor):
|
||||
Document = apps.get_model("documents", "Document")
|
||||
for doc in Document.objects.all():
|
||||
doc.added = doc.created
|
||||
doc.save()
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0019_add_consumer_user'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='added',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False),
|
||||
),
|
||||
migrations.RunPython(set_added_time_to_created_time)
|
||||
]
|
@@ -1,4 +1,4 @@
|
||||
class Renderable(object):
|
||||
class Renderable:
|
||||
"""
|
||||
A handy mixin to make it easier/cleaner to print output based on a
|
||||
verbosity value.
|
||||
|
@@ -1,3 +1,5 @@
|
||||
# coding=utf-8
|
||||
|
||||
import dateutil.parser
|
||||
import logging
|
||||
import os
|
||||
@@ -5,6 +7,7 @@ import re
|
||||
import uuid
|
||||
|
||||
from collections import OrderedDict
|
||||
from fuzzywuzzy import fuzz
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.urlresolvers import reverse
|
||||
@@ -21,11 +24,13 @@ class MatchingModel(models.Model):
|
||||
MATCH_ALL = 2
|
||||
MATCH_LITERAL = 3
|
||||
MATCH_REGEX = 4
|
||||
MATCH_FUZZY = 5
|
||||
MATCHING_ALGORITHMS = (
|
||||
(MATCH_ANY, "Any"),
|
||||
(MATCH_ALL, "All"),
|
||||
(MATCH_LITERAL, "Literal"),
|
||||
(MATCH_REGEX, "Regular Expression"),
|
||||
(MATCH_FUZZY, "Fuzzy Match"),
|
||||
)
|
||||
|
||||
name = models.CharField(max_length=128, unique=True)
|
||||
@@ -42,8 +47,11 @@ class MatchingModel(models.Model):
|
||||
"provided appear in the PDF, albeit not in the order provided. A "
|
||||
"\"literal\" match means that the text you enter must appear in "
|
||||
"the PDF exactly as you've entered it, and \"regular expression\" "
|
||||
"uses a regex to match the PDF. If you don't know what a regex "
|
||||
"is, you probably don't want this option."
|
||||
"uses a regex to match the PDF. (If you don't know what a regex "
|
||||
"is, you probably don't want this option.) Finally, a \"fuzzy "
|
||||
"match\" looks for words or phrases that are mostly—but not "
|
||||
"exactly—the same, which can be useful for matching against "
|
||||
"documents containg imperfections that foil accurate OCR."
|
||||
)
|
||||
)
|
||||
|
||||
@@ -83,7 +91,7 @@ class MatchingModel(models.Model):
|
||||
search_kwargs = {"flags": re.IGNORECASE}
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ALL:
|
||||
for word in self.match.split(" "):
|
||||
for word in self._split_match():
|
||||
search_result = re.search(
|
||||
r"\b{}\b".format(word), text, **search_kwargs)
|
||||
if not search_result:
|
||||
@@ -91,7 +99,7 @@ class MatchingModel(models.Model):
|
||||
return True
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ANY:
|
||||
for word in self.match.split(" "):
|
||||
for word in self._split_match():
|
||||
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
|
||||
return True
|
||||
return False
|
||||
@@ -104,8 +112,34 @@ class MatchingModel(models.Model):
|
||||
return bool(re.search(
|
||||
re.compile(self.match, **search_kwargs), text))
|
||||
|
||||
if self.matching_algorithm == self.MATCH_FUZZY:
|
||||
match = re.sub(r'[^\w\s]', '', self.match)
|
||||
text = re.sub(r'[^\w\s]', '', text)
|
||||
if self.is_insensitive:
|
||||
match = match.lower()
|
||||
text = text.lower()
|
||||
|
||||
return True if fuzz.partial_ratio(match, text) >= 90 else False
|
||||
|
||||
raise NotImplementedError("Unsupported matching algorithm")
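A small illustration of the new fuzzy option, assuming fuzzywuzzy is installed; the sample strings are invented, and 90 is the threshold used above:

from fuzzywuzzy import fuzz

match = "acme power tools invoice"
text = "ACME Pwoer Tools Invoice no 2231 paid in full".lower()
score = fuzz.partial_ratio(match, text)   # an integer score out of 100
print(score >= 90)                        # the same cut-off applied above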
def _split_match(self):
|
||||
"""
|
||||
Splits the match to individual keywords, getting rid of unnecessary
|
||||
spaces and grouping quoted words together.
|
||||
|
||||
Example:
|
||||
' some random words "with quotes " and spaces'
|
||||
==>
|
||||
["some", "random", "words", "with\s+quotes", "and", "spaces"]
|
||||
"""
|
||||
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
|
||||
normspace = re.compile(r"\s+").sub
|
||||
return [
|
||||
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
|
||||
for t in findterms(self.match)
|
||||
]
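Running the two regexes above on the docstring's example input reproduces the documented result; the quoted phrase keeps its internal spacing as a \s+ pattern:

import re

findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub

match = '  some random words "with   quotes  " and   spaces'
print([
    normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
    for t in findterms(match)
])
# -> ['some', 'random', 'words', 'with\\s+quotes', 'and', 'spaces']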
def save(self, *args, **kwargs):
|
||||
|
||||
self.match = self.match.lower()
|
||||
@@ -157,14 +191,28 @@ class Document(models.Model):
|
||||
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
|
||||
|
||||
correspondent = models.ForeignKey(
|
||||
Correspondent, blank=True, null=True, related_name="documents")
|
||||
Correspondent,
|
||||
blank=True,
|
||||
null=True,
|
||||
related_name="documents",
|
||||
on_delete=models.SET_NULL
|
||||
)
|
||||
|
||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||
content = models.TextField(db_index=True)
|
||||
|
||||
content = models.TextField(
|
||||
db_index=True,
|
||||
blank=True,
|
||||
help_text="The raw, text-only data of the document. This field is "
|
||||
"primarily used for searching."
|
||||
)
|
||||
|
||||
file_type = models.CharField(
|
||||
max_length=4,
|
||||
editable=False,
|
||||
choices=tuple([(t, t.upper()) for t in TYPES])
|
||||
)
|
||||
|
||||
tags = models.ManyToManyField(
|
||||
Tag, related_name="documents", blank=True)
|
||||
|
||||
@@ -181,6 +229,8 @@ class Document(models.Model):
|
||||
default=timezone.now, db_index=True)
|
||||
modified = models.DateTimeField(
|
||||
auto_now=True, editable=False, db_index=True)
|
||||
added = models.DateTimeField(
|
||||
default=timezone.now, editable=False, db_index=True)
|
||||
|
||||
class Meta(object):
|
||||
ordering = ("correspondent", "title")
|
||||
@@ -271,7 +321,7 @@ class Log(models.Model):
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
|
||||
class FileInfo(object):
|
||||
class FileInfo:
|
||||
|
||||
# This epic regex *almost* worked for our needs, so I'm keeping it here for
|
||||
# posterity, in the hopes that we might find a way to make it work one day.
|
||||
@@ -292,45 +342,45 @@ class FileInfo(object):
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title-tags", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-correspondent-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title-tags", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)?"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("title", re.compile(
|
||||
r"(?P<title>.*)"
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff)$",
|
||||
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
|
||||
flags=re.IGNORECASE
|
||||
))
|
||||
])
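These patterns are how FileInfo pulls metadata out of incoming file names (the change above just lets the extension group accept tif as well as tiff). A quick check of the correspondent-title-tags form against an invented file name:

import re

pattern = re.compile(
    r"(?P<correspondent>.*) - "
    r"(?P<title>.*) - "
    r"(?P<tags>[a-z0-9\-,]*)"
    r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
    flags=re.IGNORECASE
)
m = pattern.match("ACME Inc - Invoice 2017 - finance,paid.pdf")
print(m.groupdict())
# -> {'correspondent': 'ACME Inc', 'title': 'Invoice 2017',
#     'tags': 'finance,paid', 'extension': 'pdf'}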
|
||||
@@ -346,7 +396,10 @@ class FileInfo(object):
|
||||
|
||||
@classmethod
|
||||
def _get_created(cls, created):
|
||||
return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
|
||||
try:
|
||||
return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _get_correspondent(cls, name):
|
||||
@@ -373,6 +426,8 @@ class FileInfo(object):
|
||||
r = extension.lower()
|
||||
if r == "jpeg":
|
||||
return "jpg"
|
||||
if r == "tif":
|
||||
return "tiff"
|
||||
return r
|
||||
|
||||
@classmethod
|
||||
|
src/documents/parsers.py (new file, 51 lines)
@@ -0,0 +1,51 @@
|
||||
import logging
import shutil
import tempfile

from django.conf import settings


class ParseError(Exception):
    pass


class DocumentParser:
    """
    Subclass this to make your own parser. Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=self.SCRATCH)
        self.logger = logging.getLogger(__name__)
        self.logging_group = None

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()

    def get_date(self):
        """
        Returns the date of the document.
        """
        raise NotImplementedError()

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
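parsers.py only defines the interface, so a parser plug-in would subclass DocumentParser along these lines (everything below is an invented example, not the real paperless_tesseract parser):

from documents.parsers import DocumentParser, ParseError


class ExampleTextParser(DocumentParser):
    """Hypothetical parser for plain .txt files."""

    def get_thumbnail(self):
        # A real parser would render an image into self.tempdir and return
        # its path; see paperless_tesseract.parsers for the PDF case.
        raise ParseError("thumbnails not supported in this sketch")

    def get_text(self):
        with open(self.document_path, encoding="utf-8") as f:
            return f.read()

    def get_date(self):
        # The interface allows a date parsed out of the document; this
        # sketch simply declines to guess.
        return None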
@@ -18,12 +18,21 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
|
||||
"id", "slug", "name", "colour", "match", "matching_algorithm")
|
||||
|
||||
|
||||
class CorrespondentField(serializers.HyperlinkedRelatedField):
|
||||
def get_queryset(self):
|
||||
return Correspondent.objects.all()
|
||||
|
||||
|
||||
class TagsField(serializers.HyperlinkedRelatedField):
|
||||
def get_queryset(self):
|
||||
return Tag.objects.all()
|
||||
|
||||
|
||||
class DocumentSerializer(serializers.ModelSerializer):
|
||||
|
||||
correspondent = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:correspondent-detail", allow_null=True)
|
||||
tags = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:tag-detail", many=True)
|
||||
correspondent = CorrespondentField(
|
||||
view_name="drf:correspondent-detail", allow_null=True)
|
||||
tags = TagsField(view_name="drf:tag-detail", many=True)
|
||||
|
||||
class Meta(object):
|
||||
model = Document
|
||||
|
src/documents/settings.py (new file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
# Defines the names of file/thumbnail for the manifest
|
||||
# for exporting/importing commands
|
||||
EXPORTER_FILE_NAME = "__exported_file_name__"
|
||||
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
|
@@ -2,3 +2,4 @@ from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal(providing_args=["filename"])
|
||||
document_consumption_finished = Signal(providing_args=["document"])
|
||||
document_consumer_declaration = Signal(providing_args=[])
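The new document_consumer_declaration signal is what parser apps answer to register themselves: judging by the consumer and the tests further down, each handler returns a callable that, given a document path, either declines or hands back a weight and a parser class, and the consumer keeps the highest weight. A hedged sketch of such a handler, reusing the hypothetical ExampleTextParser sketched next to parsers.py above:

from django.dispatch import receiver

from documents.signals import document_consumer_declaration


def example_consumer(doc):
    # Called with the file path; answer only for files this parser understands.
    if doc.lower().endswith(".txt"):
        return {
            "weight": 0,                  # the consumer keeps the highest weight
            "parser": ExampleTextParser,  # hypothetical, sketched above
        }
    return None


@receiver(document_consumer_declaration)
def example_consumer_declaration(sender, **kwargs):
    return example_consumer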
@@ -1,9 +1,12 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
from subprocess import Popen
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.admin.models import ADDITION, LogEntry
|
||||
from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.utils import timezone
|
||||
|
||||
from ..models import Correspondent, Document, Tag
|
||||
|
||||
@@ -94,3 +97,18 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
|
||||
os.unlink(f)
|
||||
except FileNotFoundError:
|
||||
pass # The file's already gone, so we're cool with it.
|
||||
|
||||
|
||||
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
|
||||
ct = ContentType.objects.get(model="document")
|
||||
user = User.objects.get(username="consumer")
|
||||
|
||||
LogEntry.objects.create(
|
||||
action_flag=ADDITION,
|
||||
action_time=timezone.now(),
|
||||
content_type=ct,
|
||||
object_id=document.id,
|
||||
user=user,
|
||||
object_repr=document.__str__(),
|
||||
)
|
||||
|
@@ -10,3 +10,14 @@ td a.tag {
|
||||
margin: 1px;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
#result_list th.column-note {
|
||||
text-align: right;
|
||||
}
|
||||
#result_list td.field-note {
|
||||
text-align: right;
|
||||
}
|
||||
#result_list td textarea {
|
||||
width: 90%;
|
||||
height: 5em;
|
||||
}
|
@@ -0,0 +1,13 @@
|
||||
{% extends 'admin/change_form.html' %}
|
||||
|
||||
|
||||
{% block footer %}
|
||||
|
||||
{{ block.super }}
|
||||
|
||||
{# Hack to force Django to make the created date a date input rather than `text` (the default) #}
|
||||
<script>
|
||||
django.jQuery(".field-created input").first().attr("type", "date")
|
||||
</script>
|
||||
|
||||
{% endblock footer %}
|
@@ -0,0 +1,12 @@
|
||||
{% extends 'admin/change_list.html' %}
|
||||
|
||||
|
||||
{% load admin_actions from admin_list%}
|
||||
{% load result_list from hacks %}
|
||||
|
||||
|
||||
{% block result_list %}
|
||||
{% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %}
|
||||
{% result_list cl %}
|
||||
{% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %}
|
||||
{% endblock %}
|
@@ -0,0 +1,194 @@
|
||||
{% load i18n %}
|
||||
|
||||
<style>
|
||||
.grid *, .grid *:after, .grid *:before {
|
||||
-webkit-box-sizing: border-box;
|
||||
-moz-box-sizing: border-box;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.box {
|
||||
width: 12.5%;
|
||||
padding: 1em;
|
||||
float: left;
|
||||
opacity: 0.7;
|
||||
transition: all 0.5s;
|
||||
}
|
||||
.box:hover {
|
||||
opacity: 1;
|
||||
transition: all 0.5s;
|
||||
}
|
||||
.box:last-of-type {
|
||||
padding-right: 0;
|
||||
}
|
||||
.result {
|
||||
border: 1px solid #cccccc;
|
||||
border-radius: 2%;
|
||||
overflow: hidden;
|
||||
height: 300px;
|
||||
}
|
||||
.result .header {
|
||||
padding: 5px;
|
||||
background-color: #79AEC8;
|
||||
position: relative;
|
||||
}
|
||||
.result .header .checkbox {
|
||||
width: 5%;
|
||||
float: left;
|
||||
position: absolute;
|
||||
z-index: 2;
|
||||
}
|
||||
.result .header .info {
|
||||
margin-left: 10%;
|
||||
position: relative;
|
||||
}
|
||||
.headerLink {
|
||||
cursor: pointer;
|
||||
opacity: 0;
|
||||
z-index: 1;
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
}
|
||||
.header > a {
|
||||
z-index: 2;
|
||||
margin-left: 10%;
|
||||
position: relative;
|
||||
}
|
||||
.result .header a,
|
||||
.result a.tag {
|
||||
color: #ffffff;
|
||||
}
|
||||
.result .date {
|
||||
padding: 5px;
|
||||
}
|
||||
.result .tags {
|
||||
float: left;
|
||||
}
|
||||
.result .tags a.tag {
|
||||
padding: 2px 5px;
|
||||
border-radius: 2px;
|
||||
display: inline-block;
|
||||
margin: 2px;
|
||||
}
|
||||
.result .date {
|
||||
float: right;
|
||||
color: #cccccc;
|
||||
}
|
||||
.result .image img {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.grid {
|
||||
margin-right: 260px;
|
||||
}
|
||||
.grid:after {
|
||||
content: "";
|
||||
display: table;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
@media (max-width: 1600px) {
|
||||
.box {
|
||||
width: 25%
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 991px) {
|
||||
.grid {
|
||||
margin-right: 220px;
|
||||
}
|
||||
.box {
|
||||
width: 50%
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 767px) {
|
||||
.grid {
|
||||
margin-right: 0;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 500px) {
|
||||
.box {
|
||||
width: 100%
|
||||
}
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
|
||||
{# This is just copypasta from the parent change_list_results.html file #}
|
||||
<table id="result_list">
|
||||
<thead>
|
||||
<tr>
|
||||
{% for header in result_headers %}
|
||||
<th scope="col" {{ header.class_attrib }}>
|
||||
{% if header.sortable %}
|
||||
{% if header.sort_priority > 0 %}
|
||||
<div class="sortoptions">
|
||||
<a class="sortremove" href="{{ header.url_remove }}" title="{% trans "Remove from sorting" %}"></a>
|
||||
{% if num_sorted_fields > 1 %}<span class="sortpriority" title="{% blocktrans with priority_number=header.sort_priority %}Sorting priority: {{ priority_number }}{% endblocktrans %}">{{ header.sort_priority }}</span>{% endif %}
|
||||
<a href="{{ header.url_toggle }}" class="toggle {% if header.ascending %}ascending{% else %}descending{% endif %}" title="{% trans "Toggle sorting" %}"></a>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<div class="text">{% if header.sortable %}<a href="{{ header.url_primary }}">{{ header.text|capfirst }}</a>{% else %}<span>{{ header.text|capfirst }}</span>{% endif %}</div>
|
||||
<div class="clear"></div>
|
||||
</th>{% endfor %}
|
||||
</tr>
|
||||
</thead>
|
||||
</table>
|
||||
{# /copypasta #}
|
||||
|
||||
|
||||
<div class="grid">
|
||||
{% for result in results %}
|
||||
{# 0: Checkbox #}
|
||||
{# 1: Title #}
|
||||
{# 2: Date #}
|
||||
{# 3: Added #}
|
||||
{# 4: Image #}
|
||||
{# 5: Correspondent #}
|
||||
{# 6: Tags #}
|
||||
{# 7: Document edit url #}
|
||||
<div class="box">
|
||||
<div class="result">
|
||||
<div class="header">
|
||||
{% comment %}
|
||||
The purpose of 'headerLink' is to make the whole header
|
||||
background clickable.
|
||||
We use an onclick handler here instead of a native link ('<a>')
|
||||
to allow selecting (and copying) the overlying doc title text
|
||||
with the mouse cursor.
|
||||
If the title link were layered upon another link ('<a>'), title text
|
||||
selection would not be possible with mouse click + drag. Instead,
|
||||
the underlying link would be dragged.
|
||||
{% endcomment %}
|
||||
<div class="headerLink" onclick="location.href='{{ result.7 }}';"></div>
|
||||
<div class="checkbox">{{ result.0 }}</div>
|
||||
<div class="info">
|
||||
{{ result.5 }}
|
||||
</div>
|
||||
{{ result.1 }}
|
||||
<div style="clear: both;"></div>
|
||||
</div>
|
||||
<div class="tags">{{ result.6 }}</div>
|
||||
<div class="date">{{ result.2 }}</div>
|
||||
<div style="clear: both;"></div>
|
||||
<div class="image">{{ result.4 }}</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
|
||||
<script>
|
||||
// We need to re-build the select-all functionality as the old logic pointed
|
||||
// to a table and we're using divs now.
|
||||
django.jQuery("#action-toggle").on("change", function(){
|
||||
django.jQuery(".grid .box .result .checkbox input")
|
||||
.prop("checked", this.checked);
|
||||
});
|
||||
</script>
|
src/documents/templates/admin/index.html (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
{% extends "admin/index.html" %}
|
||||
|
||||
|
||||
{% load i18n static %}
|
||||
|
||||
|
||||
{# This whole block is here just to override the `get_admin_log` line so #}
|
||||
{# that the log entries aren't limited to the current user #}
|
||||
{% block sidebar %}
|
||||
<div id="content-related">
|
||||
<div class="module" id="recent-actions-module">
|
||||
<h2>{% trans 'Recent actions' %}</h2>
|
||||
<h3>{% trans 'My actions' %}</h3>
|
||||
{% load log %}
|
||||
{% get_admin_log 10 as admin_log %}
|
||||
{% if not admin_log %}
|
||||
<p>{% trans 'None available' %}</p>
|
||||
{% else %}
|
||||
<ul class="actionlist">
|
||||
{% for entry in admin_log %}
|
||||
<li class="{% if entry.is_addition %}addlink{% endif %}{% if entry.is_change %}changelink{% endif %}{% if entry.is_deletion %}deletelink{% endif %}">
|
||||
{% if entry.is_deletion or not entry.get_admin_url %}
|
||||
{{ entry.object_repr }}
|
||||
{% else %}
|
||||
<a href="{{ entry.get_admin_url }}">{{ entry.object_repr }}</a>
|
||||
{% endif %}
|
||||
<br/>
|
||||
{% if entry.content_type %}
|
||||
<span class="mini quiet">{% filter capfirst %}{{ entry.content_type }}{% endfilter %}</span>
|
||||
{% else %}
|
||||
<span class="mini quiet">{% trans 'Unknown content' %}</span>
|
||||
{% endif %}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
@@ -6,5 +6,6 @@
|
||||
<meta charset="utf-8">
|
||||
</head>
|
||||
<body>
|
||||
{# One day someone (maybe even myself) is going to write a proper web front-end for Paperless, and this is where it'll start. #}
|
||||
</body>
|
||||
</html>
|
||||
|
src/documents/templatetags/__init__.py (new file, 0 lines)
src/documents/templatetags/hacks.py (new file, 43 lines)
@@ -0,0 +1,43 @@
|
||||
import re
|
||||
|
||||
from django.contrib.admin.templatetags.admin_list import (
|
||||
result_headers,
|
||||
result_hidden_fields,
|
||||
results
|
||||
)
|
||||
from django.template import Library
|
||||
|
||||
|
||||
EXTRACT_URL = re.compile(r'href="(.*?)"')
|
||||
|
||||
register = Library()
|
||||
|
||||
|
||||
@register.inclusion_tag("admin/documents/document/change_list_results.html")
|
||||
def result_list(cl):
|
||||
"""
|
||||
Copy/pasted from django.contrib.admin.templatetags.admin_list just so I can
|
||||
modify the value passed to `.inclusion_tag()` in the decorator here. There
|
||||
must be a cleaner way... right?
|
||||
"""
|
||||
headers = list(result_headers(cl))
|
||||
num_sorted_fields = 0
|
||||
for h in headers:
|
||||
if h['sortable'] and h['sorted']:
|
||||
num_sorted_fields += 1
|
||||
return {'cl': cl,
|
||||
'result_hidden_fields': list(result_hidden_fields(cl)),
|
||||
'result_headers': headers,
|
||||
'num_sorted_fields': num_sorted_fields,
|
||||
'results': map(add_doc_edit_url, results(cl))}
|
||||
|
||||
|
||||
def add_doc_edit_url(result):
|
||||
"""
|
||||
Make the document edit URL accessible to the view as a separate item
|
||||
"""
|
||||
title = result[1]
|
||||
match = re.search(EXTRACT_URL, title)
|
||||
edit_doc_url = match.group(1)
|
||||
result.append(edit_doc_url)
|
||||
return result
|
src/documents/tests/factories.py (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
import factory
|
||||
|
||||
from ..models import Document, Correspondent
|
||||
|
||||
|
||||
class CorrespondentFactory(factory.DjangoModelFactory):
|
||||
|
||||
class Meta:
|
||||
model = Correspondent
|
||||
|
||||
name = factory.Faker("name")
|
||||
|
||||
|
||||
class DocumentFactory(factory.DjangoModelFactory):
|
||||
|
||||
class Meta:
|
||||
model = Document
|
@@ -1,22 +1,71 @@
|
||||
import os
|
||||
from unittest import mock, skipIf
|
||||
|
||||
import pyocr
|
||||
from django.test import TestCase
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
from unittest import mock
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from ..consumer import Consumer
|
||||
from ..models import FileInfo
|
||||
from ..consumer import image_to_string, strip_excess_whitespace
|
||||
|
||||
|
||||
class TestConsumer(TestCase):
|
||||
|
||||
class DummyParser(object):
|
||||
pass
|
||||
|
||||
def test__get_parser_class_1_parser(self):
|
||||
self.assertEqual(
|
||||
self._get_consumer()._get_parser_class("doc.pdf"),
|
||||
self.DummyParser
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def test__get_parser_class_n_parsers(self, m, *args):
|
||||
|
||||
class DummyParser1(object):
|
||||
pass
|
||||
|
||||
class DummyParser2(object):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
||||
)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertEqual(
|
||||
Consumer(consume=tmpdir)._get_parser_class("doc.pdf"),
|
||||
DummyParser2
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def test__get_parser_class_0_parsers(self, m, *args):
|
||||
m.return_value = ((None, lambda _: None),)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertIsNone(
|
||||
Consumer(consume=tmpdir)._get_parser_class("doc.pdf")
|
||||
)
|
||||
|
||||
@mock.patch("documents.consumer.os.makedirs")
|
||||
@mock.patch("documents.consumer.os.path.exists", return_value=True)
|
||||
@mock.patch("documents.consumer.document_consumer_declaration.send")
|
||||
def _get_consumer(self, m, *args):
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": self.DummyParser}),
|
||||
)
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
return Consumer(consume=tmpdir)
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
|
||||
TAGS = ("tag1", "tag2", "tag3")
|
||||
EXTENSIONS = (
|
||||
"pdf", "png", "jpg", "jpeg", "gif",
|
||||
"PDF", "PNG", "JPG", "JPEG", "GIF",
|
||||
"PdF", "PnG", "JpG", "JPeG", "GiF",
|
||||
"pdf", "png", "jpg", "jpeg", "gif", "tiff", "tif",
|
||||
"PDF", "PNG", "JPG", "JPEG", "GIF", "TIFF", "TIF",
|
||||
"PdF", "PnG", "JpG", "JPeG", "GiF", "TiFf", "TiF",
|
||||
)
|
||||
|
||||
def _test_guess_attributes_from_name(self, path, sender, title, tags):
|
||||
@@ -36,6 +85,8 @@ class TestAttributes(TestCase):
|
||||
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
|
||||
if extension.lower() == "jpeg":
|
||||
self.assertEqual(file_info.extension, "jpg", f)
|
||||
elif extension.lower() == "tif":
|
||||
self.assertEqual(file_info.extension, "tiff", f)
|
||||
else:
|
||||
self.assertEqual(file_info.extension, extension.lower(), f)
|
||||
|
||||
@@ -225,11 +276,13 @@ class TestFieldPermutations(TestCase):
|
||||
|
||||
def test_created_and_correspondent_and_title_and_tags(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{correspondent} - "
|
||||
"{title} - "
|
||||
"{tags}"
|
||||
".{extension}")
|
||||
template = (
|
||||
"/path/to/{created} - "
|
||||
"{correspondent} - "
|
||||
"{title} - "
|
||||
"{tags}"
|
||||
".{extension}"
|
||||
)
|
||||
|
||||
for created in self.valid_dates:
|
||||
for correspondent in self.valid_correspondents:
|
||||
@@ -248,10 +301,7 @@ class TestFieldPermutations(TestCase):
|
||||
|
||||
def test_created_and_correspondent_and_title(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{correspondent} - "
|
||||
"{title}"
|
||||
".{extension}")
|
||||
template = "/path/to/{created} - {correspondent} - {title}.{extension}"
|
||||
|
||||
for created in self.valid_dates:
|
||||
for correspondent in self.valid_correspondents:
|
||||
@@ -274,9 +324,7 @@ class TestFieldPermutations(TestCase):
|
||||
|
||||
def test_created_and_title(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{title}"
|
||||
".{extension}")
|
||||
template = "/path/to/{created} - {title}.{extension}"
|
||||
|
||||
for created in self.valid_dates:
|
||||
for title in self.valid_titles:
|
||||
@@ -291,10 +339,7 @@ class TestFieldPermutations(TestCase):
|
||||
|
||||
def test_created_and_title_and_tags(self):
|
||||
|
||||
template = ("/path/to/{created} - "
|
||||
"{title} - "
|
||||
"{tags}"
|
||||
".{extension}")
|
||||
template = "/path/to/{created} - {title} - {tags}.{extension}"
|
||||
|
||||
for created in self.valid_dates:
|
||||
for title in self.valid_titles:
|
||||
@@ -309,70 +354,7 @@ class TestFieldPermutations(TestCase):
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class FakeTesseract(object):
|
||||
|
||||
@staticmethod
|
||||
def can_detect_orientation():
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def detect_orientation(file_handle, lang):
|
||||
raise OtherTesseractError("arbitrary status", "message")
|
||||
|
||||
@staticmethod
|
||||
def image_to_string(file_handle, lang):
|
||||
return "This is test text"
|
||||
|
||||
|
||||
class FakePyOcr(object):
|
||||
|
||||
@staticmethod
|
||||
def get_available_tools():
|
||||
return [FakeTesseract]
|
||||
|
||||
|
||||
class TestOCR(TestCase):
|
||||
|
||||
text_cases = [
|
||||
("simple string", "simple string"),
|
||||
(
|
||||
"simple newline\n testing string",
|
||||
"simple newline\ntesting string"
|
||||
),
|
||||
(
|
||||
"utf-8 строка с пробелами в конце ",
|
||||
"utf-8 строка с пробелами в конце"
|
||||
)
|
||||
]
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
|
||||
|
||||
def test_strip_excess_whitespace(self):
|
||||
for source, result in self.text_cases:
|
||||
actual_result = strip_excess_whitespace(source)
|
||||
self.assertEqual(
|
||||
result,
|
||||
actual_result,
|
||||
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
|
||||
source,
|
||||
result,
|
||||
actual_result
|
||||
)
|
||||
)
|
||||
|
||||
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
|
||||
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
|
||||
@mock.patch("documents.consumer.pyocr", FakePyOcr)
|
||||
def test_image_to_string_with_text_free_page(self):
|
||||
"""
|
||||
This test is sort of silly, since it's really just reproducing an odd
|
||||
exception thrown by pyocr when it encounters a page with no text.
|
||||
Actually running this test against an installation of Tesseract results
|
||||
in a segmentation fault rooted somewhere deep inside pyocr where I
|
||||
don't care to dig. Regardless, if you run the consumer normally,
|
||||
text-free pages are now handled correctly so long as we work around
|
||||
this weird exception.
|
||||
"""
|
||||
image_to_string(["no-text.png", "en"])
|
||||
def test_invalid_date_format(self):
|
||||
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
|
||||
self.assertEqual(info.title, "title")
|
||||
self.assertIsNone(info.created)
|
||||
|
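A note on the stacked `@mock.patch` decorators used throughout these consumer tests: decorators are applied bottom-up, so only the innermost patch (`document_consumer_declaration.send`) is bound to `m`, while the remaining mocks are swallowed by `*args`. A minimal, self-contained sketch of that ordering (the `Thing` class and its attributes are illustrative, not from Paperless):

```python
from unittest import mock


class Thing:
    # Stand-ins for the attributes the real tests patch.
    @staticmethod
    def send():
        return "send"

    @staticmethod
    def exists():
        return "exists"


@mock.patch.object(Thing, "exists")   # applied last, so its mock lands in *args
@mock.patch.object(Thing, "send")     # closest decorator -> first mock argument
def check(m, *args):
    # m replaces Thing.send; the patch for Thing.exists is the lone entry in args.
    return m is Thing.send and len(args) == 1


print(check())  # True
```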
@@ -3,6 +3,8 @@ from django.test import TestCase

from ..management.commands.document_importer import Command

from documents.settings import EXPORTER_FILE_NAME


class TestImporter(TestCase):

@@ -27,7 +29,7 @@ class TestImporter(TestCase):

        cmd.manifest = [{
            "model": "documents.document",
            "__exported_file_name__": "noexist.pdf"
            EXPORTER_FILE_NAME: "noexist.pdf"
        }]
        # self.assertRaises(CommandError, cmd._check_manifest)
        with self.assertRaises(CommandError) as cm:
@@ -1,6 +1,8 @@
from random import randint

from django.test import TestCase
from django.contrib.admin.models import LogEntry
from django.contrib.auth.models import User
from django.test import TestCase, override_settings

from ..models import Correspondent, Document, Tag
from ..signals import document_consumption_finished
@@ -16,9 +18,15 @@ class TestMatching(TestCase):
                matching_algorithm=getattr(klass, algorithm)
            )
            for string in true:
                self.assertTrue(instance.matches(string))
                self.assertTrue(
                    instance.matches(string),
                    '"%s" should match "%s" but it does not' % (text, string)
                )
            for string in false:
                self.assertFalse(instance.matches(string))
                self.assertFalse(
                    instance.matches(string),
                    '"%s" should not match "%s" but it does' % (text, string)
                )

    def test_match_all(self):

@@ -54,6 +62,21 @@ class TestMatching(TestCase):
            )
        )

        self._test_matching(
            'brown fox "lazy dogs"',
            "MATCH_ALL",
            (
                "the quick brown fox jumped over the lazy dogs",
                "the quick brown fox jumped over the lazy dogs",
            ),
            (
                "the quick fox jumped over the lazy dogs",
                "the quick brown wolf jumped over the lazy dogs",
                "the quick brown fox jumped over the fat dogs",
                "the quick brown fox jumped over the lazy... dogs",
            )
        )

    def test_match_any(self):

        self._test_matching(
@@ -89,6 +112,18 @@ class TestMatching(TestCase):
            )
        )

        self._test_matching(
            '"brown fox" " lazy dogs "',
            "MATCH_ANY",
            (
                "the quick brown fox",
                "jumped over the lazy dogs.",
            ),
            (
                "the lazy fox jumped over the brown dogs",
            )
        )

    def test_match_literal(self):

        self._test_matching(
@@ -149,8 +184,25 @@ class TestMatching(TestCase):
            )
        )

    def test_match_fuzzy(self):

class TestApplications(TestCase):
        self._test_matching(
            "Springfield, Miss.",
            "MATCH_FUZZY",
            (
                "1220 Main Street, Springf eld, Miss.",
                "1220 Main Street, Spring field, Miss.",
                "1220 Main Street, Springfeld, Miss.",
                "1220 Main Street Springfield Miss",
            ),
            (
                "1220 Main Street, Springfield, Mich.",
            )
        )


@override_settings(POST_CONSUME_SCRIPT=None)
class TestDocumentConsumptionFinishedSignal(TestCase):
    """
    We make use of document_consumption_finished, so we should test that it's
    doing what we expect wrt to tag & correspondent matching.
@@ -158,6 +210,7 @@ class TestApplications(TestCase):

    def setUp(self):
        TestCase.setUp(self)
        User.objects.create_user(username='test_consumer', password='12345')
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", file_type="pdf")

@@ -194,3 +247,9 @@ class TestApplications(TestCase):
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertEqual(self.doc_contains.correspondent, None)

    def test_logentry_created(self):
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)

        self.assertEqual(LogEntry.objects.count(), 1)
31
src/documents/tests/test_models.py
Normal file
@@ -0,0 +1,31 @@
from django.test import TestCase

from ..models import Document, Correspondent
from .factories import DocumentFactory, CorrespondentFactory


class CorrespondentTestCase(TestCase):

    def test___str__(self):
        for s in ("test", "οχι", "test with fun_charÅc'\"terß"):
            correspondent = CorrespondentFactory.create(name=s)
            self.assertEqual(str(correspondent), s)


class DocumentTestCase(TestCase):

    def test_correspondent_deletion_does_not_cascade(self):

        self.assertEqual(Correspondent.objects.all().count(), 0)
        correspondent = CorrespondentFactory.create()
        self.assertEqual(Correspondent.objects.all().count(), 1)

        self.assertEqual(Document.objects.all().count(), 0)
        DocumentFactory.create(correspondent=correspondent)
        self.assertEqual(Document.objects.all().count(), 1)
        self.assertIsNotNone(Document.objects.all().first().correspondent)

        correspondent.delete()
        self.assertEqual(Correspondent.objects.all().count(), 0)
        self.assertEqual(Document.objects.all().count(), 1)
        self.assertIsNone(Document.objects.all().first().correspondent)
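The deletion test above passes when `Document.correspondent` is a nullable relation that is set to NULL instead of cascading on delete. The actual model definition is outside this diff, so the following is only a sketch of the kind of field that produces the asserted behaviour (model and field names assumed from the tests):

```python
from django.db import models


class Correspondent(models.Model):
    name = models.CharField(max_length=128)


class Document(models.Model):
    # Deleting the correspondent must not delete the document, so the
    # relation is nullable and set to NULL on delete rather than cascading.
    correspondent = models.ForeignKey(
        Correspondent,
        blank=True,
        null=True,
        on_delete=models.SET_NULL,
        related_name="documents",
    )
```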
@@ -1,17 +1,16 @@
from django.contrib.auth.mixins import LoginRequiredMixin
from django.http import HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.generic import DetailView, FormView, TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.filters import SearchFilter, OrderingFilter
from paperless.db import GnuPG
from paperless.mixins import SessionOrBasicAuthMixin
from paperless.views import StandardPagination
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
    DestroyModelMixin,
    ListModelMixin,
    RetrieveModelMixin,
    UpdateModelMixin
)
from rest_framework.pagination import PageNumberPagination
from rest_framework.permissions import IsAuthenticated
from rest_framework.viewsets import (
    GenericViewSet,
@@ -31,17 +30,10 @@ from .serialisers import (


class IndexView(TemplateView):

    template_name = "documents/index.html"

    def get_context_data(self, **kwargs):
        print(kwargs)
        print(self.request.GET)
        print(self.request.POST)
        return TemplateView.get_context_data(self, **kwargs)


class FetchView(LoginRequiredMixin, DetailView):
class FetchView(SessionOrBasicAuthMixin, DetailView):

    model = Document

@@ -74,28 +66,19 @@ class FetchView(LoginRequiredMixin, DetailView):
        return response


class PushView(LoginRequiredMixin, FormView):
class PushView(SessionOrBasicAuthMixin, FormView):
    """
    A crude REST-ish API for creating documents.
    """

    form_class = UploadForm

    @classmethod
    def as_view(cls, **kwargs):
        return csrf_exempt(FormView.as_view(**kwargs))

    def form_valid(self, form):
        return HttpResponse("1")
        form.save()
        return HttpResponse("1", status=202)

    def form_invalid(self, form):
        return HttpResponse("0")


class StandardPagination(PageNumberPagination):
    page_size = 25
    page_size_query_param = "page-size"
    max_page_size = 100000
        return HttpResponseBadRequest(str(form.errors))


class CorrespondentViewSet(ModelViewSet):
@@ -1 +1 @@
from .checks import paths_check
from .checks import paths_check, binaries_check
@@ -1,10 +1,15 @@
import os
import shutil

from django.conf import settings
from django.core.checks import Error, register, Warning


@register()
def paths_check(app_configs, **kwargs):
    """
    Check the various paths for existence, readability and writeability
    """

    check_messages = []

@@ -44,4 +49,55 @@ def paths_check(app_configs, **kwargs):
            writeable_hint.format(directory)
        ))

    directory = os.getenv("PAPERLESS_STATICDIR")
    if directory:
        if not os.path.exists(directory):
            check_messages.append(Error(
                exists_message.format("PAPERLESS_STATICDIR"),
                exists_hint.format(directory)
            ))
        if not check_messages:
            if not os.access(directory, os.W_OK | os.X_OK):
                check_messages.append(Error(
                    writeable_message.format("PAPERLESS_STATICDIR"),
                    writeable_hint.format(directory)
                ))

    return check_messages


@register()
def binaries_check(app_configs, **kwargs):
    """
    Paperless requires the existence of a few binaries, so we do some checks
    for those here.
    """

    error = "Paperless can't find {}. Without it, consumption is impossible."
    hint = "Either it's not in your ${PATH} or it's not installed."

    binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")

    check_messages = []
    for binary in binaries:
        if shutil.which(binary) is None:
            check_messages.append(Warning(error.format(binary), hint))

    return check_messages


@register()
def config_check(app_configs, **kwargs):
    warning = (
        "It looks like you have PAPERLESS_SHARED_SECRET defined. Note that "
        "in the \npast, this variable was used for both API authentication "
        "and as the mail \nkeyword. As the API no longer uses it, this "
        "variable has been renamed to \nPAPERLESS_EMAIL_SECRET, so if you're "
        "using the mail feature, you'd best update \nyour variable name.\n\n"
        "The old variable will stop working in a few months."
    )

    if os.getenv("PAPERLESS_SHARED_SECRET"):
        return [Warning(warning)]

    return []
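These are ordinary Django system checks, so they can be exercised directly without going through `manage.py check`, which is handy in tests. A small sketch under the assumption that Django settings are already configured and that the module above is importable as `paperless.checks` (the import path is not stated in this diff):

```python
# Hypothetical snippet: run the registered checks by hand and print anything
# they complain about. Each entry is a django.core.checks message object.
from paperless.checks import binaries_check, paths_check

for check in (paths_check, binaries_check):
    for message in check(None):
        print("{}: {}".format(message.__class__.__name__, message.msg))
```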
14
src/paperless/middleware.py
Normal file
@@ -0,0 +1,14 @@
from django.utils.deprecation import MiddlewareMixin
from .models import User


class Middleware(MiddlewareMixin):
    """
    This is a dummy authentication middleware class that creates what
    is roughly an Anonymous authenticated user so we can disable login
    and not interfere with existing user ID's. It's only used if
    login is disabled in paperless.conf (default is to require login)
    """

    def process_request(self, request):
        request.user = User()
46
src/paperless/mixins.py
Normal file
@@ -0,0 +1,46 @@
from django.contrib.auth.mixins import AccessMixin
from django.contrib.auth import authenticate, login
import base64


class SessionOrBasicAuthMixin(AccessMixin):
    """
    Session or Basic Authentication mixin for Django.
    It determines if the requester is already logged in or if they have
    provided proper http-authorization and returning the view if all goes
    well, otherwise responding with a 401.

    Base for mixin found here: https://djangosnippets.org/snippets/3073/
    """

    def dispatch(self, request, *args, **kwargs):

        # check if user is authenticated via the session
        if request.user.is_authenticated:

            # Already logged in, just return the view.
            return super(SessionOrBasicAuthMixin, self).dispatch(
                request, *args, **kwargs
            )

        # apparently not authenticated via session, maybe via HTTP Basic?
        if 'HTTP_AUTHORIZATION' in request.META:
            auth = request.META['HTTP_AUTHORIZATION'].split()
            if len(auth) == 2:
                # NOTE: Support for only basic authentication
                if auth[0].lower() == "basic":
                    authString = base64.b64decode(auth[1]).decode('utf-8')
                    uname, passwd = authString.split(':')
                    user = authenticate(username=uname, password=passwd)
                    if user is not None:
                        if user.is_active:
                            login(request, user)
                            request.user = user
                            return super(
                                SessionOrBasicAuthMixin, self
                            ).dispatch(
                                request, *args, **kwargs
                            )

        # nope, really not authenticated
        return self.handle_no_permission()
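Because the mixin accepts either an existing session or an `Authorization: Basic base64(user:pass)` header, API clients can authenticate per request against views such as the push endpoint (`/push` in the urls.py change further down). A hedged client-side example using the `requests` library; the host, credentials and the upload field name are placeholders, not taken from this diff:

```python
import requests

# Placeholder host and credentials; adjust for a real installation.
url = "http://localhost:8000/push"

with open("document.pdf", "rb") as f:
    # requests builds the HTTP Basic Authorization header from the
    # (username, password) tuple, which is exactly what the mixin decodes.
    response = requests.post(
        url,
        auth=("paperless-user", "secret"),
        files={"document": f},  # field name assumed; see UploadForm for the real one
    )

print(response.status_code)  # 202 when the upload form validates, per PushView.form_valid
```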
26
src/paperless/models.py
Normal file
@@ -0,0 +1,26 @@
class User:
    """
    This is a dummy django User used with our middleware to disable
    login authentication if that is configured in paperless.conf
    """
    is_superuser = True
    is_active = True
    is_staff = True
    is_authenticated = True

    # Must be -1 to avoid colliding with real user ID's (which start at 1)
    id = -1

    @property
    def pk(self):
        return self.id


"""
NOTE: These are here as a hack instead of being in the User definition
above due to the way pycodestyle handles lambdas.
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
"""

User.has_module_perms = lambda *_: True
User.has_perm = lambda *_: True
@@ -4,62 +4,83 @@ Django settings for paperless project.
Generated by 'django-admin startproject' using Django 1.9.

For more information on this file, see
https://docs.djangoproject.com/en/1.9/topics/settings/
https://docs.djangoproject.com/en/1.10/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.9/ref/settings/
https://docs.djangoproject.com/en/1.10/ref/settings/
"""

import os

from dotenv import load_dotenv


# Tap paperless.conf if it's available
if os.path.exists("/etc/paperless.conf"):
    load_dotenv("/etc/paperless.conf")
elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")


# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/

# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
# public, you should change the key to something unique and verbose.
SECRET_KEY = os.getenv(
    "PAPERLESS_SECRET_KEY",
    "e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee"
)

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

LOGIN_URL = '/admin/login'
LOGIN_URL = "admin:login"

ALLOWED_HOSTS = ["*"]

_allowed_hosts = os.getenv("PAPERLESS_ALLOWED_HOSTS")
if allowed_hosts:
if _allowed_hosts:
    ALLOWED_HOSTS = _allowed_hosts.split(",")

# Tap paperless.conf if it's available
if os.path.exists("/etc/paperless.conf"):
    load_dotenv("/etc/paperless.conf")


FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")

# Application definition

INSTALLED_APPS = [

    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",

    "django_extensions",

    "documents.apps.DocumentsConfig",
    "reminders.apps.RemindersConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",

    "flat_responsive",
    "django.contrib.admin",

    "rest_framework",
    "crispy_forms",
    "django_filters"

]

if os.getenv("PAPERLESS_INSTALLED_APPS"):
    INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")


MIDDLEWARE_CLASSES = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
@@ -71,6 +92,12 @@ MIDDLEWARE_CLASSES = [
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

# If auth is disabled, we just use our "bypass" authentication middleware
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
    _index = MIDDLEWARE_CLASSES.index("django.contrib.auth.middleware.AuthenticationMiddleware")
    MIDDLEWARE_CLASSES[_index] = "paperless.middleware.Middleware"
    MIDDLEWARE_CLASSES.remove("django.contrib.auth.middleware.SessionAuthenticationMiddleware")

ROOT_URLCONF = 'paperless.urls'

TEMPLATES = [
@@ -93,7 +120,7 @@ WSGI_APPLICATION = 'paperless.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.9/ref/settings/#databases
# https://docs.djangoproject.com/en/1.10/ref/settings/#databases

DATABASES = {
    "default": {
@@ -118,7 +145,7 @@ if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):


# Password validation
# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
# https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
@@ -137,11 +164,11 @@ AUTH_PASSWORD_VALIDATORS = [


# Internationalization
# https://docs.djangoproject.com/en/1.9/topics/i18n/
# https://docs.djangoproject.com/en/1.10/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'
TIME_ZONE = os.getenv("PAPERLESS_TIME_ZONE", "UTC")

USE_I18N = True

@@ -151,14 +178,15 @@ USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.9/howto/static-files/
# https://docs.djangoproject.com/en/1.10/howto/static-files/

STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")
STATIC_ROOT = os.getenv(
    "PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static"))
MEDIA_ROOT = os.getenv(
    "PAPERLESS_MEDIADIR", os.path.join(BASE_DIR, "..", "media"))

STATIC_URL = '/static/'
MEDIA_URL = "/media/"
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")


# Paperless-specific stuff
@@ -187,11 +215,14 @@ LOGGING = {

# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = "eng"
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

# The amount of threads to use for OCR
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")

# OCR all documents?
OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true"))

# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
@@ -215,23 +246,13 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
# This is where Paperless will look for PDFs to index
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")

# (This setting is ignored on Linux where inotify is used instead of a
# polling loop.)
# The number of seconds that Paperless will wait between checking
# CONSUMPTION_DIR. If you tend to write documents to this directory very
# slowly, you may want to use a higher value than the default.
CONSUMER_LOOP_TIME = int(os.getenv("PAPERLESS_CONSUMER_LOOP_TIME", 10))

# If you want to use IMAP mail consumption, populate this with useful values.
# If you leave HOST set to None, we assume you're not going to use this
# feature.
MAIL_CONSUMPTION = {
    "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"),
    "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"),
    "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"),
    "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"),
    "USE_SSL": os.getenv("PAPERLESS_CONSUME_MAIL_USE_SSL", "y").lower() == "y",  # If True, use SSL/TLS to connect
    "INBOX": "INBOX"  # The name of the inbox on the server
}

# This is used to encrypt the original documents and decrypt them later when
# you want to download them. Set it and change the permissions on this file to
# 0600, or set it to `None` and you'll be prompted for the passphrase at
@@ -241,11 +262,17 @@ MAIL_CONSUMPTION = {
# files.
PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")

# If you intend to use the "API" to push files into the consumer, you'll need
# to provide a shared secret here. Leaving this as the default will disable
# the API.
SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")

# Trigger a script after every successful document consumption?
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")

# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.
PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))

FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")

# Specify the default date order (for autodetected dates)
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
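Several of the new settings (`OCR_ALWAYS`, `PAPERLESS_DISABLE_LOGIN`) turn environment strings into booleans with the same `lower() in ("yes", "y", "1", "t", "true")` idiom. A tiny standalone illustration of what that expression accepts; the helper name is mine, not part of the settings file:

```python
import os


def env_flag(name, default="NO"):
    # Mirrors the idiom used in the settings above: any of these spellings count as "on".
    return os.getenv(name, default).lower() in ("yes", "y", "1", "t", "true")


os.environ["PAPERLESS_OCR_ALWAYS"] = "YES"
print(env_flag("PAPERLESS_OCR_ALWAYS"))      # True
print(env_flag("PAPERLESS_DISABLE_LOGIN"))   # False (unset, so the "NO" default applies)
```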
@@ -1,35 +1,27 @@
"""paperless URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/1.9/topics/http/urls/
Examples:
Function views
    1. Add an import: from my_app import views
    2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
Class-based views
    1. Add an import: from other_app.views import Home
    2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
Including another URLconf
    1. Add an import: from blog import urls as blog_urls
    2. Import the include() function: from django.conf.urls import url, include
    3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
"""
from django.conf import settings
from django.conf.urls import url, static, include
from django.conf.urls import include, static, url
from django.contrib import admin

from django.urls import reverse_lazy
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter

from documents.views import (
    IndexView, FetchView, PushView,
    CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet
    CorrespondentViewSet,
    DocumentViewSet,
    FetchView,
    LogViewSet,
    PushView,
    TagViewSet
)
from reminders.views import ReminderViewSet

router = DefaultRouter()
router.register(r'correspondents', CorrespondentViewSet)
router.register(r'tags', TagViewSet)
router.register(r'documents', DocumentViewSet)
router.register(r'logs', LogViewSet)
router.register(r"correspondents", CorrespondentViewSet)
router.register(r"documents", DocumentViewSet)
router.register(r"logs", LogViewSet)
router.register(r"reminders", ReminderViewSet)
router.register(r"tags", TagViewSet)

urlpatterns = [

@@ -40,9 +32,6 @@ urlpatterns = [
    ),
    url(r"^api/", include(router.urls, namespace="drf")),

    # Normal pages (coming soon)
    # url(r"^$", IndexView.as_view(), name="index"),

    # File downloads
    url(
        r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
@@ -50,11 +39,21 @@ urlpatterns = [
        name="fetch"
    ),

    # File uploads
    url(r"^push$", csrf_exempt(PushView.as_view()), name="push"),

    # The Django admin
    url(r"admin/", admin.site.urls),
    url(r"", admin.site.urls),  # This is going away

    # Redirect / to /admin
    url(r"^$", RedirectView.as_view(
        permanent=True, url=reverse_lazy("admin:index"))),

] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

if settings.SHARED_SECRET:
    urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push"))

# Text in each page's <h1> (and above login form).
admin.site.site_header = 'Paperless'
# Text at the end of each page's <title>.
admin.site.site_title = 'Paperless'
# Text at the top of the admin index page.
admin.site.index_title = 'Paperless administration'
@@ -1 +1 @@
__version__ = (0, 3, 2)
__version__ = (1, 3, 0)
7
src/paperless/views.py
Normal file
@@ -0,0 +1,7 @@
from rest_framework.pagination import PageNumberPagination


class StandardPagination(PageNumberPagination):
    page_size = 25
    page_size_query_param = "page-size"
    max_page_size = 100000
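`StandardPagination` (moved here from documents/views.py) lets API clients pick the page length through the `page-size` query parameter, up to `max_page_size`. A hedged request example against the `/api/documents/` route registered in urls.py; the host and credentials are placeholders:

```python
import requests

# "page-size" comes from StandardPagination.page_size_query_param above;
# host and credentials are placeholders for a real installation.
response = requests.get(
    "http://localhost:8000/api/documents/",
    params={"page": 1, "page-size": 50},
    auth=("paperless-user", "secret"),
)

data = response.json()
print(data["count"], len(data["results"]))  # total hits vs. items on this page
```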
@@ -4,7 +4,7 @@ WSGI config for paperless project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/
"""

import os
0
src/paperless_tesseract/__init__.py
Normal file
16
src/paperless_tesseract/apps.py
Normal file
@@ -0,0 +1,16 @@
from django.apps import AppConfig


class PaperlessTesseractConfig(AppConfig):

    name = "paperless_tesseract"

    def ready(self):

        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)

        AppConfig.ready(self)
301
src/paperless_tesseract/parsers.py
Normal file
@@ -0,0 +1,301 @@
import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool

import dateparser
import langdetect
import pyocr
from django.conf import settings
from PIL import Image
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError

import pdftotext
from documents.parsers import DocumentParser, ParseError

from .languages import ISO639


class OCRError(Exception):
    pass


class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def _is_ocred(self):

        # Extract text from PDF using pdftotext
        text = get_text_from_pdf(self.document_path)

        # We assume, that a PDF with at least 50 characters contains text
        # (so no OCR required)
        return len(text) > 50

    def get_text(self):

        if self._text is not None:
            return self._text

        if not self.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text

        images = self._get_greyscale()

        try:
            self._text = self._get_ocr(images)
            return self._text
        except OCRError as e:
            raise ParseError(e)

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".unpaper.pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        return sorted(filter(lambda __: os.path.isfile(__), pnms))

    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            self.log("warning", "Language detection error: {}".format(e))

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Since the division gets rounded down by int, this calculation works
        # for every edge-case, i.e. 1
        middle = int(len(imgs) / 2)
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            )

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.
        """
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text

    def get_date(self):
        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError as e:
            return None

        # This regular expression will try to find dates in the document at
        # hand and will match the following formats:
        # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
        # - MONTH ZZZZ, with ZZZZ being 4 digits
        # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
        pattern = re.compile(
            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
            r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
            r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
            r'\b([^\W\d_]{3,9} [0-9]{4})\b')

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(pattern, text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                     " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date


def run_convert(*args):

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    if not subprocess.Popen(args, env=environment).wait() == 0:
        raise ParseError("Convert failed at {}".format(args))


def run_unpaper(args):
    unpaper, pnm = args
    command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
    if not subprocess.Popen(command_args).wait() == 0:
        raise ParseError("Unpaper failed at {}".format(command_args))


def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(RasterisedDocumentParser.SCRATCH, img)) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError, AttributeError):
                pass
        return ocr.image_to_string(f, lang=lang)


def get_text_from_pdf(pdf_file):

    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            return ""

    return "\n".join(pdf)
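`strip_excess_whitespace` collapses runs of spaces, drops indentation that follows a newline, and trims trailing whitespace, which is exactly what the `TestOCR.text_cases` earlier in this diff assert. A quick standalone check using the same regular expressions:

```python
import re


def strip_excess_whitespace(text):
    # Same three passes as the parser above: collapse runs of spaces, strip
    # whitespace that follows a newline, then strip trailing whitespace.
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
    return re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)


print(repr(strip_excess_whitespace("simple  newline\n   testing string")))
# 'simple newline\ntesting string'
print(repr(strip_excess_whitespace("trailing spaces   ")))
# 'trailing spaces'
```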
23
src/paperless_tesseract/signals.py
Normal file
@@ -0,0 +1,23 @@
import re

from .parsers import RasterisedDocumentParser


class ConsumerDeclaration:

    MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test

    @classmethod
    def test(cls, doc):

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
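`handle()` simply hands the consumer the `test` callable, and `test()` gates on the file extension: anything matching `MATCHING_FILES` is claimed by `RasterisedDocumentParser` with weight 0. A quick illustration of which names get picked up, using the same pattern (pure regex, no Django required):

```python
import re

# Same pattern as ConsumerDeclaration.MATCHING_FILES above.
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

for name in ("scan.PDF", "photo.jpeg", "page.tif", "notes.txt"):
    claimed = bool(MATCHING_FILES.match(name.lower()))
    print(name, "->", "handled" if claimed else "ignored")
# scan.PDF, photo.jpeg and page.tif match; notes.txt is left for other parsers.
```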
0
src/paperless_tesseract/tests/__init__.py
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_1.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_1.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_2.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_2.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_3.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_3.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_4.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_4.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_5.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_5.png
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_6.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_6.png
Normal file