mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-01 18:37:42 -05:00
Compare commits
47 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
b7cb708053 | ||
![]() |
7611c2b3d5 | ||
![]() |
5f964830aa | ||
![]() |
7ec4f906af | ||
![]() |
b5f6c06b8b | ||
![]() |
55e81ca4bb | ||
![]() |
0f7bfc547a | ||
![]() |
9525725c28 | ||
![]() |
2a2196fa4d | ||
![]() |
237efbcaa0 | ||
![]() |
351cd06ef7 | ||
![]() |
8b37160953 | ||
![]() |
db64478d9f | ||
![]() |
8bc2dfe4c6 | ||
![]() |
3a427c9130 | ||
![]() |
01da48693e | ||
![]() |
499abd38f6 | ||
![]() |
ca2a556259 | ||
![]() |
7699a881d0 | ||
![]() |
425f87618a | ||
![]() |
701ef2f919 | ||
![]() |
ce5c5f0837 | ||
![]() |
75884285cf | ||
![]() |
d7d9c1edc0 | ||
![]() |
92db3fec2e | ||
![]() |
69ea039e31 | ||
![]() |
f640bef5fc | ||
![]() |
88f9fb6fc8 | ||
![]() |
855e9f6c83 | ||
![]() |
e63e9e389e | ||
![]() |
c646cd4977 | ||
![]() |
6b53c0dc27 | ||
![]() |
afb4e317f0 | ||
![]() |
a698c69b4d | ||
![]() |
4121876116 | ||
![]() |
060d3011f7 | ||
![]() |
1f0fea2937 | ||
![]() |
92e178cc59 | ||
![]() |
dc222cefd4 | ||
![]() |
0defa9d0ba | ||
![]() |
e3edb02090 | ||
![]() |
a32625ca04 | ||
![]() |
3c08fa9b33 | ||
![]() |
e6526d3fd4 | ||
![]() |
bee0867a2a | ||
![]() |
1711030cb5 | ||
![]() |
7b586e6857 |
@@ -8,7 +8,9 @@ matrix:
|
||||
env: TOXENV=py34
|
||||
- python: 3.5
|
||||
env: TOXENV=py35
|
||||
- python: 3.5
|
||||
- python: 3.6
|
||||
env: TOXENV=py36
|
||||
- python: 3.6
|
||||
env: TOXENV=pep8
|
||||
|
||||
install:
|
||||
|
@@ -1,6 +1,48 @@
|
||||
Changelog
|
||||
#########
|
||||
|
||||
* 0.3.6
|
||||
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
|
||||
correspondent or the tags for a document.
|
||||
* The ``content`` field is now optional, to allow for the edge case of a
|
||||
purely graphical document.
|
||||
* You can no longer add documents via the admin. This never worked in the
|
||||
first place, so all I've done here is remove the link to the broken form.
|
||||
* The consumer code has been heavily refactored to support a pluggable
|
||||
interface. Install a paperless consumer via pip and tell paperless about
|
||||
it with an environment variable, and you're good to go. Proper
|
||||
documentation is on its way.
|
||||
|
||||
* 0.3.5
|
||||
* A serious facelift for the documents listing page wherein we drop the
|
||||
tabular layout in favour of a tiled interface.
|
||||
* Users can now configure the number of items per page.
|
||||
* Fix for `#171`_: Allow users to specify their own ``SECRET_KEY`` value.
|
||||
* Moved the dotenv loading to the top of settings.py
|
||||
* Fix for `#112`_: Added checks for binaries required for document
|
||||
consumption.
|
||||
|
||||
* 0.3.4
|
||||
* Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
|
||||
Note that you *can* use Django Suit with Paperless, but only in a
|
||||
non-profit situation as their free license prohibits for-profit use. As a
|
||||
result, I can't bundle Suit with Paperless without conflicting with the
|
||||
GPL. Further development will be done against the stock Django admin.
|
||||
* I shrunk the thumbnails a little 'cause they were too big for me, even on
|
||||
my high-DPI monitor.
|
||||
* BasicAuth support for document and thumbnail downloads, as well as the Push
|
||||
API thanks to @thomasbrueggemann. See `#179`_.
|
||||
|
||||
* 0.3.3
|
||||
* Thumbnails in the UI and a Django-suit-based face-lift courtesy of @ekw!
|
||||
* Timezone, items per page, and default language are now all configurable,
|
||||
also thanks to @ekw.
|
||||
|
||||
* 0.3.2
|
||||
* Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
|
||||
user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
|
||||
arise.
|
||||
|
||||
* 0.3.1
|
||||
* Added a default value for ``CONVERT_BINARY``
|
||||
|
||||
@@ -21,7 +63,8 @@ Changelog
|
||||
``paperless.conf``.
|
||||
* `#148`_: The database location (sqlite) is now a variable you can set in
|
||||
``paperless.conf``.
|
||||
* `#146`_: Fixed a bug that allowed unauthorised access to the `/fetch` URL.
|
||||
* `#146`_: Fixed a bug that allowed unauthorised access to the ``/fetch``
|
||||
URL.
|
||||
* `#131`_: Document files are now automatically removed from disk when
|
||||
they're deleted in Paperless.
|
||||
* `#121`_: Fixed a bug where Paperless wasn't setting document creation time
|
||||
@@ -147,8 +190,13 @@ Changelog
|
||||
.. _#89: https://github.com/danielquinn/paperless/issues/89
|
||||
.. _#94: https://github.com/danielquinn/paperless/issues/94
|
||||
.. _#98: https://github.com/danielquinn/paperless/issues/98
|
||||
.. _#112: https://github.com/danielquinn/paperless/issues/112
|
||||
.. _#121: https://github.com/danielquinn/paperless/issues/121
|
||||
.. _#131: https://github.com/danielquinn/paperless/issues/131
|
||||
.. _#146: https://github.com/danielquinn/paperless/issues/146
|
||||
.. _#148: https://github.com/danielquinn/paperless/pull/148
|
||||
.. _#150: https://github.com/danielquinn/paperless/pull/150
|
||||
.. _#171: https://github.com/danielquinn/paperless/issues/171
|
||||
.. _#172: https://github.com/danielquinn/paperless/issues/172
|
||||
.. _#179: https://github.com/danielquinn/paperless/pull/179
|
||||
.. _#200: https://github.com/danielquinn/paperless/issues/200
|
||||
|
@@ -65,8 +65,8 @@ PAPERLESS_SHARED_SECRET=""
|
||||
# cases it has proven useful to configure a lesser value.
|
||||
# This setting has a high impact on the physical size of tmp page files,
|
||||
# the speed of document conversion, and can affect the accuracy of OCR
|
||||
# results. Individual results can vary and this setting should be tested
|
||||
# thoroughly against the documents you are importing to see if it has any
|
||||
# results. Individual results can vary and this setting should be tested
|
||||
# thoroughly against the documents you are importing to see if it has any
|
||||
# impacts either negative or positive. Testing on limited document sets has
|
||||
# shown a setting of 200 can cut the size of tmp files by 1/3, and speed up
|
||||
# conversion by up to 4x with little impact to OCR accuracy.
|
||||
@@ -81,14 +81,47 @@ PAPERLESS_SHARED_SECRET=""
|
||||
# the web for "MAGICK_TMPDIR".
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
|
||||
# You can specify where you want the SQLite database to be stored instead of
|
||||
# You can specify where you want the SQLite database to be stored instead of
|
||||
# the default location
|
||||
#PAPERLESS_DBDIR=/path/to/database/file
|
||||
|
||||
# Override the default MEDIA_ROOT here. This is where all files are stored.
|
||||
#PAPERLESS_MEDIADIR=/path/to/media
|
||||
|
||||
# Override the default STATIC_ROOT here. This is where all static files created
|
||||
# using the "collectstatic" management command are stored.
|
||||
#PAPERLESS_STATICDIR=""
|
||||
|
||||
# The number of seconds that Paperless will wait between checking
|
||||
# PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory
|
||||
# very slowly, you may want to use a higher value than the default (10).
|
||||
# PAPERLESS_CONSUMER_LOOP_TIME=10
|
||||
|
||||
# If you're planning on putting Paperless on the open internet, then you
|
||||
# really should set this value to the domain name you're using. Failing to do
|
||||
# so leaves you open to HTTP host header attacks:
|
||||
# https://docs.djangoproject.com/en/1.10/topics/security/#host-headers-virtual-hosting
|
||||
#
|
||||
# Just remember that this is a comma-separated list, so "example.com" is fine,
|
||||
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
|
||||
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
|
||||
|
||||
# Override the default UTC time zone here
|
||||
#PAPERLESS_TIME_ZONE=UTC
|
||||
|
||||
# Customize number of list items to show per page
|
||||
#PAPERLESS_LIST_PER_PAGE=50
|
||||
|
||||
# Customize the default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
|
||||
# The number of items on each page in the web UI. This value must be a
|
||||
# positive integer, but if you don't define one in paperless.conf, a default of
|
||||
# 100 will be used.
|
||||
#PAPERLESS_LIST_PER_PAGE=100
|
||||
|
||||
# The secret key has a default that should be fine so long as you're hosting
|
||||
# Paperless on a closed network. However, if you're putting this anywhere
|
||||
# public, you should change the key to something unique and verbose.
|
||||
#PAPERLESS_SECRET_KEY="change-me"
|
||||
|
@@ -1,16 +1,17 @@
|
||||
Django==1.10.4
|
||||
Django==1.10.5
|
||||
Pillow>=3.1.1
|
||||
django-crispy-forms>=1.6.0
|
||||
django-extensions>=1.6.1
|
||||
django-crispy-forms>=1.6.1
|
||||
django-extensions>=1.7.6
|
||||
django-filter>=1.0
|
||||
djangorestframework>=3.4.4
|
||||
django-flat-responsive>=1.2.0
|
||||
djangorestframework>=3.5.3
|
||||
filemagic>=1.6
|
||||
langdetect>=1.0.5
|
||||
pyocr>=0.3.1
|
||||
python-dateutil>=2.4.2
|
||||
python-dotenv>=0.3.0
|
||||
python-gnupg>=0.3.8
|
||||
pytz>=2015.7
|
||||
langdetect>=1.0.7
|
||||
pyocr>=0.4.6
|
||||
python-dateutil>=2.6.0
|
||||
python-dotenv>=0.6.2
|
||||
python-gnupg>=0.3.9
|
||||
pytz>=2016.10
|
||||
gunicorn==19.6.0
|
||||
|
||||
# For the tests
|
||||
|
@@ -1,3 +1,4 @@
|
||||
from django.conf import settings
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.models import User, Group
|
||||
from django.core.urlresolvers import reverse
|
||||
@@ -31,21 +32,25 @@ class MonthListFilter(admin.SimpleListFilter):
|
||||
return queryset.filter(created__year=year, created__month=month)
|
||||
|
||||
|
||||
class CorrespondentAdmin(admin.ModelAdmin):
|
||||
class CommonAdmin(admin.ModelAdmin):
|
||||
list_per_page = settings.PAPERLESS_LIST_PER_PAGE
|
||||
|
||||
|
||||
class CorrespondentAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "match", "matching_algorithm")
|
||||
list_filter = ("matching_algorithm",)
|
||||
list_editable = ("match", "matching_algorithm")
|
||||
|
||||
|
||||
class TagAdmin(admin.ModelAdmin):
|
||||
class TagAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("name", "colour", "match", "matching_algorithm")
|
||||
list_filter = ("colour", "matching_algorithm")
|
||||
list_editable = ("colour", "match", "matching_algorithm")
|
||||
|
||||
|
||||
class DocumentAdmin(admin.ModelAdmin):
|
||||
class DocumentAdmin(CommonAdmin):
|
||||
|
||||
class Media:
|
||||
css = {
|
||||
@@ -53,12 +58,27 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
}
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content")
|
||||
list_display = ("created", "correspondent", "title", "tags_", "document")
|
||||
list_display = ("title", "created", "thumbnail", "correspondent", "tags_")
|
||||
list_filter = ("tags", "correspondent", MonthListFilter)
|
||||
list_per_page = 25
|
||||
ordering = ["-created", "correspondent"]
|
||||
|
||||
def has_add_permission(self, request):
|
||||
return False
|
||||
|
||||
def created_(self, obj):
|
||||
return obj.created.date().strftime("%Y-%m-%d")
|
||||
created_.short_description = "Created"
|
||||
|
||||
def thumbnail(self, obj):
|
||||
png_img = self._html_tag(
|
||||
"img",
|
||||
src="/fetch/thumb/{}".format(obj.id),
|
||||
width=180,
|
||||
alt="Thumbnail of {}".format(obj.file_name),
|
||||
title=obj.file_name
|
||||
)
|
||||
return self._html_tag("a", png_img, href=obj.download_url)
|
||||
thumbnail.allow_tags = True
|
||||
|
||||
def tags_(self, obj):
|
||||
r = ""
|
||||
@@ -108,7 +128,7 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
return "<{} {}/>".format(kind, " ".join(attributes))
|
||||
|
||||
|
||||
class LogAdmin(admin.ModelAdmin):
|
||||
class LogAdmin(CommonAdmin):
|
||||
|
||||
list_display = ("created", "message", "level",)
|
||||
list_filter = ("level", "created",)
|
||||
|
@@ -1,35 +1,21 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import shutil
|
||||
import hashlib
|
||||
import logging
|
||||
import datetime
|
||||
import tempfile
|
||||
import itertools
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import pyocr
|
||||
import langdetect
|
||||
from PIL import Image
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from paperless.db import GnuPG
|
||||
from pyocr.tesseract import TesseractError
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from .models import Tag, Document, FileInfo
|
||||
from .models import Document, FileInfo, Tag
|
||||
from .parsers import ParseError
|
||||
from .signals import (
|
||||
document_consumption_started,
|
||||
document_consumption_finished
|
||||
document_consumer_declaration,
|
||||
document_consumption_finished,
|
||||
document_consumption_started
|
||||
)
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ConsumerError(Exception):
|
||||
@@ -47,13 +33,7 @@ class Consumer(object):
|
||||
"""
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
CONSUME = settings.CONSUMPTION_DIR
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -78,6 +58,16 @@ class Consumer(object):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(self.CONSUME))
|
||||
|
||||
self.parsers = []
|
||||
for response in document_consumer_declaration.send(self):
|
||||
self.parsers.append(response[1])
|
||||
|
||||
if not self.parsers:
|
||||
raise ConsumerError(
|
||||
"No parsers could be found, not even the default. "
|
||||
"This is a problem."
|
||||
)
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
@@ -109,6 +99,13 @@ class Consumer(object):
|
||||
self._ignore.append(doc)
|
||||
continue
|
||||
|
||||
parser_class = self._get_parser_class(doc)
|
||||
if not parser_class:
|
||||
self.log(
|
||||
"info", "No parsers could be found for {}".format(doc))
|
||||
self._ignore.append(doc)
|
||||
continue
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
@@ -119,25 +116,26 @@ class Consumer(object):
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
imgs = self._get_greyscale(tempdir, doc)
|
||||
thumbnail = self._get_thumbnail(tempdir, doc)
|
||||
parsed_document = parser_class(doc)
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
|
||||
try:
|
||||
|
||||
document = self._store(self._get_ocr(imgs), doc, thumbnail)
|
||||
|
||||
except OCRError as e:
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
doc,
|
||||
thumbnail
|
||||
)
|
||||
except ParseError as e:
|
||||
|
||||
self._ignore.append(doc)
|
||||
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
||||
self._cleanup_tempdir(tempdir)
|
||||
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
parsed_document.cleanup()
|
||||
|
||||
continue
|
||||
|
||||
else:
|
||||
|
||||
self._cleanup_tempdir(tempdir)
|
||||
parsed_document.cleanup()
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self.log(
|
||||
@@ -151,142 +149,20 @@ class Consumer(object):
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
def _get_greyscale(self, tempdir, doc):
|
||||
def _get_parser_class(self, doc):
|
||||
"""
|
||||
Greyscale images are easier for Tesseract to OCR
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
||||
options = []
|
||||
for parser in self.parsers:
|
||||
result = parser(doc)
|
||||
if result:
|
||||
options.append(result)
|
||||
|
||||
# Convert PDF to multiple PNMs
|
||||
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-density", str(self.DENSITY),
|
||||
"-depth", "8",
|
||||
"-type", "grayscale",
|
||||
doc, pnm,
|
||||
)
|
||||
|
||||
# Get a list of converted images
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
# Run unpaper in parallel on converted images
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
||||
|
||||
# Return list of converted images, processed with unpaper
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".unpaper.pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||
|
||||
def _get_thumbnail(self, tempdir, doc):
|
||||
"""
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
|
||||
self.log("info", "Generating the thumbnail")
|
||||
|
||||
run_convert(
|
||||
self.CONVERT,
|
||||
"-scale", "500x5000",
|
||||
"-alpha", "remove",
|
||||
doc, os.path.join(tempdir, "convert-%04d.png")
|
||||
)
|
||||
|
||||
return os.path.join(tempdir, "convert-0000.png")
|
||||
|
||||
def _guess_language(self, text):
|
||||
try:
|
||||
guess = langdetect.detect(text)
|
||||
self.log("debug", "Language detected: {}".format(guess))
|
||||
return guess
|
||||
except Exception as e:
|
||||
self.log("warning", "Language detection error: {}".format(e))
|
||||
|
||||
def _get_ocr(self, imgs):
|
||||
"""
|
||||
Attempts to do the best job possible OCR'ing the document based on
|
||||
simple language detection trial & error.
|
||||
"""
|
||||
|
||||
if not imgs:
|
||||
raise OCRError("No images found")
|
||||
|
||||
self.log("info", "OCRing the document")
|
||||
|
||||
# Since the division gets rounded down by int, this calculation works
|
||||
# for every edge-case, i.e. 1
|
||||
middle = int(len(imgs) / 2)
|
||||
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||
|
||||
guessed_language = self._guess_language(raw_text)
|
||||
|
||||
if not guessed_language or guessed_language not in ISO639:
|
||||
self.log("warning", "Language detection failed!")
|
||||
if settings.FORGIVING_OCR:
|
||||
self.log(
|
||||
"warning",
|
||||
"As FORGIVING_OCR is enabled, we're going to make the "
|
||||
"best with what we have."
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError("Language detection failed")
|
||||
|
||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
|
||||
try:
|
||||
return self._ocr(imgs, ISO639[guessed_language])
|
||||
except pyocr.pyocr.tesseract.TesseractError:
|
||||
if settings.FORGIVING_OCR:
|
||||
self.log(
|
||||
"warning",
|
||||
"OCR for {} failed, but we're going to stick with what "
|
||||
"we've got since FORGIVING_OCR is enabled.".format(
|
||||
guessed_language
|
||||
)
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError(
|
||||
"The guessed language is not available in this instance of "
|
||||
"Tesseract."
|
||||
)
|
||||
|
||||
def _assemble_ocr_sections(self, imgs, middle, text):
|
||||
"""
|
||||
Given a `middle` value and the text that middle page represents, we OCR
|
||||
the remainder of the document and return the whole thing.
|
||||
"""
|
||||
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
|
||||
def _ocr(self, imgs, lang):
|
||||
"""
|
||||
Performs a single OCR attempt.
|
||||
"""
|
||||
|
||||
if not imgs:
|
||||
return ""
|
||||
|
||||
self.log("info", "Parsing for {}".format(lang))
|
||||
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return strip_excess_whitespace(r)
|
||||
# Return the parser with the highest weight.
|
||||
return sorted(
|
||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
@@ -332,10 +208,6 @@ class Consumer(object):
|
||||
|
||||
return document
|
||||
|
||||
def _cleanup_tempdir(self, d):
|
||||
self.log("debug", "Deleting directory {}".format(d))
|
||||
shutil.rmtree(d)
|
||||
|
||||
def _cleanup_doc(self, doc):
|
||||
self.log("debug", "Deleting document {}".format(doc))
|
||||
os.unlink(doc)
|
||||
@@ -361,41 +233,3 @@ class Consumer(object):
|
||||
with open(doc, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
return Document.objects.filter(checksum=checksum).exists()
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
|
||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
||||
no_leading_whitespace = re.sub(
|
||||
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
|
||||
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
|
||||
return no_trailing_whitespace
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
img, lang = args
|
||||
ocr = pyocr.get_available_tools()[0]
|
||||
with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
|
||||
if ocr.can_detect_orientation():
|
||||
try:
|
||||
orientation = ocr.detect_orientation(f, lang=lang)
|
||||
f = f.rotate(orientation["angle"], expand=1)
|
||||
except (TesseractError, OtherTesseractError):
|
||||
pass
|
||||
return ocr.image_to_string(f, lang=lang)
|
||||
|
||||
|
||||
def run_unpaper(args):
|
||||
unpaper, pnm = args
|
||||
subprocess.Popen(
|
||||
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
|
||||
environment = os.environ.copy()
|
||||
if settings.CONVERT_MEMORY_LIMIT:
|
||||
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
|
||||
if settings.CONVERT_TMPDIR:
|
||||
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
|
||||
|
||||
subprocess.Popen(args, env=environment).wait()
|
||||
|
@@ -3,6 +3,7 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -19,7 +20,7 @@ class Migration(migrations.Migration):
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('sender', models.CharField(blank=True, db_index=True, max_length=128)),
|
||||
('title', models.CharField(blank=True, db_index=True, max_length=128)),
|
||||
('content', models.TextField(db_index=True)),
|
||||
('content', models.TextField(db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]))),
|
||||
('created', models.DateTimeField(auto_now_add=True)),
|
||||
('modified', models.DateTimeField(auto_now=True)),
|
||||
],
|
||||
|
@@ -47,7 +47,11 @@ class Migration(migrations.Migration):
|
||||
],
|
||||
),
|
||||
migrations.RunPython(move_sender_strings_to_sender_model),
|
||||
migrations.AlterField(
|
||||
migrations.RemoveField(
|
||||
model_name='document',
|
||||
name='sender',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='sender',
|
||||
field=models.ForeignKey(blank=True, on_delete=django.db.models.deletion.CASCADE, to='documents.Sender'),
|
||||
|
@@ -1,3 +1,8 @@
|
||||
from django.contrib.auth.mixins import AccessMixin
|
||||
from django.contrib.auth import authenticate, login
|
||||
import base64
|
||||
|
||||
|
||||
class Renderable(object):
|
||||
"""
|
||||
A handy mixin to make it easier/cleaner to print output based on a
|
||||
@@ -7,3 +12,46 @@ class Renderable(object):
|
||||
def _render(self, text, verbosity):
|
||||
if self.verbosity >= verbosity:
|
||||
print(text)
|
||||
|
||||
|
||||
class SessionOrBasicAuthMixin(AccessMixin):
    """
    Session or Basic Authentication mixin for Django.

    It lets the request through to the view when the requester is already
    logged in via the session, or when they supply valid HTTP Basic
    credentials for an active user.  In every other case the response is
    whatever ``handle_no_permission()`` produces.

    Base for mixin found here: https://djangosnippets.org/snippets/3073/
    """

    def dispatch(self, request, *args, **kwargs):
        """
        Dispatch to the view if the request is authenticated.

        The session is checked first.  Failing that, the ``Authorization``
        header is tried as HTTP Basic credentials; on success the user is
        logged in and attached to the request before dispatching.
        """

        # Already logged in via the session, just return the view.
        if request.user.is_authenticated:
            return super(SessionOrBasicAuthMixin, self).dispatch(
                request, *args, **kwargs
            )

        # Apparently not authenticated via session, maybe via HTTP Basic?
        user = self._authenticate_basic(request)
        if user is not None and user.is_active:
            login(request, user)
            request.user = user
            return super(SessionOrBasicAuthMixin, self).dispatch(
                request, *args, **kwargs
            )

        # Nope, really not authenticated.
        return self.handle_no_permission()

    @staticmethod
    def _authenticate_basic(request):
        """
        Return the user matching the request's HTTP Basic credentials.

        Returns ``None`` when the header is absent, uses a scheme other than
        Basic, cannot be decoded, or carries credentials that
        ``authenticate()`` rejects.
        """

        parts = request.META.get("HTTP_AUTHORIZATION", "").split()

        # NOTE: Support for only basic authentication
        if len(parts) != 2 or parts[0].lower() != "basic":
            return None

        try:
            credentials = base64.b64decode(parts[1]).decode("utf-8")
        except ValueError:
            # Covers both malformed base64 (binascii.Error) and bytes that
            # are not valid UTF-8 (UnicodeDecodeError).  A client sending
            # garbage is simply unauthenticated, not a server error.
            return None

        # Split on the first colon only, so that passwords which themselves
        # contain a colon are kept intact.
        username, separator, password = credentials.partition(":")
        if not separator:
            return None

        return authenticate(username=username, password=password)
|
||||
|
@@ -158,13 +158,22 @@ class Document(models.Model):
|
||||
|
||||
correspondent = models.ForeignKey(
|
||||
Correspondent, blank=True, null=True, related_name="documents")
|
||||
|
||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||
content = models.TextField(db_index=True)
|
||||
|
||||
content = models.TextField(
|
||||
db_index=True,
|
||||
blank=True,
|
||||
help_text="The raw, text-only data of the document. This field is "
|
||||
"primarily used for searching."
|
||||
)
|
||||
|
||||
file_type = models.CharField(
|
||||
max_length=4,
|
||||
editable=False,
|
||||
choices=tuple([(t, t.upper()) for t in TYPES])
|
||||
)
|
||||
|
||||
tags = models.ManyToManyField(
|
||||
Tag, related_name="documents", blank=True)
|
||||
|
||||
|
45
src/documents/parsers.py
Normal file
45
src/documents/parsers.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class ParseError(Exception):
    """
    Error type for failures while parsing a document.  The consumer catches
    it, logs a parse failure and skips the file.
    """
|
||||
|
||||
|
||||
class DocumentParser(object):
    """
    Base class for document parsers: subclass it and implement
    `get_thumbnail` and `get_text`.  See `paperless_tesseract.parsers` for
    an example.
    """

    # Parent directory for the per-document temporary directories.
    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        """
        Remember the document path and create a private temporary directory
        (prefixed "paperless") underneath SCRATCH.
        """
        self.document_path = path
        scratch_space = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        self.tempdir = scratch_space
        self.logger = logging.getLogger(__name__)
        self.logging_group = None

    def get_thumbnail(self):
        """
        Return the path to a file usable as this document's thumbnail.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Return the document's text, and nothing but the text.
        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """
        Send `message` to the logger method named by `level`, tagging the
        record with the current logging group.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})

    def cleanup(self):
        """
        Log and then remove this parser's temporary directory tree.
        """
        notice = "Deleting directory {}".format(self.tempdir)
        self.log("debug", notice)
        shutil.rmtree(self.tempdir)
|
@@ -18,12 +18,21 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
|
||||
"id", "slug", "name", "colour", "match", "matching_algorithm")
|
||||
|
||||
|
||||
class CorrespondentField(serializers.HyperlinkedRelatedField):
    """
    Hyperlinked relation whose valid targets are all correspondents.
    """

    def get_queryset(self):
        """
        Return the queryset of every Correspondent.
        """
        correspondents = Correspondent.objects.all()
        return correspondents
|
||||
|
||||
|
||||
class TagsField(serializers.HyperlinkedRelatedField):
    """
    Hyperlinked relation whose valid targets are all tags.
    """

    def get_queryset(self):
        """
        Return the queryset of every Tag.
        """
        tags = Tag.objects.all()
        return tags
|
||||
|
||||
|
||||
class DocumentSerializer(serializers.ModelSerializer):
|
||||
|
||||
correspondent = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:correspondent-detail", allow_null=True)
|
||||
tags = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:tag-detail", many=True)
|
||||
correspondent = CorrespondentField(
|
||||
view_name="drf:correspondent-detail", allow_null=True)
|
||||
tags = TagsField(view_name="drf:tag-detail", many=True)
|
||||
|
||||
class Meta(object):
|
||||
model = Document
|
||||
|
@@ -2,3 +2,4 @@ from django.dispatch import Signal
|
||||
|
||||
document_consumption_started = Signal(providing_args=["filename"])
|
||||
document_consumption_finished = Signal(providing_args=["document"])
|
||||
document_consumer_declaration = Signal(providing_args=[])
|
||||
|
@@ -1,6 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
from subprocess import Popen
|
||||
|
||||
from django.conf import settings
|
||||
|
6
src/documents/templates/admin/change_list_results.html
Normal file
6
src/documents/templates/admin/change_list_results.html
Normal file
@@ -0,0 +1,6 @@
|
||||
{% load hacks %}
|
||||
|
||||
{# See documents.templatetags.hacks.change_list_results for an explanation #}
|
||||
|
||||
{% change_list_results %}
|
||||
|
@@ -0,0 +1,167 @@
|
||||
{% load i18n %}
|
||||
|
||||
<style>
|
||||
.grid *, .grid *:after, .grid *:before {
|
||||
-webkit-box-sizing: border-box;
|
||||
-moz-box-sizing: border-box;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.box {
|
||||
width: 12.5%;
|
||||
padding: 1em;
|
||||
float: left;
|
||||
opacity: 0.7;
|
||||
transition: all 0.5s;
|
||||
}
|
||||
.box:hover {
|
||||
opacity: 1;
|
||||
transition: all 0.5s;
|
||||
}
|
||||
.box:last-of-type {
|
||||
padding-right: 0;
|
||||
}
|
||||
.result {
|
||||
border: 1px solid #cccccc;
|
||||
border-radius: 2%;
|
||||
overflow: hidden;
|
||||
height: 300px;
|
||||
}
|
||||
.result .header {
|
||||
padding: 5px;
|
||||
background-color: #79AEC8;
|
||||
height: 6em;
|
||||
}
|
||||
.result .header .checkbox {
|
||||
margin-right: 5px;
|
||||
}
|
||||
.result .header .checkbox{
|
||||
width: 5%;
|
||||
float: left;
|
||||
}
|
||||
.result .header .info {
|
||||
width: 90%;
|
||||
float: left;
|
||||
}
|
||||
.result .header a,
|
||||
.result a.tag {
|
||||
color: #ffffff;
|
||||
}
|
||||
.result .date {
|
||||
padding: 5px;
|
||||
}
|
||||
.result .tags {
|
||||
float: left;
|
||||
}
|
||||
.result .tags a.tag {
|
||||
padding: 2px 5px;
|
||||
border-radius: 2px;
|
||||
display: inline-block;
|
||||
margin: 2px;
|
||||
}
|
||||
.result .date {
|
||||
float: right;
|
||||
color: #cccccc;
|
||||
}
|
||||
.result .image img {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.grid {
|
||||
margin-right: 260px;
|
||||
}
|
||||
.grid:after {
|
||||
content: "";
|
||||
display: table;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
@media (max-width: 1600px) {
|
||||
.box {
|
||||
width: 25%
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 991px) {
|
||||
.grid {
|
||||
margin-right: 220px;
|
||||
}
|
||||
.box {
|
||||
width: 50%
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 767px) {
|
||||
.grid {
|
||||
margin-right: 0;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 500px) {
|
||||
.box {
|
||||
width: 100%
|
||||
}
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
|
||||
{# This is just copypasta from the parent change_list_results.html file #}
|
||||
<table id="result_list">
|
||||
<thead>
|
||||
<tr>
|
||||
{% for header in result_headers %}
|
||||
<th scope="col" {{ header.class_attrib }}>
|
||||
{% if header.sortable %}
|
||||
{% if header.sort_priority > 0 %}
|
||||
<div class="sortoptions">
|
||||
<a class="sortremove" href="{{ header.url_remove }}" title="{% trans "Remove from sorting" %}"></a>
|
||||
{% if num_sorted_fields > 1 %}<span class="sortpriority" title="{% blocktrans with priority_number=header.sort_priority %}Sorting priority: {{ priority_number }}{% endblocktrans %}">{{ header.sort_priority }}</span>{% endif %}
|
||||
<a href="{{ header.url_toggle }}" class="toggle {% if header.ascending %}ascending{% else %}descending{% endif %}" title="{% trans "Toggle sorting" %}"></a>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<div class="text">{% if header.sortable %}<a href="{{ header.url_primary }}">{{ header.text|capfirst }}</a>{% else %}<span>{{ header.text|capfirst }}</span>{% endif %}</div>
|
||||
<div class="clear"></div>
|
||||
</th>{% endfor %}
|
||||
</tr>
|
||||
</thead>
|
||||
</table>
|
||||
{# /copypasta #}
|
||||
|
||||
|
||||
<div class="grid">
|
||||
{% for result in results %}
|
||||
{# 0: Checkbox #}
|
||||
{# 1: Title #}
|
||||
{# 2: Date #}
|
||||
{# 3: Image #}
|
||||
{# 4: Correspondent #}
|
||||
{# 5: Tags #}
|
||||
<div class="box">
|
||||
<div class="result">
|
||||
<div class="header">
|
||||
<div class="checkbox">{{ result.0 }}</div>
|
||||
<div class="info">
|
||||
{{ result.4 }}<br />
|
||||
{{ result.1 }}
|
||||
</div>
|
||||
<div style="clear: both;"></div>
|
||||
</div>
|
||||
<div class="tags">{{ result.5 }}</div>
|
||||
<div class="date">{{ result.2 }}</div>
|
||||
<div style="clear: both;"></div>
|
||||
<div class="image">{{ result.3 }}</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
|
||||
<script>
|
||||
// We need to re-build the select-all functionality as the old logic pointed
|
||||
// to a table and we're using divs now.
|
||||
django.jQuery("#action-toggle").on("change", function(){
|
||||
django.jQuery(".grid .box .result .checkbox input")
|
||||
.prop("checked", this.checked);
|
||||
});
|
||||
</script>
|
0
src/documents/templatetags/__init__.py
Normal file
0
src/documents/templatetags/__init__.py
Normal file
41
src/documents/templatetags/hacks.py
Normal file
41
src/documents/templatetags/hacks.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import os
|
||||
|
||||
from django.contrib import admin
|
||||
from django.template import Library
|
||||
from django.template.loader import get_template
|
||||
|
||||
from ..models import Document
|
||||
|
||||
|
||||
register = Library()
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
def change_list_results(context):
    """
    Render the change-list results template appropriate for the current
    model.

    Django offers many places to override defaults, but
    `change_list_results.html` is not one that can be overridden on a
    per-model basis. The workaround used here is to define
    `change_list_results.html` in an `admin` directory, which overrides that
    file globally for *every* model. That template simply calls this tag,
    which checks whether a `Document` listing is being displayed and renders
    the matching template: the custom document one for `Document`, or the
    stock file shipped inside `django.contrib.admin` for everything else.

    Better workarounds are welcome.
    """

    if context["cl"].model == Document:
        template_name = "admin/documents/document/change_list_results.html"
    else:
        # Point straight at the stock template inside the installed
        # django.contrib.admin package.
        admin_root = os.path.dirname(admin.__file__)
        template_name = os.path.join(
            admin_root,
            "templates",
            "admin",
            "change_list_results.html"
        )

    return get_template(template_name).render(context)
|
@@ -1,13 +1,6 @@
|
||||
import os
|
||||
from unittest import mock, skipIf
|
||||
|
||||
import pyocr
|
||||
from django.test import TestCase
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from ..models import FileInfo
|
||||
from ..consumer import image_to_string, strip_excess_whitespace
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
@@ -308,71 +301,3 @@ class TestFieldPermutations(TestCase):
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class FakeTesseract(object):
|
||||
|
||||
@staticmethod
|
||||
def can_detect_orientation():
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def detect_orientation(file_handle, lang):
|
||||
raise OtherTesseractError("arbitrary status", "message")
|
||||
|
||||
@staticmethod
|
||||
def image_to_string(file_handle, lang):
|
||||
return "This is test text"
|
||||
|
||||
|
||||
class FakePyOcr(object):
|
||||
|
||||
@staticmethod
|
||||
def get_available_tools():
|
||||
return [FakeTesseract]
|
||||
|
||||
|
||||
class TestOCR(TestCase):
|
||||
|
||||
text_cases = [
|
||||
("simple string", "simple string"),
|
||||
(
|
||||
"simple newline\n testing string",
|
||||
"simple newline\ntesting string"
|
||||
),
|
||||
(
|
||||
"utf-8 строка с пробелами в конце ",
|
||||
"utf-8 строка с пробелами в конце"
|
||||
)
|
||||
]
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
|
||||
|
||||
def test_strip_excess_whitespace(self):
|
||||
for source, result in self.text_cases:
|
||||
actual_result = strip_excess_whitespace(source)
|
||||
self.assertEqual(
|
||||
result,
|
||||
actual_result,
|
||||
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
|
||||
source,
|
||||
result,
|
||||
actual_result
|
||||
)
|
||||
)
|
||||
|
||||
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
|
||||
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
|
||||
@mock.patch("documents.consumer.pyocr", FakePyOcr)
|
||||
def test_image_to_string_with_text_free_page(self):
|
||||
"""
|
||||
This test is sort of silly, since it's really just reproducing an odd
|
||||
exception thrown by pyocr when it encounters a page with no text.
|
||||
Actually running this test against an installation of Tesseract results
|
||||
in a segmentation fault rooted somewhere deep inside pyocr where I
|
||||
don't care to dig. Regardless, if you run the consumer normally,
|
||||
text-free pages are now handled correctly so long as we work around
|
||||
this weird exception.
|
||||
"""
|
||||
image_to_string(["no-text.png", "en"])
|
||||
|
@@ -1,4 +1,3 @@
|
||||
from django.contrib.auth.mixins import LoginRequiredMixin
|
||||
from django.http import HttpResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.generic import DetailView, FormView, TemplateView
|
||||
@@ -28,6 +27,7 @@ from .serialisers import (
|
||||
LogSerializer,
|
||||
TagSerializer
|
||||
)
|
||||
from .mixins import SessionOrBasicAuthMixin
|
||||
|
||||
|
||||
class IndexView(TemplateView):
|
||||
@@ -41,7 +41,7 @@ class IndexView(TemplateView):
|
||||
return TemplateView.get_context_data(self, **kwargs)
|
||||
|
||||
|
||||
class FetchView(LoginRequiredMixin, DetailView):
|
||||
class FetchView(SessionOrBasicAuthMixin, DetailView):
|
||||
|
||||
model = Document
|
||||
|
||||
@@ -74,7 +74,7 @@ class FetchView(LoginRequiredMixin, DetailView):
|
||||
return response
|
||||
|
||||
|
||||
class PushView(LoginRequiredMixin, FormView):
|
||||
class PushView(SessionOrBasicAuthMixin, FormView):
|
||||
"""
|
||||
A crude REST-ish API for creating documents.
|
||||
"""
|
||||
|
@@ -1 +1 @@
|
||||
from .checks import paths_check
|
||||
from .checks import paths_check, binaries_check
|
||||
|
@@ -1,10 +1,15 @@
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error, register, Warning
|
||||
|
||||
|
||||
@register()
|
||||
def paths_check(app_configs, **kwargs):
|
||||
"""
|
||||
Check the various paths for existence, readability and writeability
|
||||
"""
|
||||
|
||||
check_messages = []
|
||||
|
||||
@@ -44,4 +49,38 @@ def paths_check(app_configs, **kwargs):
|
||||
writeable_hint.format(directory)
|
||||
))
|
||||
|
||||
directory = os.getenv("PAPERLESS_STATICDIR")
|
||||
if directory:
|
||||
if not os.path.exists(directory):
|
||||
check_messages.append(Error(
|
||||
exists_message.format("PAPERLESS_STATICDIR"),
|
||||
exists_hint.format(directory)
|
||||
))
|
||||
if not check_messages:
|
||||
if not os.access(directory, os.W_OK | os.X_OK):
|
||||
check_messages.append(Error(
|
||||
writeable_message.format("PAPERLESS_STATICDIR"),
|
||||
writeable_hint.format(directory)
|
||||
))
|
||||
|
||||
return check_messages
|
||||
|
||||
|
||||
@register()
def binaries_check(app_configs, **kwargs):
    """
    Verify that the external programs Paperless relies on can be found.

    Returns a list with one ``Warning`` for each of the configured convert
    binary, the configured unpaper binary and ``tesseract`` that
    ``shutil.which`` cannot locate; the list is empty when all are found.
    """

    error = "Paperless can't find {}. Without it, consumption is impossible."
    hint = "Either it's not in your ${PATH} or it's not installed."

    binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")

    return [
        Warning(error.format(binary), hint)
        for binary in binaries
        if shutil.which(binary) is None
    ]
|
||||
|
@@ -14,6 +14,12 @@ import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
# Tap paperless.conf if it's available
|
||||
if os.path.exists("/etc/paperless.conf"):
|
||||
load_dotenv("/etc/paperless.conf")
|
||||
|
||||
|
||||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
@@ -21,41 +27,53 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
# Quick-start development settings - unsuitable for production
|
||||
# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
|
||||
SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee'
|
||||
# The secret key has a default that should be fine so long as you're hosting
|
||||
# Paperless on a closed network. However, if you're putting this anywhere
|
||||
# public, you should change the key to something unique and verbose.
|
||||
SECRET_KEY = os.getenv(
|
||||
"PAPERLESS_SECRET_KEY",
|
||||
"e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee"
|
||||
)
|
||||
|
||||
|
||||
# SECURITY WARNING: don't run with debug turned on in production!
|
||||
DEBUG = True
|
||||
|
||||
LOGIN_URL = '/admin/login'
|
||||
|
||||
ALLOWED_HOSTS = []
|
||||
ALLOWED_HOSTS = ["*"]
|
||||
|
||||
# Tap paperless.conf if it's available
|
||||
if os.path.exists("/etc/paperless.conf"):
|
||||
load_dotenv("/etc/paperless.conf")
|
||||
_allowed_hosts = os.getenv("PAPERLESS_ALLOWED_HOSTS")
|
||||
if _allowed_hosts:
|
||||
ALLOWED_HOSTS = _allowed_hosts.split(",")
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
INSTALLED_APPS = [
|
||||
|
||||
'django.contrib.admin',
|
||||
'django.contrib.auth',
|
||||
'django.contrib.contenttypes',
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
"django.contrib.auth",
|
||||
"django.contrib.contenttypes",
|
||||
"django.contrib.sessions",
|
||||
"django.contrib.messages",
|
||||
"django.contrib.staticfiles",
|
||||
|
||||
"django_extensions",
|
||||
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
|
||||
"flat_responsive",
|
||||
"django.contrib.admin",
|
||||
|
||||
"rest_framework",
|
||||
"crispy_forms",
|
||||
|
||||
]
|
||||
|
||||
if os.getenv("PAPERLESS_INSTALLED_APPS"):
|
||||
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
|
||||
|
||||
MIDDLEWARE_CLASSES = [
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
@@ -137,7 +155,7 @@ AUTH_PASSWORD_VALIDATORS = [
|
||||
|
||||
LANGUAGE_CODE = 'en-us'
|
||||
|
||||
TIME_ZONE = 'UTC'
|
||||
TIME_ZONE = os.getenv("PAPERLESS_TIME_ZONE", "UTC")
|
||||
|
||||
USE_I18N = True
|
||||
|
||||
@@ -149,7 +167,8 @@ USE_TZ = True
|
||||
# Static files (CSS, JavaScript, Images)
|
||||
# https://docs.djangoproject.com/en/1.9/howto/static-files/
|
||||
|
||||
STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")
|
||||
STATIC_ROOT = os.getenv(
|
||||
"PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static"))
|
||||
MEDIA_ROOT = os.getenv(
|
||||
"PAPERLESS_MEDIADIR", os.path.join(BASE_DIR, "..", "media"))
|
||||
|
||||
@@ -183,7 +202,7 @@ LOGGING = {
|
||||
|
||||
# The default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
OCR_LANGUAGE = "eng"
|
||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
|
||||
# The number of threads to use for OCR
|
||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
||||
@@ -245,3 +264,8 @@ SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
|
||||
# Trigger a script after every successful document consumption?
|
||||
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
|
||||
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
|
||||
|
||||
# The number of items on each page in the web UI. This value must be a
|
||||
# positive integer, but if you don't define one in paperless.conf, a default of
|
||||
# 100 will be used.
|
||||
PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
|
||||
|
@@ -1 +1 @@
|
||||
__version__ = (0, 3, 1)
|
||||
__version__ = (0, 3, 6)
|
||||
|
0
src/paperless_tesseract/__init__.py
Normal file
0
src/paperless_tesseract/__init__.py
Normal file
16
src/paperless_tesseract/apps.py
Normal file
16
src/paperless_tesseract/apps.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class PaperlessTesseractConfig(AppConfig):
    """
    Application configuration for the Tesseract-based document parser.
    """

    name = "paperless_tesseract"

    def ready(self):
        """
        Connect this app's consumer declaration to the
        `document_consumer_declaration` signal, then run the default
        `AppConfig.ready` behaviour.
        """

        # Imports are kept local to ready() rather than at module level.
        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)

        super().ready()
|
214
src/paperless_tesseract/parsers.py
Normal file
214
src/paperless_tesseract/parsers.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import itertools
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import langdetect
|
||||
import pyocr
|
||||
from django.conf import settings
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
from PIL import Image
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
class OCRError(Exception):
    """
    Raised by the Tesseract parser when a document cannot be OCR'd.
    """
    pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)

    `self.document_path`, `self.tempdir` and `self.log` are used throughout
    but not defined here -- presumably they come from `DocumentParser`.
    """

    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.

        Returns the path of the generated PNG inside `self.tempdir`.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):
        """
        Convert the document to greyscale page images and OCR them.

        Returns the recognised text. Raises `ParseError` (chained to the
        underlying `OCRError`) when OCR fails.
        """

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            # Chain explicitly so the original OCR failure stays visible in
            # the traceback.
            raise ParseError(e) from e

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR

        Returns a sorted list of paths to the unpaper-processed page images.
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images
        pnms = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".pnm")
        ]

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        pnms = [
            os.path.join(self.tempdir, f)
            for f in os.listdir(self.tempdir)
            if f.endswith(".unpaper.pnm")
        ]

        return sorted(filter(os.path.isfile, pnms))

    def _guess_language(self, text):
        """
        Return the language code `langdetect` reports for `text`, or `None`
        when detection raises (the error is logged as a warning).
        """
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            # Deliberately broad: any detection failure is treated as "no
            # guess" and handled by the caller.
            self.log("warning", "Language detection error: {}".format(e))
            return None

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.

        The middle page is OCR'd with the default language first and the
        result is used to guess the document language. Raises `OCRError` when
        there are no images, or when detection or the re-OCR in the guessed
        language fails and `settings.FORGIVING_OCR` is not enabled.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Floor division picks a valid index for every length >= 1,
        # including the single-page case.
        middle = len(imgs) // 2
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            # The middle page is already done in the right language; only the
            # remaining pages need OCR.
            return self._assemble_ocr_sections(imgs, middle, raw_text)

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except TesseractError:
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            )

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.

        Pages are OCR'd in a process pool, joined with single spaces and
        passed through `strip_excess_whitespace`. Returns an empty string
        when `imgs` is empty.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.

        The sections are joined with a single space (the same separator `_ocr`
        uses between pages) so that the last word of one section does not run
        into the first word of the next. Empty sections are skipped.
        """
        before = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE)
        after = self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return " ".join(section for section in (before, text, after) if section)
|
||||
|
||||
|
||||
def run_convert(*args):
    """
    Run the given command line and wait for it to finish.

    The child process receives a copy of the current environment, with
    `MAGICK_MEMORY_LIMIT` and `MAGICK_TMPDIR` added from
    `settings.CONVERT_MEMORY_LIMIT` and `settings.CONVERT_TMPDIR` whenever
    those settings are truthy.
    """

    overrides = {
        "MAGICK_MEMORY_LIMIT": settings.CONVERT_MEMORY_LIMIT,
        "MAGICK_TMPDIR": settings.CONVERT_TMPDIR,
    }

    environment = os.environ.copy()
    environment.update(
        {key: value for key, value in overrides.items() if value}
    )

    subprocess.Popen(args, env=environment).wait()
|
||||
|
||||
|
||||
def run_unpaper(args):
    """
    Run unpaper on a single image and wait for it to finish.

    `args` is an `(unpaper_binary, image_path)` pair (a single argument so the
    function can be used with `Pool.map`). The output path is the input path
    with `.unpaper` inserted before the final extension, e.g.
    `page.pnm` -> `page.unpaper.pnm`.
    """
    unpaper, pnm = args
    # Only touch the final extension: a plain str.replace would also rewrite
    # any ".pnm" that happens to occur in a directory name.
    base, extension = os.path.splitext(pnm)
    subprocess.Popen((unpaper, pnm, base + ".unpaper" + extension)).wait()
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
    """
    Normalise whitespace in OCR output.

    Three passes are applied, in order:

    1. every run of whitespace other than line breaks becomes one space;
    2. whitespace directly following a run of line breaks is removed;
    3. whitespace at the end of the text is removed.

    Line breaks themselves are left untouched. Returns the cleaned string.
    """
    # Raw strings throughout: "\S" in a normal string literal is an invalid
    # escape sequence.
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace)
    return no_trailing_whitespace
|
||||
|
||||
|
||||
def image_to_string(args):
    """
    OCR a single image and return the recognised text.

    `args` is an `(image, language)` pair. The image path is joined onto
    `RasterisedDocumentParser.SCRATCH`, and the first tool reported by
    `pyocr.get_available_tools()` does the work. When that tool can detect
    orientation the image is rotated by the detected angle first; Tesseract
    errors raised during that step are ignored and the unrotated image is
    used instead.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    image_path = os.path.join(RasterisedDocumentParser.SCRATCH, img)

    with Image.open(image_path) as source:
        page = source
        if tool.can_detect_orientation():
            try:
                orientation = tool.detect_orientation(source, lang=lang)
                page = source.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Best effort only: fall through with the image as opened.
                pass
        return tool.image_to_string(page, lang=lang)
|
23
src/paperless_tesseract/signals.py
Normal file
23
src/paperless_tesseract/signals.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import re
|
||||
|
||||
from .parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class ConsumerDeclaration(object):
    """
    Declares which files the Tesseract-based parser is able to handle.

    `handle` is the receiver connected to the consumer-declaration signal; it
    hands back `test`, which is then asked about individual documents.
    """

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence.
    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """
        Signal receiver: return the callable used to test a document.
        """
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for `doc`, or `None` if it is not ours.

        `doc` is matched against `MATCHING_FILES`; on a match the result is a
        dict with the parser class under "parser" and a "weight" of 0.
        """

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
|
0
src/paperless_tesseract/tests/__init__.py
Normal file
0
src/paperless_tesseract/tests/__init__.py
Normal file
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
80
src/paperless_tesseract/tests/test_ocr.py
Normal file
80
src/paperless_tesseract/tests/test_ocr.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import os
|
||||
from unittest import mock, skipIf
|
||||
|
||||
import pyocr
|
||||
from django.test import TestCase
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from ..parsers import image_to_string, strip_excess_whitespace
|
||||
|
||||
|
||||
class FakeTesseract(object):
    """
    Stand-in for a pyocr tool whose orientation detection always fails with
    `OtherTesseractError` and whose OCR always yields a fixed string.
    """

    @staticmethod
    def can_detect_orientation():
        # Claim support so that callers go on to call detect_orientation().
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        # Always fail, to exercise the caller's error handling.
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"
|
||||
|
||||
|
||||
class FakePyOcr(object):
    """
    Stand-in for the `pyocr` module that offers `FakeTesseract` as its only
    available tool.
    """

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]
|
||||
|
||||
|
||||
class TestOCR(TestCase):
    """
    Tests for the whitespace clean-up and image OCR helpers in the parsers
    module.
    """

    # (input, expected output) pairs for strip_excess_whitespace
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            # One sub-test per case so a failure does not hide later cases.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig. Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        image_to_string(["no-text.png", "en"])
|
@@ -5,7 +5,7 @@
|
||||
|
||||
[tox]
|
||||
skipsdist = True
|
||||
envlist = py34, py35, pep8
|
||||
envlist = py34, py35, py36, pep8
|
||||
|
||||
[testenv]
|
||||
commands = {envpython} manage.py test
|
||||
|
Reference in New Issue
Block a user