diff --git a/.gitignore b/.gitignore
index ba7466050..07b9a2f46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,15 @@ docs/_build/
 
 # PyBuilder
 target/
+
+# Stored PDFs & JPGs
+media/*
+
+# Sqlite database
+db.sqlite3
+
+# PyCharm
+.idea
+
+# Fixtures
+src/paperless/fixtures/
diff --git a/src/documents/__init__.py b/src/documents/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/admin.py b/src/documents/admin.py
new file mode 100644
index 000000000..9b515864f
--- /dev/null
+++ b/src/documents/admin.py
@@ -0,0 +1,28 @@
+from django.conf import settings
+from django.contrib import admin
+
+from .models import Document
+
+
+class DocumentAdmin(admin.ModelAdmin):
+    """Admin listing for documents, with thumbnail and PDF download columns."""
+
+    search_fields = ("sender", "title", "content",)
+    list_display = ("created", "sender", "title", "thumbnail", "pdf")
+    list_filter = ("created", "sender")
+    save_on_top = True
+
+    def thumbnail(self, obj):
+        """Return an <img> tag for the jpg the consumer stored for obj."""
+        # File name mirrors Command._store(): zero-padded, 7-digit primary key
+        return '<img src="{}documents/img/{:07}.jpg" width="100" />'.format(
+            settings.MEDIA_URL, obj.pk)
+    thumbnail.allow_tags = True
+
+    def pdf(self, obj):
+        """Return a download link to the original pdf stored for obj."""
+        return '<a href="{}documents/pdf/{:07}.pdf">Download</a>'.format(
+            settings.MEDIA_URL, obj.pk)
+    pdf.allow_tags = True
+
+admin.site.register(Document, DocumentAdmin)
diff --git a/src/documents/apps.py b/src/documents/apps.py
new file mode 100644
index 000000000..93ca7550a
--- /dev/null
+++ b/src/documents/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class DocumentsConfig(AppConfig):
+    name = 'documents'
diff --git a/src/documents/management/__init__.py b/src/documents/management/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/management/commands/__init__.py b/src/documents/management/commands/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/management/commands/consume.py b/src/documents/management/commands/consume.py
new file mode 100644
index 000000000..45bdba464
--- /dev/null
+++ b/src/documents/management/commands/consume.py
@@ -0,0 +1,158 @@
+import glob
+import os
+import random
+import re
+import shutil
+import subprocess
+
+import pyocr
+
+from PIL import Image
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from documents.models import Document
+
+
+class Command(BaseCommand):
+    """
+    Loop over every pdf found in CONSUMPTION_DIR and:
+      1. Convert it to greyscale pngs
+      2. Convert it to full-colour jpgs
+      3. Run the first available pyocr tool on the pngs
+      4. Store the OCR'd text in the database, and move the first jpg and
+         the original pdf into MEDIA_ROOT, named after the document's pk
+      5. Delete the remaining images from SCRATCH_DIR
+    """
+
+    CONVERT = settings.CONVERT_BINARY
+    SCRATCH = settings.SCRATCH_DIR
+    CONSUME = settings.CONSUMPTION_DIR
+
+    OCR = pyocr.get_available_tools()[0]
+
+    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
+    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
+
+    def __init__(self, *args, **kwargs):
+        self.verbosity = 0
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    def handle(self, *args, **options):
+        """Consume every pdf currently sitting in CONSUMPTION_DIR."""
+
+        self.verbosity = options["verbosity"]
+
+        self._setup()
+
+        for pdf in os.listdir(self.CONSUME):
+
+            if not os.path.isfile(os.path.join(self.CONSUME, pdf)):
+                continue
+
+            # Scanners often write upper-case extensions, so ignore case
+            if not pdf.lower().endswith(".pdf"):
+                continue
+
+            if self.verbosity > 1:
+                print("Consuming {}".format(pdf))
+
+            pdf = os.path.join(self.CONSUME, pdf)
+            pngs = self._get_greyscale(pdf)
+            jpgs = self._get_colour(pdf)
+
+            # convert's exit status isn't checked, so a failed conversion
+            # shows up here as an empty list.  Leave the pdf where it is
+            # rather than crashing on jpgs[0] after creating the Document.
+            if not (pngs and jpgs):
+                self.stderr.write("Could not convert {}".format(pdf))
+                self._cleanup(pngs, jpgs)
+                continue
+
+            text = self._get_ocr(pngs)
+
+            self._store(text, jpgs, pdf)
+            self._cleanup(pngs, jpgs)
+
+    def _setup(self):
+        """Create the scratch and media directories if they're missing."""
+        for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
+            os.makedirs(d, exist_ok=True)
+
+    @staticmethod
+    def _page_number(path):
+        """
+        Return the page index from a "<id>-<page>.<ext>" scratch file name,
+        or 0 for the un-suffixed name used for a single-page pdf.
+        """
+        m = re.search(r"-(\d+)\.\w+$", path)
+        return int(m.group(1)) if m else 0
+
+    def _get_greyscale(self, pdf):
+        """
+        Render pdf as 300dpi, 8-bit greyscale pngs in SCRATCH_DIR and return
+        their paths in page order.
+        """
+
+        i = random.randint(1000000, 4999999)
+        png = os.path.join(self.SCRATCH, "{}.png".format(i))
+
+        subprocess.Popen((
+            self.CONVERT, "-density", "300", "-depth", "8",
+            "-type", "grayscale", pdf, png
+        )).wait()
+
+        # A plain sort would put page 10 before page 2
+        return sorted(
+            glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))),
+            key=self._page_number)
+
+    def _get_colour(self, pdf):
+        """
+        Render pdf as full-colour jpgs in SCRATCH_DIR and return their paths
+        in page order.
+        """
+
+        i = random.randint(5000000, 9999999)
+        jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))
+
+        subprocess.Popen((self.CONVERT, pdf, jpg)).wait()
+
+        return sorted(
+            glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))),
+            key=self._page_number)
+
+    def _get_ocr(self, pngs):
+        """Return the OCR'd text of every png, in the order given."""
+
+        r = ""
+        for png in pngs:
+            # The paths from _get_greyscale() already include SCRATCH_DIR
+            with Image.open(png) as f:
+                r += self.OCR.image_to_string(f)
+                r += "\n\n\n\n\n\n\n\n"
+
+        return r
+
+    def _store(self, text, jpgs, pdf):
+        """
+        Create the Document, then move the first jpg and the pdf into
+        MEDIA_ROOT, both named after the document's zero-padded pk.
+        """
+
+        doc = Document.objects.create(content=text)
+
+        shutil.move(jpgs[0], os.path.join(
+            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
+        shutil.move(pdf, os.path.join(
+            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
+
+    def _cleanup(self, pngs, jpgs):
+        """Delete whichever of the given scratch images still exist."""
+
+        # jpgs[0] has normally been moved by _store() already, and either
+        # list may be empty if convert failed
+        for f in pngs + jpgs:
+            if os.path.exists(f):
+                os.unlink(f)
diff --git a/src/documents/migrations/0001_initial.py b/src/documents/migrations/0001_initial.py
new file mode 100644
index 000000000..56a4dd58a
--- /dev/null
+++ b/src/documents/migrations/0001_initial.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2015-12-20 19:10
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Document',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('sender', models.CharField(blank=True, db_index=True, max_length=128)),
+                ('title', models.CharField(blank=True, db_index=True, max_length=128)),
+                ('content', models.TextField(db_index=True)),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('modified', models.DateTimeField(auto_now=True)),
+            ],
+        ),
+    ]
diff --git a/src/documents/migrations/__init__.py b/src/documents/migrations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/models.py b/src/documents/models.py
new file mode 100644
index 000000000..a1b00e17e
--- /dev/null
+++ b/src/documents/models.py
@@ -0,0 +1,10 @@
+from django.db import models
+
+
+class Document(models.Model):
+
+    sender = models.CharField(max_length=128, blank=True, db_index=True)
+    title = models.CharField(max_length=128, blank=True, db_index=True)
+    content = models.TextField(db_index=True)
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
diff --git a/src/documents/tests.py b/src/documents/tests.py
new file mode 100644
index 000000000..7ce503c2d
--- /dev/null
+++ b/src/documents/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/src/documents/views.py b/src/documents/views.py
new file mode 100644
index 000000000..91ea44a21
--- /dev/null
+++ b/src/documents/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/src/manage.py b/src/manage.py
new file mode 100755
index 000000000..99d61722e
--- /dev/null
+++ b/src/manage.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import os
+import sys
+
+if __name__ == "__main__":
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+
+    from django.core.management import execute_from_command_line
+
+    execute_from_command_line(sys.argv)
diff --git a/src/paperless/__init__.py b/src/paperless/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
new file mode 100644
index 000000000..32e9ee36d
--- /dev/null
+++ b/src/paperless/settings.py
@@ -0,0 +1,137 @@
+"""
+Django settings for paperless project.
+
+Generated by 'django-admin startproject' using Django 1.9.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.9/ref/settings/
+"""
+
+import os
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+
+    "django_extensions",
+
+    "documents",
+]
+
+MIDDLEWARE_CLASSES = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'paperless.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'paperless.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/1.9/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/1.9/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/1.9/howto/static-files/
+
+STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")
+MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media")
+
+STATIC_URL = '/static/'
+MEDIA_URL = "/media/"
+
+
+# Paperless-specific stuffs
+# Change these paths if yours are different
+
+CONVERT_BINARY = "/usr/bin/convert"
+SCRATCH_DIR = "/tmp/paperless"  # Will be created if it doesn't exist
+CONSUMPTION_DIR = "/tmp/paperless/consume"
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
new file mode 100644
index 000000000..42e9c16af
--- /dev/null
+++ b/src/paperless/urls.py
@@ -0,0 +1,23 @@
+"""paperless URL Configuration
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/1.9/topics/http/urls/
+Examples:
+Function views
+    1. Add an import: from my_app import views
+    2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
+Class-based views
+    1. Add an import: from other_app.views import Home
+    2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
+Including another URLconf
+    1. Add an import: from blog import urls as blog_urls
+    2. Import the include() function: from django.conf.urls import url, include
+    3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
+"""
+from django.conf import settings
+from django.conf.urls import url, static
+from django.contrib import admin
+
+urlpatterns = [
+    url(r'^admin/', admin.site.urls),
+] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
diff --git a/src/paperless/wsgi.py b/src/paperless/wsgi.py
new file mode 100644
index 000000000..9bf10da62
--- /dev/null
+++ b/src/paperless/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for paperless project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+
+application = get_wsgi_application()