diff --git a/.gitignore b/.gitignore
index ba7466050..07b9a2f46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,15 @@ docs/_build/
# PyBuilder
target/
+
+# Stored PDFs & JPGs
+media/*
+
+# Sqlite database
+db.sqlite3
+
+# PyCharm
+.idea
+
+# Fixtures
+src/paperless/fixtures/
diff --git a/src/documents/__init__.py b/src/documents/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/admin.py b/src/documents/admin.py
new file mode 100644
index 000000000..9b515864f
--- /dev/null
+++ b/src/documents/admin.py
@@ -0,0 +1,33 @@
+from django.conf import settings
+from django.contrib import admin
+
+from .models import Document
+
+
+class DocumentAdmin(admin.ModelAdmin):
+    """Admin list/search configuration for stored documents."""
+
+    search_fields = ("sender", "title", "content",)
+    list_display = ("created", "sender", "title", "thumbnail", "pdf")
+    list_filter = ("created", "sender")
+    save_on_top = True
+
+    def thumbnail(self, obj):
+        """Return an <img> tag pointing at the document's stored jpg."""
+        # NOTE(review): the markup had been stripped from this literal; it is
+        # rebuilt from the "documents/img/<pk:07>.jpg" layout that the consume
+        # command writes. The width attribute is an assumption -- confirm.
+        return '<img src="{}documents/img/{:07}.jpg" width="100" />'.format(
+            settings.MEDIA_URL, obj.pk)
+    # Lets the admin changelist render the returned string as HTML
+    thumbnail.allow_tags = True
+
+    def pdf(self, obj):
+        """Return a "Download" link to the document's stored pdf."""
+        # NOTE(review): anchor markup rebuilt the same way; only the link
+        # text "Download" survived in the original literal.
+        return '<a href="{}documents/pdf/{:07}.pdf">Download</a>'.format(
+            settings.MEDIA_URL, obj.pk)
+    pdf.allow_tags = True
+
+admin.site.register(Document, DocumentAdmin)
diff --git a/src/documents/apps.py b/src/documents/apps.py
new file mode 100644
index 000000000..93ca7550a
--- /dev/null
+++ b/src/documents/apps.py
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class DocumentsConfig(AppConfig):  # Application configuration for the "documents" app
+ name = 'documents'  # Python path of the application this config applies to
diff --git a/src/documents/management/__init__.py b/src/documents/management/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/management/commands/__init__.py b/src/documents/management/commands/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/management/commands/consume.py b/src/documents/management/commands/consume.py
new file mode 100644
index 000000000..45bdba464
--- /dev/null
+++ b/src/documents/management/commands/consume.py
@@ -0,0 +1,150 @@
+import glob
+import os
+import random
+import re
+import shutil
+import subprocess
+
+import pyocr
+
+from PIL import Image
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from documents.models import Document
+
+
+class Command(BaseCommand):
+    """
+    Loop over every pdf found in CONSUMPTION_DIR and:
+      1. Convert it to greyscale pngs, one per page
+      2. Convert it to full-colour jpgs, one per page
+      3. Run the OCR tool over the pngs
+      4. Store the OCR'd text in the database, then move the first jpg and
+         the original pdf into MEDIA_ROOT, both named after the new row's pk
+      5. Delete whatever page images are left in SCRATCH_DIR
+    """
+
+    CONVERT = settings.CONVERT_BINARY
+    SCRATCH = settings.SCRATCH_DIR
+    CONSUME = settings.CONSUMPTION_DIR
+
+    # NOTE(review): evaluated at import time; raises IndexError when pyocr
+    # finds no OCR tool on this machine.
+    OCR = pyocr.get_available_tools()[0]
+
+    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
+    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
+
+    def __init__(self, *args, **kwargs):
+        self.verbosity = 0
+        super().__init__(*args, **kwargs)
+
+    def handle(self, *args, **options):
+        """Consume every pdf that sits directly inside CONSUMPTION_DIR."""
+
+        self.verbosity = options["verbosity"]
+
+        self._setup()
+
+        for name in os.listdir(self.CONSUME):
+
+            pdf = os.path.join(self.CONSUME, name)
+
+            if not os.path.isfile(pdf):
+                continue
+
+            # Match ".PDF" as well as ".pdf"
+            if not name.lower().endswith(".pdf"):
+                continue
+
+            if self.verbosity > 1:
+                print("Consuming {}".format(name))
+
+            pngs = self._get_greyscale(pdf)
+            jpgs = self._get_colour(pdf)
+
+            try:
+                if not pngs or not jpgs:
+                    # convert produced nothing: leave the pdf where it is
+                    # instead of crashing on jpgs[0] in _store()
+                    self.stderr.write(
+                        "Could not convert {}, skipping".format(pdf))
+                    continue
+                self._store(self._get_ocr(pngs), jpgs, pdf)
+            finally:
+                # Remove the page images even if OCR or storing failed
+                self._cleanup(pngs, jpgs)
+
+    def _setup(self):
+        """Create the scratch and media directories if they are missing."""
+        for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
+            os.makedirs(d, exist_ok=True)
+
+    @staticmethod
+    def _page_number(path):
+        """Return the "-<n>" page index in a converted file name, else 0."""
+        match = re.search(r"-(\d+)\.[^.]+$", os.path.basename(path))
+        return int(match.group(1)) if match else 0
+
+    def _pages(self, i):
+        """Return the SCRATCH files whose names start with ``i``, by page."""
+        found = glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))
+        # A plain sort would put "-10" before "-2" and shuffle the OCR text
+        return sorted(found, key=self._page_number)
+
+    def _get_greyscale(self, pdf):
+        """Render ``pdf`` to 300dpi 8-bit greyscale pngs; return the paths."""
+
+        i = random.randint(1000000, 4999999)
+        png = os.path.join(self.SCRATCH, "{}.png".format(i))
+
+        subprocess.call((
+            self.CONVERT, "-density", "300", "-depth", "8",
+            "-type", "grayscale", pdf, png
+        ))
+
+        return self._pages(i)
+
+    def _get_colour(self, pdf):
+        """Render ``pdf`` to jpgs with convert's defaults; return the paths."""
+
+        # Range does not overlap _get_greyscale's, so the prefixes differ
+        i = random.randint(5000000, 9999999)
+        jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))
+
+        subprocess.call((self.CONVERT, pdf, jpg))
+
+        return self._pages(i)
+
+    def _get_ocr(self, pngs):
+        """Return the OCR'd text of ``pngs``, 8 newlines after each page."""
+
+        pages = []
+        for png in pngs:
+            # The paths already include SCRATCH; joining it on again breaks
+            # when SCRATCH_DIR is a relative path
+            with Image.open(png) as f:
+                pages.append(self.OCR.image_to_string(f))
+
+        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)
+
+    def _store(self, text, jpgs, pdf):
+        """Create the Document row and file its first-page jpg and its pdf."""
+
+        doc = Document.objects.create(content=text)
+
+        shutil.move(jpgs[0], os.path.join(
+            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
+        shutil.move(pdf, os.path.join(
+            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
+
+    def _cleanup(self, pngs, jpgs):
+        """Delete the listed page images, skipping any already moved away."""
+
+        for f in list(pngs) + list(jpgs):
+            try:
+                os.unlink(f)
+            except FileNotFoundError:
+                pass
diff --git a/src/documents/migrations/0001_initial.py b/src/documents/migrations/0001_initial.py
new file mode 100644
index 000000000..56a4dd58a
--- /dev/null
+++ b/src/documents/migrations/0001_initial.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9 on 2015-12-20 19:10
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # Initial schema for the documents app: creates the Document model
+
+ initial = True
+
+ dependencies = [
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='Document',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('sender', models.CharField(blank=True, db_index=True, max_length=128)),
+ ('title', models.CharField(blank=True, db_index=True, max_length=128)),
+ ('content', models.TextField(db_index=True)),  # NOTE(review): index on an unbounded text column -- confirm the target database accepts it
+ ('created', models.DateTimeField(auto_now_add=True)),
+ ('modified', models.DateTimeField(auto_now=True)),
+ ],
+ ),
+ ]
diff --git a/src/documents/migrations/__init__.py b/src/documents/migrations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/models.py b/src/documents/models.py
new file mode 100644
index 000000000..a1b00e17e
--- /dev/null
+++ b/src/documents/models.py
@@ -0,0 +1,10 @@
+from django.db import models
+
+
+class Document(models.Model):  # One consumed document: its OCR'd text plus optional sender/title metadata
+
+ sender = models.CharField(max_length=128, blank=True, db_index=True)  # optional, indexed
+ title = models.CharField(max_length=128, blank=True, db_index=True)  # optional, indexed
+ content = models.TextField(db_index=True)  # OCR text written by the consume command; NOTE(review): indexing a full text column may not suit every database backend -- confirm
+ created = models.DateTimeField(auto_now_add=True)  # set once, when the row is first saved
+ modified = models.DateTimeField(auto_now=True)  # refreshed on every save
diff --git a/src/documents/tests.py b/src/documents/tests.py
new file mode 100644
index 000000000..7ce503c2d
--- /dev/null
+++ b/src/documents/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/src/documents/views.py b/src/documents/views.py
new file mode 100644
index 000000000..91ea44a21
--- /dev/null
+++ b/src/documents/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/src/manage.py b/src/manage.py
new file mode 100755
index 000000000..99d61722e
--- /dev/null
+++ b/src/manage.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+import os
+import sys
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")  # a value already present in the environment is kept
+
+ from django.core.management import execute_from_command_line  # imported only after the settings module name is in place
+
+ execute_from_command_line(sys.argv)  # hand the command-line arguments to Django's command dispatcher
diff --git a/src/paperless/__init__.py b/src/paperless/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
new file mode 100644
index 000000000..32e9ee36d
--- /dev/null
+++ b/src/paperless/settings.py
@@ -0,0 +1,137 @@
+"""
+Django settings for paperless project.
+
+Generated by 'django-admin startproject' using Django 1.9.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.9/ref/settings/
+"""
+
+import os
+
+# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # two directories up from this file, i.e. the one holding the "paperless" package
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee'  # NOTE(review): this value is committed to the repository; replace it with a private one for any real deployment
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+ 'django.contrib.admin',
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ 'django.contrib.messages',
+ 'django.contrib.staticfiles',
+
+ "django_extensions",  # third-party package; must be installed separately
+
+ "documents",  # this project's own app (model, admin, consume command)
+]
+
+MIDDLEWARE_CLASSES = [
+ 'django.middleware.security.SecurityMiddleware',
+ 'django.contrib.sessions.middleware.SessionMiddleware',
+ 'django.middleware.common.CommonMiddleware',
+ 'django.middleware.csrf.CsrfViewMiddleware',
+ 'django.contrib.auth.middleware.AuthenticationMiddleware',
+ 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
+ 'django.contrib.messages.middleware.MessageMiddleware',
+ 'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'paperless.urls'  # module whose urlpatterns route incoming requests
+
+TEMPLATES = [
+ {
+ 'BACKEND': 'django.template.backends.django.DjangoTemplates',
+ 'DIRS': [],  # no project-level template directories
+ 'APP_DIRS': True,  # look for templates inside each installed app
+ 'OPTIONS': {
+ 'context_processors': [
+ 'django.template.context_processors.debug',
+ 'django.template.context_processors.request',
+ 'django.contrib.auth.context_processors.auth',
+ 'django.contrib.messages.context_processors.messages',
+ ],
+ },
+ },
+]
+
+WSGI_APPLICATION = 'paperless.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/1.9/ref/settings/#databases
+
+DATABASES = {
+ 'default': {
+ 'ENGINE': 'django.db.backends.sqlite3',
+ 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),  # single-file database stored directly in BASE_DIR
+ }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+ {
+ 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+ },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/1.9/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_L10N = True
+
+USE_TZ = True  # datetimes are handled as timezone-aware values
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/1.9/howto/static-files/
+
+STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")  # sibling of BASE_DIR, one level up
+MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media")  # the consume command files jpgs and pdfs under "documents/" in here
+
+STATIC_URL = '/static/'
+MEDIA_URL = "/media/"
+
+
+# Paperless-specific settings
+# Change these paths if yours are different
+
+CONVERT_BINARY = "/usr/bin/convert"  # executable the consume command runs to turn pdfs into images
+SCRATCH_DIR = "/tmp/paperless" # Will be created if it doesn't exist
+CONSUMPTION_DIR = "/tmp/paperless/consume"  # directory the consume command scans for pdf files
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
new file mode 100644
index 000000000..42e9c16af
--- /dev/null
+++ b/src/paperless/urls.py
@@ -0,0 +1,23 @@
+"""paperless URL Configuration
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+ https://docs.djangoproject.com/en/1.9/topics/http/urls/
+Examples:
+Function views
+ 1. Add an import: from my_app import views
+ 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
+Class-based views
+ 1. Add an import: from other_app.views import Home
+ 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
+Including another URLconf
+ 1. Add an import: from blog import urls as blog_urls
+ 2. Import the include() function: from django.conf.urls import url, include
+ 3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
+"""
+from django.conf import settings
+from django.conf.urls import url, static
+from django.contrib import admin
+
+urlpatterns = [
+ url(r'^admin/', admin.site.urls),  # the Django admin is the only application route
+] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)  # also serve files from MEDIA_ROOT under MEDIA_URL; NOTE(review): Django documents this helper as development-only -- confirm production serves media another way
diff --git a/src/paperless/wsgi.py b/src/paperless/wsgi.py
new file mode 100644
index 000000000..9bf10da62
--- /dev/null
+++ b/src/paperless/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for paperless project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")  # default only; a value already set in the environment is kept
+
+application = get_wsgi_application()  # the module-level WSGI callable described in the docstring above