It works!

This commit is contained in:
Daniel Quinn 2015-12-20 19:23:33 +00:00
parent cbc8c25f3b
commit 855ee64097
17 changed files with 392 additions and 0 deletions

12
.gitignore vendored
View File

@ -55,3 +55,15 @@ docs/_build/
# PyBuilder
target/
# Stored PDFs & JPGs
media/*
# Sqlite database
db.sqlite3
# PyCharm
.idea
# Fixtures
src/paperless/fixtures/

View File

24
src/documents/admin.py Normal file
View File

@ -0,0 +1,24 @@
from django.conf import settings
from django.contrib import admin
from django.utils.html import format_html

from .models import Document
class DocumentAdmin(admin.ModelAdmin):
    """Admin list configuration for ``Document``.

    Adds search and filter options plus two computed columns: a thumbnail
    image and a link to the stored PDF.
    """

    search_fields = ("sender", "title", "content",)
    list_display = ("created", "sender", "title", "thumbnail", "pdf")
    list_filter = ("created", "sender")
    save_on_top = True

    def thumbnail(self, obj):
        """Return an ``<img>`` tag pointing at the document's JPEG.

        The file name is the primary key zero-padded to seven digits, under
        ``documents/img/`` below ``settings.MEDIA_URL``.
        """
        # The key is padded *before* it reaches format_html(): format_html()
        # escapes its arguments into strings first, so a numeric spec such as
        # ``{:07}`` in the template would be applied to a string, not the
        # integer.
        return format_html(
            '<img src="{}documents/img/{}.jpg" width="100" />',
            settings.MEDIA_URL, "{:07}".format(obj.pk))

    def pdf(self, obj):
        """Return a "Download" link pointing at the document's PDF.

        The file name is the primary key zero-padded to seven digits, under
        ``documents/pdf/`` below ``settings.MEDIA_URL``.
        """
        # format_html() marks the result as safe, which replaces the
        # deprecated ``allow_tags = True`` attribute.
        return format_html(
            '<a href="{}documents/pdf/{}.pdf">Download</a>',
            settings.MEDIA_URL, "{:07}".format(obj.pk))


admin.site.register(Document, DocumentAdmin)

5
src/documents/apps.py Normal file
View File

@ -0,0 +1,5 @@
from django.apps import AppConfig
class DocumentsConfig(AppConfig):
    """Application configuration for the ``documents`` app."""

    name = "documents"

View File

View File

@ -0,0 +1,122 @@
import glob
import os
import random
import re
import shutil
import subprocess
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand
from documents.models import Document
class Command(BaseCommand):
    """
    Loop over every pdf found in CONSUMPTION_DIR and:
      1. Convert it to greyscale pngs
      2. Convert it to full-colour jpgs
      3. Run the OCR tool over the pngs
      4. Store the OCR'd text in the database, and move the first jpg and the
         original pdf into the media directories, named after the new record
      5. Delete the images left behind in the scratch directory
    """

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # First OCR backend reported by pyocr; looked up when the class is defined.
    OCR = pyocr.get_available_tools()[0]

    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """Consume every regular ``*.pdf`` file directly inside CONSUME.

        A pdf for which the conversion produced no images is reported on
        stderr and left in place; the remaining files are still processed.
        """
        self.verbosity = options["verbosity"]

        self._setup()

        for name in os.listdir(self.CONSUME):

            pdf = os.path.join(self.CONSUME, name)

            if not os.path.isfile(pdf):
                continue

            if not name.endswith(".pdf"):
                continue

            if self.verbosity > 1:
                self.stdout.write("Consuming {}".format(name))

            pngs = self._get_greyscale(pdf)
            jpgs = self._get_colour(pdf)

            # The scratch images are removed whatever happens below, so a
            # failing OCR run or database write does not leave them behind.
            try:
                if not pngs or not jpgs:
                    # Without this guard an empty record would be created
                    # before _store() failed on the missing jpg.
                    self.stderr.write(
                        "Skipping {}: no images were produced".format(name))
                    continue
                text = self._get_ocr(pngs)
                self._store(text, jpgs, pdf)
            finally:
                self._cleanup(pngs, jpgs)

    def _setup(self):
        """Create the scratch and media directories if they are missing."""
        for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    def _get_greyscale(self, pdf):
        """Convert *pdf* into 300 dpi, 8-bit greyscale pngs in SCRATCH.

        Returns the paths of the files written, in page order (an empty list
        if nothing was written).
        """
        i = random.randint(1000000, 4999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", pdf, png
        ))

        return self._scratch_files(i)

    def _get_colour(self, pdf):
        """Convert *pdf* into jpgs in SCRATCH using convert's defaults.

        Returns the paths of the files written, in page order (an empty list
        if nothing was written).
        """
        i = random.randint(5000000, 9999999)
        jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))

        subprocess.call((self.CONVERT, pdf, jpg))

        return self._scratch_files(i)

    def _scratch_files(self, prefix):
        """Return the SCRATCH files whose names start with *prefix*, ordered
        by page number."""
        paths = glob.glob(os.path.join(self.SCRATCH, "{}*".format(prefix)))
        return sorted(paths, key=self._page_key)

    @staticmethod
    def _page_key(path):
        """Sort key that orders ``<prefix>-<page>.<ext>`` names numerically.

        A plain string sort would put page 10 before page 2.  Names without a
        ``-<page>`` suffix (single-file output) count as page 0.
        """
        match = re.search(r"-(\d+)\.[^.]+$", path)
        page = int(match.group(1)) if match else 0
        return page, path

    def _get_ocr(self, pngs):
        """Run the OCR tool over each image and return the combined text.

        Each page's text is followed by eight newlines.
        """
        pages = []
        for png in pngs:
            # The paths already include SCRATCH, so they are opened as given.
            with Image.open(png) as f:
                pages.append(self.OCR.image_to_string(f))

        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)

    def _store(self, text, jpgs, pdf):
        """Create a Document holding *text* and file its images.

        The first jpg and the pdf are moved into MEDIA_IMG and MEDIA_PDF,
        named after the new record's primary key zero-padded to seven digits.
        *jpgs* must not be empty.
        """
        doc = Document.objects.create(content=text)

        shutil.move(jpgs[0], os.path.join(
            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
        shutil.move(pdf, os.path.join(
            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))

    def _cleanup(self, pngs, jpgs):
        """Delete whichever of the given scratch images still exist."""
        for path in list(pngs) + list(jpgs):
            try:
                os.unlink(path)
            except FileNotFoundError:
                # Expected for the jpg that _store() moved into the media
                # directory.
                pass

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2015-12-20 19:10
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the ``Document`` model."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Document",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "sender",
                    models.CharField(
                        blank=True, db_index=True, max_length=128
                    ),
                ),
                (
                    "title",
                    models.CharField(
                        blank=True, db_index=True, max_length=128
                    ),
                ),
                ("content", models.TextField(db_index=True)),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
        ),
    ]

View File

10
src/documents/models.py Normal file
View File

@ -0,0 +1,10 @@
from django.db import models
class Document(models.Model):
    """A stored document: its text content plus optional sender and title."""

    # Both labels may be left blank.
    sender = models.CharField(max_length=128, blank=True, db_index=True)
    title = models.CharField(max_length=128, blank=True, db_index=True)

    # NOTE(review): an ordinary index on an unbounded text column is rejected
    # or size-limited by some database backends -- confirm before moving off
    # SQLite.  Changing it requires a new migration.
    content = models.TextField(db_index=True)

    # Set once on insert / refreshed on every save, respectively.
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    def __str__(self):
        """Return "sender - title", whichever of the two is set, or a
        fallback built from the primary key when both are blank."""
        label = " - ".join(
            part for part in (self.sender, self.title) if part)
        return label or "Document {}".format(self.pk)

3
src/documents/tests.py Normal file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

3
src/documents/views.py Normal file
View File

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

10
src/manage.py Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env python
import os
import sys
def main():
    """Select the default settings module and run the requested command."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")

    from django.core.management import execute_from_command_line

    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()

View File

137
src/paperless/settings.py Normal file
View File

@ -0,0 +1,137 @@
"""
Django settings for paperless project.
Generated by 'django-admin startproject' using Django 1.9.
For more information on this file, see
https://docs.djangoproject.com/en/1.9/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.9/ref/settings/
"""
import os
# Directory two levels above this file; build project paths from it with
# os.path.join(BASE_DIR, ...).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/

# SECURITY WARNING: the key used in production must be kept secret!
SECRET_KEY = "e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee"

# SECURITY WARNING: never run production with debug switched on!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",
    "django_extensions",
    "documents",
]

MIDDLEWARE_CLASSES = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.auth.middleware.SessionAuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "paperless.urls"

TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "paperless.wsgi.application"


# Database
# https://docs.djangoproject.com/en/1.9/ref/settings/#databases

DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": os.path.join(BASE_DIR, "db.sqlite3"),
    }
}


# Password validation
# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
    },
]


# Internationalization
# https://docs.djangoproject.com/en/1.9/topics/i18n/

LANGUAGE_CODE = "en-us"

TIME_ZONE = "UTC"

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images) and uploaded media
# https://docs.djangoproject.com/en/1.9/howto/static-files/

# Both roots sit one directory above BASE_DIR.
STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")
MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media")

STATIC_URL = "/static/"
MEDIA_URL = "/media/"


# Paperless-specific settings
# Adjust these paths if your system differs.

CONVERT_BINARY = "/usr/bin/convert"
SCRATCH_DIR = "/tmp/paperless"  # Created on demand if it is missing
CONSUMPTION_DIR = "/tmp/paperless/consume"

23
src/paperless/urls.py Normal file
View File

@ -0,0 +1,23 @@
"""paperless URL Configuration
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/1.9/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
Including another URLconf
1. Add an import: from blog import urls as blog_urls
2. Import the include() function: from django.conf.urls import url, include
3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
"""
from django.conf import settings
from django.conf.urls import url, static
from django.contrib import admin
# Admin site routes, followed by the routes static() builds for MEDIA_URL.
urlpatterns = [
    url(r"^admin/", admin.site.urls),
]
urlpatterns += static.static(
    settings.MEDIA_URL, document_root=settings.MEDIA_ROOT
)

16
src/paperless/wsgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
WSGI config for paperless project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already
# set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
# Module-level WSGI callable named in this module's docstring.
application = get_wsgi_application()