mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
It works!
This commit is contained in:
parent
cbc8c25f3b
commit
855ee64097
12
.gitignore
vendored
12
.gitignore
vendored
@ -55,3 +55,15 @@ docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Stored PDFs & JPGs
|
||||
media/*
|
||||
|
||||
# Sqlite database
|
||||
db.sqlite3
|
||||
|
||||
# PyCharm
|
||||
.idea
|
||||
|
||||
# Fixtures
|
||||
src/paperless/fixtures/
|
||||
|
0
src/documents/__init__.py
Normal file
0
src/documents/__init__.py
Normal file
24
src/documents/admin.py
Normal file
24
src/documents/admin.py
Normal file
@ -0,0 +1,24 @@
|
||||
from django.conf import settings
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import Document
|
||||
|
||||
|
||||
class DocumentAdmin(admin.ModelAdmin):
    """
    Admin configuration for ``Document``.

    The change list shows a thumbnail image and a download link for each
    document.  Both point below ``settings.MEDIA_URL`` at files named after
    the document's primary key, zero-padded to seven digits.
    """

    search_fields = ("sender", "title", "content",)
    list_display = ("created", "sender", "title", "thumbnail", "pdf")
    list_filter = ("created", "sender")
    save_on_top = True

    @staticmethod
    def _media_html(template, obj):
        """
        Fill ``template`` with MEDIA_URL and the padded primary key of ``obj``.

        ``template`` must contain two ``{}`` placeholders.  The result is
        marked safe by ``format_html`` so the admin renders it as markup
        without relying on the deprecated ``allow_tags`` attribute.
        """
        # Imported here so the block does not depend on the module's import
        # header being edited as well.
        from django.utils.html import format_html

        # format_html() turns its arguments into strings before formatting,
        # so the numeric padding has to be applied up front.
        return format_html(
            template, settings.MEDIA_URL, "{:07}".format(obj.pk))

    def thumbnail(self, obj):
        """Return an ``<img>`` tag for the document's stored jpg."""
        return self._media_html(
            '<img src="{}documents/img/{}.jpg" width="100" />', obj)

    def pdf(self, obj):
        """Return a download link for the document's stored pdf."""
        return self._media_html(
            '<a href="{}documents/pdf/{}.pdf">Download</a>', obj)


admin.site.register(Document, DocumentAdmin)
|
5
src/documents/apps.py
Normal file
5
src/documents/apps.py
Normal file
@ -0,0 +1,5 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class DocumentsConfig(AppConfig):
    """Application configuration for the ``documents`` app."""

    name = "documents"
|
0
src/documents/management/__init__.py
Normal file
0
src/documents/management/__init__.py
Normal file
0
src/documents/management/commands/__init__.py
Normal file
0
src/documents/management/commands/__init__.py
Normal file
122
src/documents/management/commands/consume.py
Normal file
122
src/documents/management/commands/consume.py
Normal file
@ -0,0 +1,122 @@
|
||||
import glob
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
import pyocr
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Consume every pdf found in CONSUMPTION_DIR.

    For each file whose name ends in ``.pdf`` (any letter case):
      1. Convert it to greyscale png images in SCRATCH_DIR
      2. Convert it to full-colour jpg images in SCRATCH_DIR
      3. Run the OCR tool over the pngs, in page order
      4. Store the OCR'd text in a Document row, then move the first jpg and
         the original pdf below MEDIA_ROOT, named after the row's primary key
      5. Delete whatever images are left in SCRATCH_DIR
    """

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # First OCR tool reported by pyocr.  Evaluated when the class is defined,
    # so an empty tool list raises IndexError while the command is imported.
    OCR = pyocr.get_available_tools()[0]

    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """
        Process every pdf currently sitting in the consumption directory.

        A file for which the converter produced no images is reported on
        stderr and left where it is; the remaining files are still processed.
        Scratch images are removed even when OCR or storing fails.
        """
        self.verbosity = options["verbosity"]

        self._setup()

        for name in os.listdir(self.CONSUME):

            pdf = os.path.join(self.CONSUME, name)

            if not os.path.isfile(pdf):
                continue

            # Scanners frequently write upper-case extensions
            if not name.lower().endswith(".pdf"):
                continue

            if self.verbosity > 1:
                print("Consuming {}".format(name))

            pngs, jpgs = [], []
            try:
                pngs = self._get_greyscale(pdf)
                jpgs = self._get_colour(pdf)

                if not pngs or not jpgs:
                    self.stderr.write(
                        "Skipping {}: {} produced no images".format(
                            name, self.CONVERT))
                    continue

                self._store(self._get_ocr(pngs), jpgs, pdf)
            finally:
                self._cleanup(pngs, jpgs)

    def _setup(self):
        """Create the scratch and media directories if they are missing."""
        for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    @staticmethod
    def _page_number(path):
        """
        Return the page index encoded in a scratch file name.

        Files are expected to be named ``<stem>.<ext>`` or
        ``<stem>-<page>.<ext>``; a name without a page suffix counts as page 0.
        """
        match = re.search(r"-(\d+)\.[^.]+$", os.path.basename(path))
        return int(match.group(1)) if match else 0

    def _scratch_files(self, stem):
        """Return the scratch files whose name starts with ``stem``.

        The list is ordered by numeric page index, so page 2 comes before
        page 10, which a plain string sort would get wrong.
        """
        found = glob.glob(os.path.join(self.SCRATCH, "{}*".format(stem)))
        return sorted(found, key=lambda p: (self._page_number(p), p))

    def _convert(self, pdf, low, high, extension, options=()):
        """
        Run the converter on ``pdf`` and return the images it wrote.

        The output is written to SCRATCH under a random numeric stem taken
        from ``[low, high]``.  The converter's exit status is not inspected;
        callers decide based on whether any files were produced.
        """
        # Never reuse a stem that already has files in scratch, otherwise
        # leftovers of an earlier run would be returned as pages of this pdf.
        stem = random.randint(low, high)
        while self._scratch_files(stem):
            stem = random.randint(low, high)

        target = os.path.join(self.SCRATCH, "{}.{}".format(stem, extension))
        subprocess.call((self.CONVERT,) + tuple(options) + (pdf, target))

        return self._scratch_files(stem)

    def _get_greyscale(self, pdf):
        """Return paths of 300 dpi, 8 bit greyscale pngs rendered from pdf."""
        return self._convert(
            pdf, 1000000, 4999999, "png",
            ("-density", "300", "-depth", "8", "-type", "grayscale"))

    def _get_colour(self, pdf):
        """Return paths of jpgs rendered from pdf with default options."""
        return self._convert(pdf, 5000000, 9999999, "jpg")

    def _get_ocr(self, pngs):
        """
        Return the OCR text of all images in ``pngs``.

        ``pngs`` holds complete paths as returned by ``_get_greyscale``.  The
        text of every page is followed by eight newlines.
        """
        pages = []
        for png in pngs:
            with Image.open(png) as f:
                pages.append(self.OCR.image_to_string(f))

        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)

    def _store(self, text, jpgs, pdf):
        """
        Create the Document row and move its files below MEDIA_ROOT.

        Only the first jpg is kept.  Raises IndexError when ``jpgs`` is empty.
        """
        # Looked up first so an empty list fails before a row is written
        first_page = jpgs[0]

        doc = Document.objects.create(content=text)

        shutil.move(first_page, os.path.join(
            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
        shutil.move(pdf, os.path.join(
            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))

    def _cleanup(self, pngs, jpgs):
        """Delete the given scratch images, ignoring those already gone."""
        for path in list(jpgs) + list(pngs):
            try:
                os.unlink(path)
            except FileNotFoundError:
                # The first jpg has normally been moved away by _store()
                pass
|
27
src/documents/migrations/0001_initial.py
Normal file
27
src/documents/migrations/0001_initial.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9 on 2015-12-20 19:10
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
# Initial migration of the documents app: creates the Document model with
# the columns listed below.
class Migration(migrations.Migration):

    # Marks this as the first migration of the app
    initial = True

    # No other migration has to run before this one
    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Document',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('sender', models.CharField(blank=True, db_index=True, max_length=128)),
                ('title', models.CharField(blank=True, db_index=True, max_length=128)),
                # NOTE(review): an index on an unbounded text column may not
                # be supported by every database backend -- confirm
                ('content', models.TextField(db_index=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]
|
0
src/documents/migrations/__init__.py
Normal file
0
src/documents/migrations/__init__.py
Normal file
10
src/documents/models.py
Normal file
10
src/documents/models.py
Normal file
@ -0,0 +1,10 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Document(models.Model):
    """A stored document: its text content plus optional sender and title."""

    # Optional, indexed metadata
    sender = models.CharField(blank=True, db_index=True, max_length=128)
    title = models.CharField(blank=True, db_index=True, max_length=128)

    # Full text of the document (the consume command stores OCR output here)
    content = models.TextField(db_index=True)

    # Set once on insert / refreshed on every save
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
|
3
src/documents/tests.py
Normal file
3
src/documents/tests.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
3
src/documents/views.py
Normal file
3
src/documents/views.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
10
src/manage.py
Executable file
10
src/manage.py
Executable file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python
|
||||
import os
|
||||
import sys
|
||||
|
||||
def main():
    """Select the paperless settings by default and run the given command."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")

    # Imported only after the settings default above has been put in place
    from django.core.management import execute_from_command_line

    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()
|
0
src/paperless/__init__.py
Normal file
0
src/paperless/__init__.py
Normal file
137
src/paperless/settings.py
Normal file
137
src/paperless/settings.py
Normal file
@ -0,0 +1,137 @@
|
||||
"""
|
||||
Django settings for paperless project.
|
||||
|
||||
Generated by 'django-admin startproject' using Django 1.9.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/1.9/topics/settings/
|
||||
|
||||
For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/1.9/ref/settings/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
# BASE_DIR is the directory two levels above this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): this key is hard-coded in the file; a deployment should
# supply its own value.
SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',

    # Third-party
    "django_extensions",

    # Project apps
    "documents",
]

MIDDLEWARE_CLASSES = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'paperless.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'paperless.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.9/ref/settings/#databases

# A single SQLite file stored directly in BASE_DIR
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}


# Password validation
# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/1.9/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.9/howto/static-files/

# Both directories sit next to BASE_DIR, i.e. one level above it
STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")
MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media")

STATIC_URL = '/static/'
MEDIA_URL = "/media/"


# Paperless-specific settings
# Change these paths if yours are different

# Executable the consume command runs to turn pdfs into images
CONVERT_BINARY = "/usr/bin/convert"
SCRATCH_DIR = "/tmp/paperless"  # Will be created if it doesn't exist
# Directory the consume command scans for pdfs.
# NOTE(review): the consume command only creates the scratch and media
# directories, so this one apparently has to exist already -- confirm
CONSUMPTION_DIR = "/tmp/paperless/consume"
|
23
src/paperless/urls.py
Normal file
23
src/paperless/urls.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""paperless URL Configuration
|
||||
|
||||
The `urlpatterns` list routes URLs to views. For more information please see:
|
||||
https://docs.djangoproject.com/en/1.9/topics/http/urls/
|
||||
Examples:
|
||||
Function views
|
||||
1. Add an import: from my_app import views
|
||||
2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
|
||||
Class-based views
|
||||
1. Add an import: from other_app.views import Home
|
||||
2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
|
||||
Including another URLconf
|
||||
1. Add an import: from blog import urls as blog_urls
|
||||
2. Import the include() function: from django.conf.urls import url, include
|
||||
3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
|
||||
"""
|
||||
from django.conf import settings
|
||||
from django.conf.urls import url, static
|
||||
from django.contrib import admin
|
||||
|
||||
# The Django admin is the only routed application
urlpatterns = [
    url(r'^admin/', admin.site.urls),
]

# Additionally expose the files below MEDIA_ROOT at MEDIA_URL.
# NOTE(review): Django documents this helper as a development convenience
# -- confirm how media files are meant to be served in production
urlpatterns += static.static(
    settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
|
16
src/paperless/wsgi.py
Normal file
16
src/paperless/wsgi.py
Normal file
@ -0,0 +1,16 @@
|
||||
"""
|
||||
WSGI config for paperless project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
# Use the paperless settings unless DJANGO_SETTINGS_MODULE is already set
# in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")

# Module-level WSGI callable, created after the settings default is in place
application = get_wsgi_application()
|
Loading…
x
Reference in New Issue
Block a user