mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	It works!
This commit is contained in:
		
							
								
								
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -55,3 +55,15 @@ docs/_build/ | |||||||
|  |  | ||||||
| # PyBuilder | # PyBuilder | ||||||
| target/ | target/ | ||||||
|  |  | ||||||
|  | # Stored PDFs & JPGs | ||||||
|  | media/* | ||||||
|  |  | ||||||
|  | # Sqlite database | ||||||
|  | db.sqlite3 | ||||||
|  |  | ||||||
|  | # PyCharm | ||||||
|  | .idea | ||||||
|  |  | ||||||
|  | # Fixtures | ||||||
|  | src/paperless/fixtures/ | ||||||
|   | |||||||
							
								
								
									
										0
									
								
								src/documents/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										24
									
								
								src/documents/admin.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								src/documents/admin.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | |||||||
|  | from django.conf import settings | ||||||
|  | from django.contrib import admin | ||||||
|  |  | ||||||
|  | from .models import Document | ||||||
|  |  | ||||||
|  |  | ||||||
@admin.register(Document)
class DocumentAdmin(admin.ModelAdmin):
    """
    Admin configuration for ``Document``.

    The change list shows the creation date, sender and title of each
    document together with two computed columns: a thumbnail image and a
    link to the stored pdf.  Both are built from ``settings.MEDIA_URL`` and
    the document's zero-padded primary key.
    """

    save_on_top = True
    list_filter = ("created", "sender")
    list_display = ("created", "sender", "title", "thumbnail", "pdf")
    search_fields = ("sender", "title", "content",)

    def thumbnail(self, obj):
        """Return an ``<img>`` tag pointing at the jpg stored for ``obj``."""
        markup = '<img src="{}documents/img/{:07}.jpg" width="100" />'
        return markup.format(settings.MEDIA_URL, obj.pk)
    # Tell the admin to render the returned string as HTML rather than text.
    thumbnail.allow_tags = True

    def pdf(self, obj):
        """Return a download link pointing at the pdf stored for ``obj``."""
        markup = '<a href="{}documents/pdf/{:07}.pdf">Download</a>'
        return markup.format(settings.MEDIA_URL, obj.pk)
    # Tell the admin to render the returned string as HTML rather than text.
    pdf.allow_tags = True
							
								
								
									
										5
									
								
								src/documents/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								src/documents/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | |||||||
|  | from django.apps import AppConfig | ||||||
|  |  | ||||||
|  |  | ||||||
class DocumentsConfig(AppConfig):
    """Django application configuration for the ``documents`` app."""

    name = "documents"
							
								
								
									
										0
									
								
								src/documents/management/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/management/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								src/documents/management/commands/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/management/commands/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										122
									
								
								src/documents/management/commands/consume.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								src/documents/management/commands/consume.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,122 @@ | |||||||
|  | import glob | ||||||
|  | import os | ||||||
|  | import random | ||||||
|  | import re | ||||||
|  | import shutil | ||||||
|  | import subprocess | ||||||
|  |  | ||||||
|  | import pyocr | ||||||
|  |  | ||||||
|  | from PIL import Image | ||||||
|  |  | ||||||
|  | from django.conf import settings | ||||||
|  | from django.core.management.base import BaseCommand | ||||||
|  |  | ||||||
|  | from documents.models import Document | ||||||
|  |  | ||||||
|  |  | ||||||
class Command(BaseCommand):
    """
    Loop over every ``.pdf`` file found in CONSUMPTION_DIR and:
      1. Convert it to greyscale png(s) in SCRATCH_DIR
      2. Convert it to full-colour jpg(s) in SCRATCH_DIR
      3. Run the OCR tool over the png(s)
      4. Store the OCR'd text in the database and move the first jpg and
         the original pdf into MEDIA_ROOT, named after the new document's pk
      5. Delete the images left behind in SCRATCH_DIR

    A pdf for which ``convert`` produces no images is reported on stderr and
    left in CONSUMPTION_DIR untouched.
    """

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # First tool pyocr reports as available.  Evaluated when the class body
    # runs, so an empty tool list raises IndexError at import time.
    OCR = pyocr.get_available_tools()[0]

    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """Consume every pdf currently sitting in CONSUMPTION_DIR."""

        self.verbosity = options["verbosity"]

        self._setup()

        for pdf in os.listdir(self.CONSUME):

            if not os.path.isfile(os.path.join(self.CONSUME, pdf)):
                continue

            if not pdf.endswith(".pdf"):
                continue

            if self.verbosity > 1:
                self.stdout.write("Consuming {}".format(pdf))

            pdf = os.path.join(self.CONSUME, pdf)
            pngs = self._get_greyscale(pdf)
            jpgs = self._get_colour(pdf)

            if not pngs or not jpgs:
                # Nothing to OCR or no thumbnail to store: skip before a
                # database row is created, and drop any partial output.
                self.stderr.write(
                    "No images could be produced from {}; skipping".format(
                        pdf))
                self._cleanup(pngs, jpgs)
                continue

            text = self._get_ocr(pngs)

            self._store(text, jpgs, pdf)
            self._cleanup(pngs, jpgs)

    def _setup(self):
        """Create the scratch and media directories if they are missing."""
        for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    def _convert(self, pdf, target, *options):
        """
        Run the convert binary on ``pdf`` and return the files it produced.

        ``options`` are placed before the input path on the command line.
        Output is looked up by globbing on ``target`` minus its extension,
        because convert may write more than one file for a single input.
        The returned list is sorted and may be empty.
        """
        status = subprocess.call((self.CONVERT,) + options + (pdf, target))
        if status != 0:
            # Reported rather than raised: whether anything usable was
            # written is decided by the caller from the returned list.
            self.stderr.write("{} exited with status {} for {}".format(
                self.CONVERT, status, pdf))

        stem = os.path.splitext(target)[0]
        return sorted(glob.glob("{}*".format(stem)))

    def _get_greyscale(self, pdf):
        """Render ``pdf`` to 300dpi, 8-bit greyscale png(s) for OCR."""

        i = random.randint(1000000, 4999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        return self._convert(
            pdf, png,
            "-density", "300", "-depth", "8", "-type", "grayscale")

    def _get_colour(self, pdf):
        """Render ``pdf`` to jpg(s) using convert's default options."""

        # A different number range from _get_greyscale keeps the two sets of
        # scratch files from sharing a name prefix.
        i = random.randint(5000000, 9999999)
        jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))

        return self._convert(pdf, jpg)

    def _get_ocr(self, pngs):
        """
        Return the OCR'd text of ``pngs``, in order.

        Each image's text is followed by eight newlines.  ``pngs`` holds the
        paths returned by ``_get_greyscale`` and they are opened as given;
        joining them onto SCRATCH again would double the directory prefix
        whenever SCRATCH_DIR is a relative path.
        """

        pages = []
        for png in pngs:
            with Image.open(png) as f:
                pages.append(self.OCR.image_to_string(f))

        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)

    def _store(self, text, jpgs, pdf):
        """
        Create a ``Document`` holding ``text`` and file its images.

        The first entry of ``jpgs`` (which must not be empty) and the
        original ``pdf`` are moved into MEDIA_IMG and MEDIA_PDF, both named
        after the new document's zero-padded primary key.
        """

        doc = Document.objects.create(content=text)

        shutil.move(jpgs[0], os.path.join(
            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
        shutil.move(pdf, os.path.join(
            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))

    def _cleanup(self, pngs, jpgs):
        """Delete whichever of the given scratch images still exist."""

        for f in list(jpgs) + list(pngs):
            try:
                os.unlink(f)
            except FileNotFoundError:
                # Expected for the jpg that _store moved into MEDIA_IMG.
                pass
							
								
								
									
										27
									
								
								src/documents/migrations/0001_initial.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								src/documents/migrations/0001_initial.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | # -*- coding: utf-8 -*- | ||||||
|  | # Generated by Django 1.9 on 2015-12-20 19:10 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  |  | ||||||
|  | from django.db import migrations, models | ||||||
|  |  | ||||||
|  |  | ||||||
class Migration(migrations.Migration):
    """Initial migration: creates the ``Document`` model."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Document",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "sender",
                    models.CharField(
                        blank=True, db_index=True, max_length=128),
                ),
                (
                    "title",
                    models.CharField(
                        blank=True, db_index=True, max_length=128),
                ),
                ("content", models.TextField(db_index=True)),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("modified", models.DateTimeField(auto_now=True)),
            ],
        ),
    ]
							
								
								
									
										0
									
								
								src/documents/migrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/migrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										10
									
								
								src/documents/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								src/documents/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | |||||||
|  | from django.db import models | ||||||
|  |  | ||||||
|  |  | ||||||
class Document(models.Model):
    """
    A stored document: optional sender and title, its text content, and
    creation / modification timestamps.
    """

    # Optional, indexed, at most 128 characters each.
    sender = models.CharField(blank=True, db_index=True, max_length=128)
    title = models.CharField(blank=True, db_index=True, max_length=128)

    # Indexed text body.
    # NOTE(review): presumably the OCR output written by the consume
    # command - confirm no other writer exists.
    content = models.TextField(db_index=True)

    # Set on insert and on every save respectively.
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
							
								
								
									
										3
									
								
								src/documents/tests.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/documents/tests.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | from django.test import TestCase | ||||||
|  |  | ||||||
|  | # Create your tests here. | ||||||
							
								
								
									
										3
									
								
								src/documents/views.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/documents/views.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | from django.shortcuts import render | ||||||
|  |  | ||||||
|  | # Create your views here. | ||||||
							
								
								
									
										10
									
								
								src/manage.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										10
									
								
								src/manage.py
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,10 @@ | |||||||
|  | #!/usr/bin/env python | ||||||
|  | import os | ||||||
|  | import sys | ||||||
|  |  | ||||||
def main():
    """Select the project settings if none are set, then run the command line."""
    if "DJANGO_SETTINGS_MODULE" not in os.environ:
        os.environ["DJANGO_SETTINGS_MODULE"] = "paperless.settings"

    # Imported only after the settings module has been chosen.
    from django.core.management import execute_from_command_line

    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()
							
								
								
									
										0
									
								
								src/paperless/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										137
									
								
								src/paperless/settings.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										137
									
								
								src/paperless/settings.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,137 @@ | |||||||
|  | """ | ||||||
|  | Django settings for paperless project. | ||||||
|  |  | ||||||
|  | Generated by 'django-admin startproject' using Django 1.9. | ||||||
|  |  | ||||||
|  | For more information on this file, see | ||||||
|  | https://docs.djangoproject.com/en/1.9/topics/settings/ | ||||||
|  |  | ||||||
|  | For the full list of settings and their values, see | ||||||
|  | https://docs.djangoproject.com/en/1.9/ref/settings/ | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | import os | ||||||
|  |  | ||||||
# Two directory levels above this file.
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(
    os.path.dirname(
        os.path.abspath(__file__)
    )
)


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee"

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []
|  |  | ||||||
|  |  | ||||||
# Application definition

INSTALLED_APPS = [
    # Django's own apps
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",

    # Third-party
    "django_extensions",

    # This project
    "documents",
]

MIDDLEWARE_CLASSES = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.auth.middleware.SessionAuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "paperless.urls"

TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "paperless.wsgi.application"
|  |  | ||||||
|  |  | ||||||
# Database
# https://docs.djangoproject.com/en/1.9/ref/settings/#databases

# A single SQLite database file stored directly under BASE_DIR.
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": os.path.join(BASE_DIR, "db.sqlite3"),
    },
}
|  |  | ||||||
|  |  | ||||||
# Password validation
# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"},
    {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
    {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
    {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
]
|  |  | ||||||
|  |  | ||||||
# Internationalization
# https://docs.djangoproject.com/en/1.9/topics/i18n/

LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"

USE_I18N = True
USE_L10N = True
USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.9/howto/static-files/

# Both roots are siblings of BASE_DIR, one directory level above it.
STATIC_URL = "/static/"
STATIC_ROOT = os.path.join(BASE_DIR, "..", "static")

MEDIA_URL = "/media/"
MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media")


# Paperless-specific stuffs
# Change these paths if yours are different

CONVERT_BINARY = "/usr/bin/convert"
SCRATCH_DIR = "/tmp/paperless"  # Will be created if it doesn't exist
CONSUMPTION_DIR = "/tmp/paperless/consume"
							
								
								
									
										23
									
								
								src/paperless/urls.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless/urls.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
|  | """paperless URL Configuration | ||||||
|  |  | ||||||
|  | The `urlpatterns` list routes URLs to views. For more information please see: | ||||||
|  |     https://docs.djangoproject.com/en/1.9/topics/http/urls/ | ||||||
|  | Examples: | ||||||
|  | Function views | ||||||
|  |     1. Add an import:  from my_app import views | ||||||
|  |     2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home') | ||||||
|  | Class-based views | ||||||
|  |     1. Add an import:  from other_app.views import Home | ||||||
|  |     2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home') | ||||||
|  | Including another URLconf | ||||||
|  |     1. Add an import:  from blog import urls as blog_urls | ||||||
|  |     2. Import the include() function: from django.conf.urls import url, include | ||||||
|  |     3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls)) | ||||||
|  | """ | ||||||
|  | from django.conf import settings | ||||||
|  | from django.conf.urls import url, static | ||||||
|  | from django.contrib import admin | ||||||
|  |  | ||||||
# The admin site is the only application route.
urlpatterns = [url(r'^admin/', admin.site.urls)]

# Append the patterns returned by static() for MEDIA_URL, backed by the
# files under MEDIA_ROOT.
urlpatterns += static.static(
    settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
							
								
								
									
										16
									
								
								src/paperless/wsgi.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless/wsgi.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | """ | ||||||
|  | WSGI config for paperless project. | ||||||
|  |  | ||||||
|  | It exposes the WSGI callable as a module-level variable named ``application``. | ||||||
|  |  | ||||||
|  | For more information on this file, see | ||||||
|  | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ | ||||||
|  | """ | ||||||
|  |  | ||||||
|  | import os | ||||||
|  |  | ||||||
|  | from django.core.wsgi import get_wsgi_application | ||||||
|  |  | ||||||
# Fall back to the project settings only when the environment has not
# already selected a settings module.
if "DJANGO_SETTINGS_MODULE" not in os.environ:
    os.environ["DJANGO_SETTINGS_MODULE"] = "paperless.settings"

# Module-level WSGI callable.
application = get_wsgi_application()
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn