mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	It works!
This commit is contained in:
		
							
								
								
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -55,3 +55,15 @@ docs/_build/ | ||||
|  | ||||
| # PyBuilder | ||||
| target/ | ||||
|  | ||||
| # Stored PDFs & JPGs | ||||
| media/* | ||||
|  | ||||
| # Sqlite database | ||||
| db.sqlite3 | ||||
|  | ||||
| # PyCharm | ||||
| .idea | ||||
|  | ||||
| # Fixtures | ||||
| src/paperless/fixtures/ | ||||
|   | ||||
							
								
								
									
										0
									
								
								src/documents/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										24
									
								
								src/documents/admin.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								src/documents/admin.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| from django.conf import settings | ||||
| from django.contrib import admin | ||||
| from django.utils.html import format_html | ||||
|  | ||||
| from .models import Document | ||||
|  | ||||
|  | ||||
| class DocumentAdmin(admin.ModelAdmin): | ||||
|  | ||||
|     search_fields = ("sender", "title", "content",) | ||||
|     list_display = ("created", "sender", "title", "thumbnail", "pdf") | ||||
|     list_filter = ("created", "sender") | ||||
|     save_on_top = True | ||||
|  | ||||
|     def thumbnail(self, obj): | ||||
|         return '<img src="{}documents/img/{:07}.jpg" width="100" />'.format( | ||||
|             settings.MEDIA_URL, obj.pk) | ||||
|     thumbnail.allow_tags = True | ||||
|  | ||||
|     def pdf(self, obj): | ||||
|         return '<a href="{}documents/pdf/{:07}.pdf">Download</a>'.format( | ||||
|             settings.MEDIA_URL, obj.pk) | ||||
|     pdf.allow_tags = True | ||||
|  | ||||
| admin.site.register(Document, DocumentAdmin) | ||||
							
								
								
									
										5
									
								
								src/documents/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								src/documents/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
|  | ||||
| class DocumentsConfig(AppConfig): | ||||
|     """Django application configuration for the ``documents`` app.""" | ||||
|     name = 'documents' | ||||
							
								
								
									
										0
									
								
								src/documents/management/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/management/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								src/documents/management/commands/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/management/commands/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										122
									
								
								src/documents/management/commands/consume.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								src/documents/management/commands/consume.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,122 @@ | ||||
| import glob | ||||
| import os | ||||
| import random | ||||
| import re | ||||
| import shutil | ||||
| import subprocess | ||||
|  | ||||
| import pyocr | ||||
|  | ||||
| from PIL import Image | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
|  | ||||
| from documents.models import Document | ||||
|  | ||||
|  | ||||
| class Command(BaseCommand): | ||||
|     """ | ||||
|     Loop over every file found in CONSUMPTION_DIR and: | ||||
|       1. Convert it to a greyscale tif | ||||
|       2. Convert it to a full-colour jpg | ||||
|       3. Use tesseract on the tif | ||||
|       4. Store the OCR'd text in the database along with the paths to the jpg | ||||
|          and original pdf | ||||
|       5. Delete the pdf and images | ||||
|     """ | ||||
|  | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|  | ||||
|     MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img") | ||||
|     MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         self.verbosity = 0 | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|  | ||||
|         self._setup() | ||||
|  | ||||
|         for pdf in os.listdir(self.CONSUME): | ||||
|  | ||||
|             if not os.path.isfile(os.path.join(self.CONSUME, pdf)): | ||||
|                 continue | ||||
|  | ||||
|             if not pdf.endswith(".pdf"): | ||||
|                 continue | ||||
|  | ||||
|             if self.verbosity > 1: | ||||
|                 print("Consuming {}".format(pdf)) | ||||
|  | ||||
|             pdf = os.path.join(self.CONSUME, pdf) | ||||
|             pngs = self._get_greyscale(pdf) | ||||
|             jpgs = self._get_colour(pdf) | ||||
|             text = self._get_ocr(pngs) | ||||
|  | ||||
|             self._store(text, jpgs, pdf) | ||||
|             self._cleanup(pngs, jpgs) | ||||
|  | ||||
|     def _setup(self): | ||||
|         for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF): | ||||
|             try: | ||||
|                 os.makedirs(d) | ||||
|             except FileExistsError: | ||||
|                 pass | ||||
|  | ||||
|     def _get_greyscale(self, pdf): | ||||
|  | ||||
|         i = random.randint(1000000, 4999999) | ||||
|         png = os.path.join(self.SCRATCH, "{}.png".format(i)) | ||||
|  | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", pdf, png | ||||
|         )).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_colour(self, pdf): | ||||
|  | ||||
|         i = random.randint(5000000, 9999999) | ||||
|         jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i)) | ||||
|  | ||||
|         subprocess.Popen((self.CONVERT, pdf, jpg)).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_ocr(self, pngs): | ||||
|  | ||||
|         r = "" | ||||
|         for png in pngs: | ||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|                 r += self.OCR.image_to_string(f) | ||||
|                 r += "\n\n\n\n\n\n\n\n" | ||||
|  | ||||
|         return r | ||||
|  | ||||
|     def _store(self, text, jpgs, pdf): | ||||
|  | ||||
|         doc = Document.objects.create(content=text) | ||||
|  | ||||
|         shutil.move(jpgs[0], os.path.join( | ||||
|             self.MEDIA_IMG, "{:07}.jpg".format(doc.pk))) | ||||
|         shutil.move(pdf, os.path.join( | ||||
|             self.MEDIA_PDF, "{:07}.pdf".format(doc.pk))) | ||||
|  | ||||
|     def _cleanup(self, pngs, jpgs): | ||||
|  | ||||
|         jpg_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0])) | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)): | ||||
|             os.unlink(f) | ||||
							
								
								
									
										27
									
								
								src/documents/migrations/0001_initial.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								src/documents/migrations/0001_initial.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| # Generated by Django 1.9 on 2015-12-20 19:10 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| from django.db import migrations, models | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|     """Initial migration: creates the table backing the Document model.""" | ||||
|  | ||||
|     initial = True | ||||
|  | ||||
|     dependencies = [ | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.CreateModel( | ||||
|             name='Document', | ||||
|             fields=[ | ||||
|                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), | ||||
|                 ('sender', models.CharField(blank=True, db_index=True, max_length=128)), | ||||
|                 ('title', models.CharField(blank=True, db_index=True, max_length=128)), | ||||
|                 ('content', models.TextField(db_index=True)), | ||||
|                 ('created', models.DateTimeField(auto_now_add=True)), | ||||
|                 ('modified', models.DateTimeField(auto_now=True)), | ||||
|             ], | ||||
|         ), | ||||
|     ] | ||||
							
								
								
									
										0
									
								
								src/documents/migrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/documents/migrations/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										10
									
								
								src/documents/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								src/documents/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| from django.db import models | ||||
|  | ||||
|  | ||||
| class Document(models.Model): | ||||
|  | ||||
|     sender = models.CharField(max_length=128, blank=True, db_index=True) | ||||
|     title = models.CharField(max_length=128, blank=True, db_index=True) | ||||
|     content = models.TextField(db_index=True) | ||||
|     created = models.DateTimeField(auto_now_add=True) | ||||
|     modified = models.DateTimeField(auto_now=True) | ||||
							
								
								
									
										3
									
								
								src/documents/tests.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/documents/tests.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| from django.test import TestCase | ||||
|  | ||||
| # Create your tests here. | ||||
							
								
								
									
										3
									
								
								src/documents/views.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/documents/views.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| from django.shortcuts import render | ||||
|  | ||||
| # Create your views here. | ||||
							
								
								
									
										10
									
								
								src/manage.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										10
									
								
								src/manage.py
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| #!/usr/bin/env python | ||||
| import os | ||||
| import sys | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     # Fall back to the project settings unless the caller already exported | ||||
|     # DJANGO_SETTINGS_MODULE. | ||||
|     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings") | ||||
|  | ||||
|     # Imported only after the settings module has been selected above. | ||||
|     from django.core.management import execute_from_command_line | ||||
|  | ||||
|     # Hand the raw command line to Django's management command dispatcher. | ||||
|     execute_from_command_line(sys.argv) | ||||
							
								
								
									
										0
									
								
								src/paperless/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										137
									
								
								src/paperless/settings.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										137
									
								
								src/paperless/settings.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,137 @@ | ||||
| """ | ||||
| Django settings for paperless project. | ||||
|  | ||||
| Generated by 'django-admin startproject' using Django 1.9. | ||||
|  | ||||
| For more information on this file, see | ||||
| https://docs.djangoproject.com/en/1.9/topics/settings/ | ||||
|  | ||||
| For the full list of settings and their values, see | ||||
| https://docs.djangoproject.com/en/1.9/ref/settings/ | ||||
| """ | ||||
|  | ||||
| import os | ||||
|  | ||||
| # Build paths inside the project like this: os.path.join(BASE_DIR, ...) | ||||
| # BASE_DIR is the parent of the directory that contains this settings file. | ||||
| BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||||
|  | ||||
|  | ||||
| # Quick-start development settings - unsuitable for production | ||||
| # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ | ||||
|  | ||||
| # SECURITY WARNING: keep the secret key used in production secret! | ||||
| # NOTE(review): this key is hard-coded here, so it is only suitable for | ||||
| # development; replace it for any real deployment. | ||||
| SECRET_KEY = 'e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee' | ||||
|  | ||||
| # SECURITY WARNING: don't run with debug turned on in production! | ||||
| DEBUG = True | ||||
|  | ||||
| ALLOWED_HOSTS = [] | ||||
|  | ||||
|  | ||||
| # Application definition | ||||
|  | ||||
| # Django's contrib apps first, then django_extensions, then the project's | ||||
| # own ``documents`` app. | ||||
| INSTALLED_APPS = [ | ||||
|     'django.contrib.admin', | ||||
|     'django.contrib.auth', | ||||
|     'django.contrib.contenttypes', | ||||
|     'django.contrib.sessions', | ||||
|     'django.contrib.messages', | ||||
|     'django.contrib.staticfiles', | ||||
|  | ||||
|     "django_extensions", | ||||
|  | ||||
|     "documents", | ||||
| ] | ||||
|  | ||||
| MIDDLEWARE_CLASSES = [ | ||||
|     'django.middleware.security.SecurityMiddleware', | ||||
|     'django.contrib.sessions.middleware.SessionMiddleware', | ||||
|     'django.middleware.common.CommonMiddleware', | ||||
|     'django.middleware.csrf.CsrfViewMiddleware', | ||||
|     'django.contrib.auth.middleware.AuthenticationMiddleware', | ||||
|     'django.contrib.auth.middleware.SessionAuthenticationMiddleware', | ||||
|     'django.contrib.messages.middleware.MessageMiddleware', | ||||
|     'django.middleware.clickjacking.XFrameOptionsMiddleware', | ||||
| ] | ||||
|  | ||||
| ROOT_URLCONF = 'paperless.urls' | ||||
|  | ||||
| # No project-level template directories; templates are looked up inside the | ||||
| # installed apps (APP_DIRS). | ||||
| TEMPLATES = [ | ||||
|     { | ||||
|         'BACKEND': 'django.template.backends.django.DjangoTemplates', | ||||
|         'DIRS': [], | ||||
|         'APP_DIRS': True, | ||||
|         'OPTIONS': { | ||||
|             'context_processors': [ | ||||
|                 'django.template.context_processors.debug', | ||||
|                 'django.template.context_processors.request', | ||||
|                 'django.contrib.auth.context_processors.auth', | ||||
|                 'django.contrib.messages.context_processors.messages', | ||||
|             ], | ||||
|         }, | ||||
|     }, | ||||
| ] | ||||
|  | ||||
| WSGI_APPLICATION = 'paperless.wsgi.application' | ||||
|  | ||||
|  | ||||
| # Database | ||||
| # https://docs.djangoproject.com/en/1.9/ref/settings/#databases | ||||
|  | ||||
| # Single SQLite database stored as ``db.sqlite3`` directly inside BASE_DIR. | ||||
| DATABASES = { | ||||
|     'default': { | ||||
|         'ENGINE': 'django.db.backends.sqlite3', | ||||
|         'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| # Password validation | ||||
| # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators | ||||
|  | ||||
| AUTH_PASSWORD_VALIDATORS = [ | ||||
|     { | ||||
|         'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', | ||||
|     }, | ||||
|     { | ||||
|         'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', | ||||
|     }, | ||||
|     { | ||||
|         'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', | ||||
|     }, | ||||
|     { | ||||
|         'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', | ||||
|     }, | ||||
| ] | ||||
|  | ||||
|  | ||||
| # Internationalization | ||||
| # https://docs.djangoproject.com/en/1.9/topics/i18n/ | ||||
|  | ||||
| LANGUAGE_CODE = 'en-us' | ||||
|  | ||||
| TIME_ZONE = 'UTC' | ||||
|  | ||||
| USE_I18N = True | ||||
|  | ||||
| USE_L10N = True | ||||
|  | ||||
| USE_TZ = True | ||||
|  | ||||
|  | ||||
| # Static files (CSS, JavaScript, Images) | ||||
| # https://docs.djangoproject.com/en/1.9/howto/static-files/ | ||||
|  | ||||
| # Both roots sit one directory above BASE_DIR, next to it rather than | ||||
| # inside it. | ||||
| STATIC_ROOT = os.path.join(BASE_DIR, "..", "static") | ||||
| MEDIA_ROOT = os.path.join(BASE_DIR, "..", "media") | ||||
|  | ||||
| STATIC_URL = '/static/' | ||||
| MEDIA_URL = "/media/" | ||||
|  | ||||
|  | ||||
| # Paperless-specific stuffs | ||||
| # Change these paths if yours are different | ||||
|  | ||||
| # Path of the converter executable run by the ``consume`` command. | ||||
| CONVERT_BINARY = "/usr/bin/convert" | ||||
| SCRATCH_DIR = "/tmp/paperless"  # Will be created if it doesn't exist | ||||
| # Directory the ``consume`` command lists for ``*.pdf`` files. | ||||
| CONSUMPTION_DIR = "/tmp/paperless/consume" | ||||
							
								
								
									
										23
									
								
								src/paperless/urls.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless/urls.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| """paperless URL Configuration | ||||
|  | ||||
| The `urlpatterns` list routes URLs to views. For more information please see: | ||||
|     https://docs.djangoproject.com/en/1.9/topics/http/urls/ | ||||
| Examples: | ||||
| Function views | ||||
|     1. Add an import:  from my_app import views | ||||
|     2. Add a URL to urlpatterns:  url(r'^$', views.home, name='home') | ||||
| Class-based views | ||||
|     1. Add an import:  from other_app.views import Home | ||||
|     2. Add a URL to urlpatterns:  url(r'^$', Home.as_view(), name='home') | ||||
| Including another URLconf | ||||
|     1. Add an import:  from blog import urls as blog_urls | ||||
|     2. Import the include() function: from django.conf.urls import url, include | ||||
|     3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls)) | ||||
| """ | ||||
| from django.conf import settings | ||||
| from django.conf.urls import url, static | ||||
| from django.contrib import admin | ||||
|  | ||||
| # Admin site plus routes that serve files from MEDIA_ROOT under MEDIA_URL. | ||||
| # NOTE(review): Django documents the static() helper as a development aid | ||||
| # that adds no routes when DEBUG is off - confirm how media is served in | ||||
| # production. | ||||
| urlpatterns = [ | ||||
|     url(r'^admin/', admin.site.urls), | ||||
| ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) | ||||
							
								
								
									
										16
									
								
								src/paperless/wsgi.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless/wsgi.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| """ | ||||
| WSGI config for paperless project. | ||||
|  | ||||
| It exposes the WSGI callable as a module-level variable named ``application``. | ||||
|  | ||||
| For more information on this file, see | ||||
| https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ | ||||
| """ | ||||
|  | ||||
| import os | ||||
|  | ||||
| from django.core.wsgi import get_wsgi_application | ||||
|  | ||||
| # Fall back to the project settings unless DJANGO_SETTINGS_MODULE is | ||||
| # already set in the environment. | ||||
| os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings") | ||||
|  | ||||
| # Module-level WSGI callable (see the module docstring). | ||||
| application = get_wsgi_application() | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn