mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	#12: Support image documents
This commit is contained in:
		| @@ -34,7 +34,7 @@ class MonthListFilter(admin.SimpleListFilter): | ||||
| class DocumentAdmin(admin.ModelAdmin): | ||||
|  | ||||
|     search_fields = ("sender__name", "title", "content",) | ||||
|     list_display = ("edit", "created", "sender", "title", "tags_", "pdf") | ||||
|     list_display = ("edit", "created", "sender", "title", "tags_", "document") | ||||
|     list_filter = (MonthListFilter, "tags", "sender") | ||||
|     list_editable = ("sender", "title",) | ||||
|     list_per_page = 25 | ||||
| @@ -44,14 +44,14 @@ class DocumentAdmin(admin.ModelAdmin): | ||||
|             static("documents/img/edit.png")) | ||||
|     edit.allow_tags = True | ||||
|  | ||||
|     def pdf(self, obj): | ||||
|     def document(self, obj): | ||||
|         return '<a href="{}">' \ | ||||
|                  '<img src="{}" width="22" height="22" alt="PDF icon">' \ | ||||
|                '</a>'.format( | ||||
|                     reverse("fetch", kwargs={"pk": obj.pk}), | ||||
|                     static("documents/img/application-pdf.png") | ||||
|                 ) | ||||
|     pdf.allow_tags = True | ||||
|     document.allow_tags = True | ||||
|  | ||||
|     def tags_(self, obj): | ||||
|         r = "" | ||||
|   | ||||
| @@ -31,9 +31,9 @@ class Command(BaseCommand): | ||||
|     Loop over every file found in CONSUMPTION_DIR and: | ||||
|       1. Convert it to a greyscale png | ||||
|       2. Use tesseract on the png | ||||
|       3. Encrypt and store the PDF in the MEDIA_ROOT | ||||
|       3. Encrypt and store the document in the MEDIA_ROOT | ||||
|       4. Store the OCR'd text in the database | ||||
|       5. Delete the pdf and image(s) | ||||
|       5. Delete the document and image(s) | ||||
|     """ | ||||
|  | ||||
|     LOOP_TIME = 10  # Seconds | ||||
| @@ -44,10 +44,12 @@ class Command(BaseCommand): | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") | ||||
|     MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents") | ||||
|  | ||||
|     PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") | ||||
|     PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$") | ||||
|     PARSER_REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE) | ||||
|     PARSER_REGEX_SENDER_TITLE = re.compile( | ||||
|         r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
| @@ -74,35 +76,35 @@ class Command(BaseCommand): | ||||
|  | ||||
|     def loop(self): | ||||
|  | ||||
|         for pdf in os.listdir(self.CONSUME): | ||||
|         for doc in os.listdir(self.CONSUME): | ||||
|  | ||||
|             pdf = os.path.join(self.CONSUME, pdf) | ||||
|             doc = os.path.join(self.CONSUME, doc) | ||||
|  | ||||
|             if not os.path.isfile(pdf): | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
|  | ||||
|             if not re.match(self.PARSER_REGEX_TITLE, pdf): | ||||
|             if not re.match(self.PARSER_REGEX_TITLE, doc): | ||||
|                 continue | ||||
|  | ||||
|             if pdf in self._ignore: | ||||
|             if doc in self._ignore: | ||||
|                 continue | ||||
|  | ||||
|             if self._is_ready(pdf): | ||||
|             if self._is_ready(doc): | ||||
|                 continue | ||||
|  | ||||
|             self._render("Consuming {}".format(pdf), 1) | ||||
|             self._render("Consuming {}".format(doc), 1) | ||||
|  | ||||
|             pngs = self._get_greyscale(pdf) | ||||
|             pngs = self._get_greyscale(doc) | ||||
|  | ||||
|             try: | ||||
|                 text = self._get_ocr(pngs) | ||||
|             except OCRError: | ||||
|                 self._ignore.append(pdf) | ||||
|                 self._render("OCR FAILURE: {}".format(pdf), 0) | ||||
|                 self._ignore.append(doc) | ||||
|                 self._render("OCR FAILURE: {}".format(doc), 0) | ||||
|                 continue | ||||
|  | ||||
|             self._store(text, pdf) | ||||
|             self._cleanup(pngs, pdf) | ||||
|             self._store(text, doc) | ||||
|             self._cleanup(pngs, doc) | ||||
|  | ||||
|     def _setup(self): | ||||
|  | ||||
| @@ -116,29 +118,29 @@ class Command(BaseCommand): | ||||
|             raise CommandError("Consumption directory {} does not exist".format( | ||||
|                 self.CONSUME)) | ||||
|  | ||||
|         for d in (self.SCRATCH, self.MEDIA_PDF): | ||||
|         for d in (self.SCRATCH, self.MEDIA_DOCS): | ||||
|             try: | ||||
|                 os.makedirs(d) | ||||
|             except FileExistsError: | ||||
|                 pass | ||||
|  | ||||
|     def _is_ready(self, pdf): | ||||
|     def _is_ready(self, doc): | ||||
|         """ | ||||
|         Detect whether `pdf` is ready to consume or if it's still being written | ||||
|         Detect whether `doc` is ready to consume or if it's still being written | ||||
|         to by the scanner. | ||||
|         """ | ||||
|  | ||||
|         t = os.stat(pdf).st_mtime | ||||
|         t = os.stat(doc).st_mtime | ||||
|  | ||||
|         if self.stats.get(pdf) == t: | ||||
|             del(self.stats[pdf]) | ||||
|         if self.stats.get(doc) == t: | ||||
|             del(self.stats[doc]) | ||||
|             return True | ||||
|  | ||||
|         self.stats[pdf] = t | ||||
|         self.stats[doc] = t | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _get_greyscale(self, pdf): | ||||
|     def _get_greyscale(self, doc): | ||||
|  | ||||
|         self._render("  Generating greyscale image", 2) | ||||
|  | ||||
| @@ -147,14 +149,14 @@ class Command(BaseCommand): | ||||
|  | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", pdf, png | ||||
|             "-type", "grayscale", doc, png | ||||
|         )).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_ocr(self, pngs): | ||||
|  | ||||
|         self._render("  OCRing the PDF", 2) | ||||
|         self._render("  OCRing the document", 2) | ||||
|  | ||||
|         raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
| @@ -203,19 +205,22 @@ class Command(BaseCommand): | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|  | ||||
|     def _store(self, text, pdf): | ||||
|     def _store(self, text, doc): | ||||
|  | ||||
|         sender, title = self._parse_file_name(pdf) | ||||
|         relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())] | ||||
|         sender, title, file_type = self._parse_file_name(doc) | ||||
|  | ||||
|         stats = os.stat(pdf) | ||||
|         lower_text = text.lower() | ||||
|         relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)] | ||||
|  | ||||
|         stats = os.stat(doc) | ||||
|  | ||||
|         self._render("  Saving record to database", 2) | ||||
|  | ||||
|         doc = Document.objects.create( | ||||
|         document = Document.objects.create( | ||||
|             sender=sender, | ||||
|             title=title, | ||||
|             content=text, | ||||
|             file_type=file_type, | ||||
|             created=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||
|             modified=timezone.make_aware( | ||||
| @@ -225,38 +230,38 @@ class Command(BaseCommand): | ||||
|         if relevant_tags: | ||||
|             tag_names = ", ".join([t.slug for t in relevant_tags]) | ||||
|             self._render("    Tagging with {}".format(tag_names), 2) | ||||
|             doc.tags.add(*relevant_tags) | ||||
|             document.tags.add(*relevant_tags) | ||||
|  | ||||
|         with open(pdf, "rb") as unencrypted: | ||||
|             with open(doc.pdf_path, "wb") as encrypted: | ||||
|         with open(doc, "rb") as unencrypted: | ||||
|             with open(document.source_path, "wb") as encrypted: | ||||
|                 self._render("  Encrypting", 3) | ||||
|                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|     def _parse_file_name(self, pdf): | ||||
|     def _parse_file_name(self, doc): | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the sender and title | ||||
|         easier: | ||||
|           "sender - title.pdf" | ||||
|           "<sender> - <title>.<suffix>" | ||||
|         """ | ||||
|  | ||||
|         # First we attempt "sender - title.pdf" | ||||
|         m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf) | ||||
|         # First we attempt "<sender> - <title>.<suffix>" | ||||
|         m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc) | ||||
|         if m: | ||||
|             sender_name, title = m.group(1), m.group(2) | ||||
|             sender_name, title, file_type = m.group(1), m.group(2), m.group(3) | ||||
|             sender, __ = Sender.objects.get_or_create( | ||||
|                 name=sender_name, defaults={"slug": slugify(sender_name)}) | ||||
|             return sender, title | ||||
|             return sender, title, file_type | ||||
|  | ||||
|         # That didn't work, so we assume sender is None | ||||
|         m = re.match(self.PARSER_REGEX_TITLE, pdf) | ||||
|         return None, m.group(1) | ||||
|         m = re.match(self.PARSER_REGEX_TITLE, doc) | ||||
|         return None, m.group(1), m.group(2) | ||||
|  | ||||
|     def _cleanup(self, pngs, pdf): | ||||
|     def _cleanup(self, pngs, doc): | ||||
|  | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(png_glob)) + [pdf]: | ||||
|         for f in list(glob.glob(png_glob)) + [doc]: | ||||
|             self._render("  Deleting {}".format(f), 2) | ||||
|             os.unlink(f) | ||||
|  | ||||
|   | ||||
							
								
								
									
										21
									
								
								src/documents/migrations/0008_document_file_type.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								src/documents/migrations/0008_document_file_type.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| # Generated by Django 1.9 on 2016-01-29 22:58 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| from django.db import migrations, models | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '0007_auto_20160126_2114'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.AddField( | ||||
|             model_name='document', | ||||
|             name='file_type', | ||||
|             field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF')], default='pdf', editable=False, max_length=4), | ||||
|             preserve_default=False, | ||||
|         ), | ||||
|     ] | ||||
| @@ -111,10 +111,22 @@ class Tag(SluggedModel): | ||||
|  | ||||
| class Document(models.Model): | ||||
|  | ||||
|     TYPE_PDF = "pdf" | ||||
|     TYPE_PNG = "png" | ||||
|     TYPE_JPG = "jpg" | ||||
|     TYPE_GIF = "gif" | ||||
|     TYPE_TIF = "tiff" | ||||
|     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) | ||||
|  | ||||
|     sender = models.ForeignKey( | ||||
|         Sender, blank=True, null=True, related_name="documents") | ||||
|     title = models.CharField(max_length=128, blank=True, db_index=True) | ||||
|     content = models.TextField(db_index=True) | ||||
|     file_type = models.CharField( | ||||
|         max_length=4, | ||||
|         editable=False, | ||||
|         choices=tuple([(t, t.upper()) for t in TYPES]) | ||||
|     ) | ||||
|     tags = models.ManyToManyField(Tag, related_name="documents") | ||||
|     created = models.DateTimeField(default=timezone.now, editable=False) | ||||
|     modified = models.DateTimeField(auto_now=True, editable=False) | ||||
| @@ -131,20 +143,19 @@ class Document(models.Model): | ||||
|         return str(created) | ||||
|  | ||||
|     @property | ||||
|     def pdf_path(self): | ||||
|     def source_path(self): | ||||
|         return os.path.join( | ||||
|             settings.MEDIA_ROOT, | ||||
|             "documents", | ||||
|             "pdf", | ||||
|             "{:07}.pdf.gpg".format(self.pk) | ||||
|             "{:07}.{}.gpg".format(self.pk, self.file_type) | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def pdf(self): | ||||
|         return open(self.pdf_path, "rb") | ||||
|     def source_file(self): | ||||
|         return open(self.source_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def parseable_file_name(self): | ||||
|         if self.sender and self.title: | ||||
|             return "{} - {}.pdf".format(self.sender, self.title) | ||||
|         return os.path.basename(self.pdf_path) | ||||
|             return "{} - {}.{}".format(self.sender, self.title, self.file_type) | ||||
|         return os.path.basename(self.source_path) | ||||
|   | ||||
| @@ -16,9 +16,19 @@ class PdfView(DetailView): | ||||
|         Override the default to return the unencrypted document as raw data. | ||||
|         """ | ||||
|  | ||||
|         content_types = { | ||||
|             Document.TYPE_PDF: "application/pdf", | ||||
|             Document.TYPE_PNG: "image/png", | ||||
|             Document.TYPE_JPG: "image/jpeg", | ||||
|             Document.TYPE_GIF: "image/gif", | ||||
|             Document.TYPE_TIF: "image/tiff", | ||||
|         } | ||||
|  | ||||
|         response = HttpResponse( | ||||
|             GnuPG.decrypted(self.object.pdf), content_type="application/pdf") | ||||
|             GnuPG.decrypted(self.object.source_file), | ||||
|             content_type=content_types[self.object.file_type] | ||||
|         ) | ||||
|         response["Content-Disposition"] = 'attachment; filename="{}"'.format( | ||||
|             slugify(str(self.object)) + ".pdf") | ||||
|             slugify(str(self.object)) + "." + self.object.file_type) | ||||
|  | ||||
|         return response | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn