mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Broke the consumer script into separate files and started on a mail consumer
This commit is contained in:
		
							
								
								
									
										9
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										9
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -67,8 +67,9 @@ db.sqlite3 | ||||
|  | ||||
| # Other stuff that doesn't belong | ||||
| virtualenv | ||||
|  | ||||
| scripts/import-for-development | ||||
|  | ||||
| # Vagrant | ||||
| .vagrant | ||||
|  | ||||
| # Used for development | ||||
| scripts/import-for-development | ||||
| environment | ||||
|  | ||||
|   | ||||
							
								
								
									
										3
									
								
								src/documents/consumers/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								src/documents/consumers/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| from .base import Consumer, OCRError | ||||
| from .file import FileConsumer, FileConsumerError | ||||
| from .mail import MailConsumer, MailConsumerError | ||||
							
								
								
									
										157
									
								
								src/documents/consumers/base.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										157
									
								
								src/documents/consumers/base.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,157 @@ | ||||
| import datetime | ||||
| import glob | ||||
| import langdetect | ||||
| import os | ||||
| import random | ||||
| import re | ||||
| import subprocess | ||||
|  | ||||
| import pyocr | ||||
|  | ||||
| from PIL import Image | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
|  | ||||
| from paperless.db import GnuPG | ||||
|  | ||||
| from ..models import Tag, Document | ||||
| from ..languages import ISO639 | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class Consumer(object): | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def __init__(self, verbosity=1): | ||||
|  | ||||
|         self.verbosity = verbosity | ||||
|  | ||||
|         try: | ||||
|             os.makedirs(self.SCRATCH) | ||||
|         except FileExistsError: | ||||
|             pass | ||||
|  | ||||
|     def _get_greyscale(self, doc): | ||||
|  | ||||
|         self._render("  Generating greyscale image", 2) | ||||
|  | ||||
|         i = random.randint(1000000, 9999999) | ||||
|         png = os.path.join(self.SCRATCH, "{}.png".format(i)) | ||||
|  | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", doc, png | ||||
|         )).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_ocr(self, pngs): | ||||
|  | ||||
|         self._render("  OCRing the document", 2) | ||||
|  | ||||
|         raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
|         guessed_language = langdetect.detect(raw_text) | ||||
|  | ||||
|         self._render("    Language detected: {}".format(guessed_language), 2) | ||||
|  | ||||
|         if guessed_language not in ISO639: | ||||
|             self._render("Language detection failed!", 0) | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self._render( | ||||
|                     "As FORGIVING_OCR is enabled, we're going to make the best " | ||||
|                     "with what we have.", | ||||
|                     1 | ||||
|                 ) | ||||
|                 return raw_text | ||||
|             raise OCRError | ||||
|  | ||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(pngs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self._render( | ||||
|                     "OCR for {} failed, but we're going to stick with what " | ||||
|                     "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                         guessed_language | ||||
|                     ), | ||||
|                     0 | ||||
|                 ) | ||||
|                 return raw_text | ||||
|             raise OCRError | ||||
|  | ||||
|     def _ocr(self, pngs, lang): | ||||
|  | ||||
|         self._render("    Parsing for {}".format(lang), 2) | ||||
|  | ||||
|         r = "" | ||||
|         for png in pngs: | ||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|                 self._render("    {}".format(f.filename), 3) | ||||
|                 r += self.OCR.image_to_string(f, lang=lang) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|  | ||||
|     def _guess_file_attributes(self, doc): | ||||
|         raise NotImplementedError( | ||||
|             "At the very least a consumer should determine the file type.") | ||||
|  | ||||
|     def _store(self, text, doc): | ||||
|  | ||||
|         sender, title, file_type = self._guess_file_attributes(doc) | ||||
|  | ||||
|         lower_text = text.lower() | ||||
|         relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)] | ||||
|  | ||||
|         stats = os.stat(doc) | ||||
|  | ||||
|         self._render("  Saving record to database", 2) | ||||
|  | ||||
|         document = Document.objects.create( | ||||
|             sender=sender, | ||||
|             title=title, | ||||
|             content=text, | ||||
|             file_type=file_type, | ||||
|             created=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||
|             modified=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|         ) | ||||
|  | ||||
|         if relevant_tags: | ||||
|             tag_names = ", ".join([t.slug for t in relevant_tags]) | ||||
|             self._render("    Tagging with {}".format(tag_names), 2) | ||||
|             document.tags.add(*relevant_tags) | ||||
|  | ||||
|         with open(doc, "rb") as unencrypted: | ||||
|             with open(document.source_path, "wb") as encrypted: | ||||
|                 self._render("  Encrypting", 3) | ||||
|                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|     def _cleanup(self, pngs, doc): | ||||
|  | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(png_glob)) + [doc]: | ||||
|             self._render("  Deleting {}".format(f), 2) | ||||
|             os.unlink(f) | ||||
|  | ||||
|         self._render("", 2) | ||||
|  | ||||
|     def _render(self, text, verbosity): | ||||
|         if self.verbosity >= verbosity: | ||||
|             print(text) | ||||
							
								
								
									
										106
									
								
								src/documents/consumers/file.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								src/documents/consumers/file.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,106 @@ | ||||
| import os | ||||
| import re | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.template.defaultfilters import slugify | ||||
|  | ||||
| from ..models import Sender | ||||
| from . import Consumer, OCRError | ||||
|  | ||||
|  | ||||
| class FileConsumerError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class FileConsumer(Consumer): | ||||
|  | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|  | ||||
|     PARSER_REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE) | ||||
|     PARSER_REGEX_SENDER_TITLE = re.compile( | ||||
|         r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
|         Consumer.__init__(self, *args, **kwargs) | ||||
|  | ||||
|         self.stats = {} | ||||
|         self._ignore = [] | ||||
|  | ||||
|         if not self.CONSUME: | ||||
|             raise FileConsumerError( | ||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||
|                 "set." | ||||
|             ) | ||||
|  | ||||
|         if not os.path.exists(self.CONSUME): | ||||
|             raise FileConsumerError( | ||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||
|  | ||||
|     def consume(self): | ||||
|  | ||||
|         for doc in os.listdir(self.CONSUME): | ||||
|  | ||||
|             doc = os.path.join(self.CONSUME, doc) | ||||
|  | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
|  | ||||
|             if not re.match(self.PARSER_REGEX_TITLE, doc): | ||||
|                 continue | ||||
|  | ||||
|             if doc in self._ignore: | ||||
|                 continue | ||||
|  | ||||
|             if self._is_ready(doc): | ||||
|                 continue | ||||
|  | ||||
|             self._render("Consuming {}".format(doc), 1) | ||||
|  | ||||
|             pngs = self._get_greyscale(doc) | ||||
|  | ||||
|             try: | ||||
|                 text = self._get_ocr(pngs) | ||||
|             except OCRError: | ||||
|                 self._ignore.append(doc) | ||||
|                 self._render("OCR FAILURE: {}".format(doc), 0) | ||||
|                 continue | ||||
|  | ||||
|             self._store(text, doc) | ||||
|             self._cleanup(pngs, doc) | ||||
|  | ||||
|     def _is_ready(self, doc): | ||||
|         """ | ||||
|         Detect whether `doc` is ready to consume or if it's still being written | ||||
|         to by the uploader. | ||||
|         """ | ||||
|  | ||||
|         t = os.stat(doc).st_mtime | ||||
|  | ||||
|         if self.stats.get(doc) == t: | ||||
|             del(self.stats[doc]) | ||||
|             return True | ||||
|  | ||||
|         self.stats[doc] = t | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _guess_file_attributes(self, doc): | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the sender and title | ||||
|         easier: | ||||
|           "<sender> - <title>.<suffix>" | ||||
|         """ | ||||
|  | ||||
|         # First we attempt "<sender> - <title>.<suffix>" | ||||
|         m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc) | ||||
|         if m: | ||||
|             sender_name, title, file_type = m.group(1), m.group(2), m.group(3) | ||||
|             sender, __ = Sender.objects.get_or_create( | ||||
|                 name=sender_name, defaults={"slug": slugify(sender_name)}) | ||||
|             return sender, title, file_type | ||||
|  | ||||
|         # That didn't work, so we assume sender is None | ||||
|         m = re.match(self.PARSER_REGEX_TITLE, doc) | ||||
|         return None, m.group(1), m.group(2) | ||||
							
								
								
									
										69
									
								
								src/documents/consumers/mail.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								src/documents/consumers/mail.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,69 @@ | ||||
| import datetime | ||||
| import imaplib | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
| from . import Consumer | ||||
|  | ||||
|  | ||||
| class MailConsumerError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class MailConsumer(Consumer): | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
|         Consumer.__init__(self, *args, **kwargs) | ||||
|  | ||||
|         self._connection = None | ||||
|         self._host = settings.MAIL_CONSUMPTION["HOST"] | ||||
|         self._port = settings.MAIL_CONSUMPTION["PORT"] | ||||
|         self._username = settings.MAIL_CONSUMPTION["USERNAME"] | ||||
|         self._password = settings.MAIL_CONSUMPTION["PASSWORD"] | ||||
|         self._inbox = settings.MAIL_CONSUMPTION["INBOX"] | ||||
|  | ||||
|         self._enabled = bool(self._host) | ||||
|  | ||||
|         self.last_checked = datetime.datetime.now() | ||||
|  | ||||
|     def _connect(self): | ||||
|         self._connection = imaplib.IMAP4_SSL(self._host, self._port) | ||||
|  | ||||
|     def _login(self): | ||||
|  | ||||
|         login = self._connection.login(self._username, self._password) | ||||
|         if not login[0] == "OK": | ||||
|             raise MailConsumerError("Can't log into mail: {}".format(login[1])) | ||||
|  | ||||
|         inbox = self._connection.select("INBOX") | ||||
|         if not inbox[0] == "OK": | ||||
|             raise MailConsumerError("Can't find the inbox: {}".format(inbox[1])) | ||||
|  | ||||
|     def _fetch(self): | ||||
|         for num in self._connection.search(None, "ALL")[1][0].split(): | ||||
|             typ, data = self._connection.fetch(num, "(RFC822)") | ||||
|             # self._connection.store(num, "+FLAGS", "\\Deleted") | ||||
|             yield data[0][1] | ||||
|  | ||||
|     def consume(self): | ||||
|  | ||||
|         if self._enabled: | ||||
|             self.get_messages() | ||||
|  | ||||
|         self.last_checked = datetime.datetime.now() | ||||
|  | ||||
|     def get_messages(self): | ||||
|  | ||||
|         self._connect() | ||||
|         self._login() | ||||
|  | ||||
|         for message in self._fetch(): | ||||
|             print(message)  # Now we have to do something with the attachment | ||||
|  | ||||
|         self._connection.expunge() | ||||
|         self._connection.close() | ||||
|         self._connection.logout() | ||||
|  | ||||
|     def _guess_file_attributes(self, doc): | ||||
|         return None, None, "jpg" | ||||
| @@ -1,29 +1,12 @@ | ||||
| import datetime | ||||
| import glob | ||||
| import langdetect | ||||
| import os | ||||
| import random | ||||
| import re | ||||
| import subprocess | ||||
| import time | ||||
|  | ||||
| import pyocr | ||||
|  | ||||
| from PIL import Image | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| from django.template.defaultfilters import slugify | ||||
| from django.utils import timezone | ||||
|  | ||||
| from paperless.db import GnuPG | ||||
|  | ||||
| from ...languages import ISO639 | ||||
| from ...models import Document, Sender, Tag | ||||
|  | ||||
|  | ||||
| class OCRError(BaseException): | ||||
|     pass | ||||
| from ...consumers import ( | ||||
|     FileConsumer, FileConsumerError, MailConsumer, MailConsumerError) | ||||
|  | ||||
|  | ||||
| class Command(BaseCommand): | ||||
| @@ -37,25 +20,16 @@ class Command(BaseCommand): | ||||
|     """ | ||||
|  | ||||
|     LOOP_TIME = 10  # Seconds | ||||
|     MAIL_DELTA = datetime.timedelta(minutes=10) | ||||
|  | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents") | ||||
|  | ||||
|     PARSER_REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE) | ||||
|     PARSER_REGEX_SENDER_TITLE = re.compile( | ||||
|         r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
|         self.verbosity = 0 | ||||
|         self.stats = {} | ||||
|         self._ignore = [] | ||||
|  | ||||
|         self.file_consumer = None | ||||
|         self.mail_consumer = None | ||||
|  | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
| @@ -63,7 +37,16 @@ class Command(BaseCommand): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|  | ||||
|         self._setup() | ||||
|         try: | ||||
|             self.file_consumer = FileConsumer(verbosity=self.verbosity) | ||||
|             self.mail_consumer = MailConsumer(verbosity=self.verbosity) | ||||
|         except (FileConsumerError, MailConsumerError) as e: | ||||
|             raise CommandError(e) | ||||
|  | ||||
|         try: | ||||
|             os.makedirs(self.MEDIA_DOCS) | ||||
|         except FileExistsError: | ||||
|             pass | ||||
|  | ||||
|         try: | ||||
|             while True: | ||||
| @@ -76,196 +59,11 @@ class Command(BaseCommand): | ||||
|  | ||||
|     def loop(self): | ||||
|  | ||||
|         for doc in os.listdir(self.CONSUME): | ||||
|         self.file_consumer.consume() | ||||
|  | ||||
|             doc = os.path.join(self.CONSUME, doc) | ||||
|  | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
|  | ||||
|             if not re.match(self.PARSER_REGEX_TITLE, doc): | ||||
|                 continue | ||||
|  | ||||
|             if doc in self._ignore: | ||||
|                 continue | ||||
|  | ||||
|             if self._is_ready(doc): | ||||
|                 continue | ||||
|  | ||||
|             self._render("Consuming {}".format(doc), 1) | ||||
|  | ||||
|             pngs = self._get_greyscale(doc) | ||||
|  | ||||
|             try: | ||||
|                 text = self._get_ocr(pngs) | ||||
|             except OCRError: | ||||
|                 self._ignore.append(doc) | ||||
|                 self._render("OCR FAILURE: {}".format(doc), 0) | ||||
|                 continue | ||||
|  | ||||
|             self._store(text, doc) | ||||
|             self._cleanup(pngs, doc) | ||||
|  | ||||
|     def _setup(self): | ||||
|  | ||||
|         if not self.CONSUME: | ||||
|             raise CommandError( | ||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||
|                 "set." | ||||
|             ) | ||||
|  | ||||
|         if not os.path.exists(self.CONSUME): | ||||
|             raise CommandError("Consumption directory {} does not exist".format( | ||||
|                 self.CONSUME)) | ||||
|  | ||||
|         for d in (self.SCRATCH, self.MEDIA_DOCS): | ||||
|             try: | ||||
|                 os.makedirs(d) | ||||
|             except FileExistsError: | ||||
|                 pass | ||||
|  | ||||
|     def _is_ready(self, doc): | ||||
|         """ | ||||
|         Detect whether `doc` is ready to consume or if it's still being written | ||||
|         to by the scanner. | ||||
|         """ | ||||
|  | ||||
|         t = os.stat(doc).st_mtime | ||||
|  | ||||
|         if self.stats.get(doc) == t: | ||||
|             del(self.stats[doc]) | ||||
|             return True | ||||
|  | ||||
|         self.stats[doc] = t | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def _get_greyscale(self, doc): | ||||
|  | ||||
|         self._render("  Generating greyscale image", 2) | ||||
|  | ||||
|         i = random.randint(1000000, 9999999) | ||||
|         png = os.path.join(self.SCRATCH, "{}.png".format(i)) | ||||
|  | ||||
|         subprocess.Popen(( | ||||
|             self.CONVERT, "-density", "300", "-depth", "8", | ||||
|             "-type", "grayscale", doc, png | ||||
|         )).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_ocr(self, pngs): | ||||
|  | ||||
|         self._render("  OCRing the document", 2) | ||||
|  | ||||
|         raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
|         guessed_language = langdetect.detect(raw_text) | ||||
|  | ||||
|         self._render("    Language detected: {}".format(guessed_language), 2) | ||||
|  | ||||
|         if guessed_language not in ISO639: | ||||
|             self._render("Language detection failed!", 0) | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self._render( | ||||
|                     "As FORGIVING_OCR is enabled, we're going to make the best " | ||||
|                     "with what we have.", | ||||
|                     1 | ||||
|                 ) | ||||
|                 return raw_text | ||||
|             raise OCRError | ||||
|  | ||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(pngs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self._render( | ||||
|                     "OCR for {} failed, but we're going to stick with what " | ||||
|                     "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                         guessed_language | ||||
|                     ), | ||||
|                     0 | ||||
|                 ) | ||||
|                 return raw_text | ||||
|             raise OCRError | ||||
|  | ||||
|     def _ocr(self, pngs, lang): | ||||
|  | ||||
|         self._render("    Parsing for {}".format(lang), 2) | ||||
|  | ||||
|         r = "" | ||||
|         for png in pngs: | ||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|                 self._render("    {}".format(f.filename), 3) | ||||
|                 r += self.OCR.image_to_string(f, lang=lang) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|  | ||||
|     def _store(self, text, doc): | ||||
|  | ||||
|         sender, title, file_type = self._parse_file_name(doc) | ||||
|  | ||||
|         lower_text = text.lower() | ||||
|         relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)] | ||||
|  | ||||
|         stats = os.stat(doc) | ||||
|  | ||||
|         self._render("  Saving record to database", 2) | ||||
|  | ||||
|         document = Document.objects.create( | ||||
|             sender=sender, | ||||
|             title=title, | ||||
|             content=text, | ||||
|             file_type=file_type, | ||||
|             created=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||
|             modified=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|         ) | ||||
|  | ||||
|         if relevant_tags: | ||||
|             tag_names = ", ".join([t.slug for t in relevant_tags]) | ||||
|             self._render("    Tagging with {}".format(tag_names), 2) | ||||
|             document.tags.add(*relevant_tags) | ||||
|  | ||||
|         with open(doc, "rb") as unencrypted: | ||||
|             with open(document.source_path, "wb") as encrypted: | ||||
|                 self._render("  Encrypting", 3) | ||||
|                 encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|     def _parse_file_name(self, doc): | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the sender and title | ||||
|         easier: | ||||
|           "<sender> - <title>.<suffix>" | ||||
|         """ | ||||
|  | ||||
|         # First we attempt "<sender> - <title>.<suffix>" | ||||
|         m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc) | ||||
|         if m: | ||||
|             sender_name, title, file_type = m.group(1), m.group(2), m.group(3) | ||||
|             sender, __ = Sender.objects.get_or_create( | ||||
|                 name=sender_name, defaults={"slug": slugify(sender_name)}) | ||||
|             return sender, title, file_type | ||||
|  | ||||
|         # That didn't work, so we assume sender is None | ||||
|         m = re.match(self.PARSER_REGEX_TITLE, doc) | ||||
|         return None, m.group(1), m.group(2) | ||||
|  | ||||
|     def _cleanup(self, pngs, doc): | ||||
|  | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(png_glob)) + [doc]: | ||||
|             self._render("  Deleting {}".format(f), 2) | ||||
|             os.unlink(f) | ||||
|  | ||||
|         self._render("", 2) | ||||
|         now = datetime.datetime.now() | ||||
|         if self.mail_consumer.last_checked + self.MAIL_DELTA > now: | ||||
|             self.mail_consumer.consume() | ||||
|  | ||||
|     def _render(self, text, verbosity): | ||||
|         if self.verbosity >= verbosity: | ||||
|   | ||||
| @@ -47,7 +47,7 @@ class Command(BaseCommand): | ||||
|             self._render("Exporting: {}".format(target), 1) | ||||
|  | ||||
|             with open(target, "wb") as f: | ||||
|                 f.write(GnuPG.decrypted(document.pdf)) | ||||
|                 f.write(GnuPG.decrypted(document.source_file)) | ||||
|                 t = int(time.mktime(document.created.timetuple())) | ||||
|                 os.utime(target, times=(t, t)) | ||||
|  | ||||
|   | ||||
| @@ -162,7 +162,21 @@ SCRATCH_DIR = "/tmp/paperless" | ||||
| # This is where Paperless will look for PDFs to index | ||||
| CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") | ||||
|  | ||||
| # Set this and change the permissions on this file to 0600, or set it to | ||||
| # `None` and you'll be prompted for the passphrase at runtime.  The default | ||||
| # looks for an environment variable. | ||||
| # If you want to use IMAP mail consumption, populate this with useful values. | ||||
| # If you leave HOST set to None, we assume you're not going to use this feature. | ||||
| MAIL_CONSUMPTION = { | ||||
|     "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), | ||||
|     "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), | ||||
|     "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"), | ||||
|     "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"), | ||||
|     "USE_SSL": True,  # If True, use SSL/TLS to connect | ||||
|     "INBOX": "INBOX"  # The name of the inbox on the server | ||||
| } | ||||
|  | ||||
| # This is used to encrypt the original documents and decrypt them later when you | ||||
| # want to download them.  Set it and change the permissions on this file to | ||||
| # 0600, or set it to `None` and you'll be prompted for the passphrase at | ||||
| # runtime.  The default looks for an environment variable. | ||||
| # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things with | ||||
| # GPG, including an interesting case where it may "encrypt" zero-byte files. | ||||
| PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn