mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Broke the consumer script into separate files and started on a mail consumer
This commit is contained in:
parent
84d5f8cc5d
commit
a70b40f618
9
.gitignore
vendored
9
.gitignore
vendored
@ -67,8 +67,9 @@ db.sqlite3
|
|||||||
|
|
||||||
# Other stuff that doesn't belong
|
# Other stuff that doesn't belong
|
||||||
virtualenv
|
virtualenv
|
||||||
|
|
||||||
scripts/import-for-development
|
|
||||||
|
|
||||||
# Vagrant
|
|
||||||
.vagrant
|
.vagrant
|
||||||
|
|
||||||
|
# Used for development
|
||||||
|
scripts/import-for-development
|
||||||
|
environment
|
||||||
|
|
||||||
|
3
src/documents/consumers/__init__.py
Normal file
3
src/documents/consumers/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Public API of the consumers package.
#
# OCRError is re-exported here because file.py imports it with
# `from . import Consumer, OCRError`; without this the package fails to import.
# `.base` must stay first: file.py and mail.py pull these names from the
# partially initialised package while it is still being imported.
from .base import Consumer, OCRError
from .file import FileConsumer, FileConsumerError
from .mail import MailConsumer, MailConsumerError
|
157
src/documents/consumers/base.py
Normal file
157
src/documents/consumers/base.py
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
import datetime
|
||||||
|
import glob
|
||||||
|
import langdetect
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
import pyocr
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
|
from ..models import Tag, Document
|
||||||
|
from ..languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
|
class OCRError(Exception):
    """
    Raised by the consumer when OCR or language detection fails and the
    failure is not being forgiven.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class Consumer(object):
    """
    Shared machinery for consuming a document: render it to greyscale PNGs,
    OCR those images, store a `Document` record with matching tags, write the
    encrypted original, and finally delete the temporary and source files.

    Subclasses must implement `_guess_file_attributes()` and are expected to
    provide their own `consume()` entry point.
    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY

    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def __init__(self, verbosity=1):
        """
        :param verbosity: messages passed to `_render()` with a level above
                          this value are suppressed.
        """

        self.verbosity = verbosity

        try:
            os.makedirs(self.SCRATCH)
        except FileExistsError:
            pass

    def _get_greyscale(self, doc):
        """
        Run the configured convert binary on `doc` and return the sorted list
        of files it produced in the scratch directory.
        """

        self._render(" Generating greyscale image", 2)

        # A random numeric prefix lets us find this run's output with a glob
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        ))

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

    def _get_ocr(self, pngs):
        """
        OCR `pngs` with the default language, guess the actual language from
        the result and, if it differs, OCR again using the guessed language.

        Returns the recognised text.  Raises `OCRError` if the language can't
        be mapped or the second pass fails, unless `settings.FORGIVING_OCR` is
        set, in which case the text from the first pass is returned instead.
        """

        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        guessed_language = langdetect.detect(raw_text)

        self._render(" Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        # The first pass already used the right language, so we're done
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        OCR every image in `pngs` using `lang` and return the concatenated
        text with each run of whitespace collapsed to a single space.
        """

        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(pages))

    def _guess_file_attributes(self, doc):
        """
        Return a `(sender, title, file_type)` tuple for `doc`.  Must be
        overridden by subclasses.
        """
        raise NotImplementedError(
            "At the very least a consumer should determine the file type.")

    def _store(self, text, doc):
        """
        Create the `Document` record for `doc` using `text` as its content,
        attach every tag that matches the text, and write the encrypted
        original to the document's `source_path`.
        """

        sender, title, file_type = self._guess_file_attributes(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        stats = os.stat(doc)

        self._render(" Saving record to database", 2)

        # Both timestamps are taken from the source file's mtime
        timestamp = timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=timestamp,
            modified=timestamp
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted:
            with open(document.source_path, "wb") as encrypted:
                self._render(" Encrypting", 3)
                encrypted.write(GnuPG.encrypted(unencrypted))

    def _cleanup(self, pngs, doc):
        """
        Delete the scratch images belonging to the same run as `pngs[0]`, then
        delete the source document `doc` itself.
        """

        # Turn ".../<prefix>-<page>.png" into "<prefix>*".  A name that doesn't
        # match is left unchanged by re.sub(), so the glob then targets that
        # one path (NOTE: this relies on os.path.join() discarding SCRATCH,
        # which it only does when the path is absolute -- confirm SCRATCH_DIR
        # is always absolute).
        png_glob = os.path.join(
            self.SCRATCH, re.sub(r"^.*/(\d+)-\d+\.png$", "\\1*", pngs[0]))

        for f in glob.glob(png_glob) + [doc]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """
        Print `text` if this consumer's verbosity is at least `verbosity`.
        """
        if self.verbosity >= verbosity:
            print(text)
|
106
src/documents/consumers/file.py
Normal file
106
src/documents/consumers/file.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.template.defaultfilters import slugify
|
||||||
|
|
||||||
|
from ..models import Sender
|
||||||
|
from . import Consumer, OCRError
|
||||||
|
|
||||||
|
|
||||||
|
class FileConsumerError(Exception):
    """
    Raised when the file consumer cannot be set up, e.g. the consumption
    directory is not configured or does not exist.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class FileConsumer(Consumer):
    """
    Consumes documents dropped into `settings.CONSUMPTION_DIR`.

    Only regular files whose names end in one of the supported suffixes are
    considered, and a file is only picked up once its modification time has
    been seen unchanged on two consecutive calls to `consume()`.
    """

    CONSUME = settings.CONSUMPTION_DIR

    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    # Anchored at the end like PARSER_REGEX_TITLE so both agree on the suffix
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """
        Raises `FileConsumerError` if the consumption directory is not
        configured or does not exist.
        """

        Consumer.__init__(self, *args, **kwargs)

        self.stats = {}    # path -> mtime seen on the previous pass
        self._ignore = []  # paths that failed OCR; never retried

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory and consume every
        supported file that is ready.
        """

        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            if not re.match(self.PARSER_REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that may still be being written; they'll be picked up
            # on a later pass once their mtime has stopped changing.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        Returns True when the file's mtime equals the one recorded on the
        previous call (and forgets the record); otherwise records the current
        mtime and returns False.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

    def _guess_file_attributes(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        Returns `(sender, title, file_type)`; `sender` is None when the name
        doesn't contain a " - " separated sender.
        """

        # First we attempt "<sender> - <title>.<suffix>"
        m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = re.match(self.PARSER_REGEX_TITLE, doc)
        return None, m.group(1), m.group(2)
|
69
src/documents/consumers/mail.py
Normal file
69
src/documents/consumers/mail.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
import datetime
|
||||||
|
import imaplib
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from . import Consumer
|
||||||
|
|
||||||
|
|
||||||
|
class MailConsumerError(Exception):
    """
    Raised when the mail consumer cannot log in to the mail server or select
    the inbox.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class MailConsumer(Consumer):
    """
    Work-in-progress consumer that fetches messages over IMAP using the
    values in `settings.MAIL_CONSUMPTION`.

    The consumer is disabled (and `consume()` only updates `last_checked`)
    when no HOST is configured.
    """

    def __init__(self, *args, **kwargs):

        Consumer.__init__(self, *args, **kwargs)

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """
        Open the IMAP connection, over SSL/TLS unless USE_SSL is set to a
        false value in `settings.MAIL_CONSUMPTION`.
        """

        # USE_SSL defaults to True so a settings dict without the key keeps
        # the previous always-SSL behaviour.
        if settings.MAIL_CONSUMPTION.get("USE_SSL", True):
            imap_class, default_port = imaplib.IMAP4_SSL, imaplib.IMAP4_SSL_PORT
        else:
            imap_class, default_port = imaplib.IMAP4, imaplib.IMAP4_PORT

        # The port may be unset, or a string when read from the environment
        port = int(self._port) if self._port else default_port

        self._connection = imap_class(self._host, port)

    def _login(self):
        """
        Authenticate and select the configured inbox.  Raises
        `MailConsumerError` if either step is refused by the server.
        """

        login = self._connection.login(self._username, self._password)
        if login[0] != "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        inbox = self._connection.select(self._inbox)
        if inbox[0] != "OK":
            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield the raw RFC822 payload of every message in the selected mailbox.
        """
        for num in self._connection.search(None, "ALL")[1][0].split():
            __, data = self._connection.fetch(num, "(RFC822)")
            # TODO: flag the message as deleted once attachments are handled:
            # self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    def consume(self):
        """
        Fetch messages if the consumer is enabled, then record the time of
        this check in `last_checked`.
        """

        if self._enabled:
            self.get_messages()

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """
        Connect, log in, process every message in the inbox and disconnect.
        """

        self._connect()

        # Always log out, even if login or fetching fails, so the connection
        # isn't left open.
        try:

            self._login()

            for message in self._fetch():
                print(message)  # Now we have to do something with the attachment

            self._connection.expunge()
            self._connection.close()

        finally:
            self._connection.logout()

    def _guess_file_attributes(self, doc):
        """
        Placeholder: no sender or title, and the file type is assumed to be
        "jpg".
        """
        return None, None, "jpg"
|
@ -1,29 +1,12 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import glob
|
|
||||||
import langdetect
|
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import subprocess
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
import pyocr
|
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand, CommandError
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
from django.template.defaultfilters import slugify
|
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
from paperless.db import GnuPG
|
from ...consumers import (
|
||||||
|
FileConsumer, FileConsumerError, MailConsumer, MailConsumerError)
|
||||||
from ...languages import ISO639
|
|
||||||
from ...models import Document, Sender, Tag
|
|
||||||
|
|
||||||
|
|
||||||
class OCRError(BaseException):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
@ -37,25 +20,16 @@ class Command(BaseCommand):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
LOOP_TIME = 10 # Seconds
|
LOOP_TIME = 10 # Seconds
|
||||||
|
MAIL_DELTA = datetime.timedelta(minutes=10)
|
||||||
|
|
||||||
CONVERT = settings.CONVERT_BINARY
|
|
||||||
SCRATCH = settings.SCRATCH_DIR
|
|
||||||
CONSUME = settings.CONSUMPTION_DIR
|
|
||||||
|
|
||||||
OCR = pyocr.get_available_tools()[0]
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
|
||||||
MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
|
MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
|
||||||
|
|
||||||
PARSER_REGEX_TITLE = re.compile(
|
|
||||||
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
|
|
||||||
PARSER_REGEX_SENDER_TITLE = re.compile(
|
|
||||||
r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
||||||
self.verbosity = 0
|
self.verbosity = 0
|
||||||
self.stats = {}
|
|
||||||
self._ignore = []
|
self.file_consumer = None
|
||||||
|
self.mail_consumer = None
|
||||||
|
|
||||||
BaseCommand.__init__(self, *args, **kwargs)
|
BaseCommand.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
@ -63,7 +37,16 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
self.verbosity = options["verbosity"]
|
self.verbosity = options["verbosity"]
|
||||||
|
|
||||||
self._setup()
|
try:
|
||||||
|
self.file_consumer = FileConsumer(verbosity=self.verbosity)
|
||||||
|
self.mail_consumer = MailConsumer(verbosity=self.verbosity)
|
||||||
|
except (FileConsumerError, MailConsumerError) as e:
|
||||||
|
raise CommandError(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
os.makedirs(self.MEDIA_DOCS)
|
||||||
|
except FileExistsError:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
@ -76,196 +59,11 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def loop(self):
|
def loop(self):
|
||||||
|
|
||||||
for doc in os.listdir(self.CONSUME):
|
self.file_consumer.consume()
|
||||||
|
|
||||||
doc = os.path.join(self.CONSUME, doc)
|
now = datetime.datetime.now()
|
||||||
|
if self.mail_consumer.last_checked + self.MAIL_DELTA > now:
|
||||||
if not os.path.isfile(doc):
|
self.mail_consumer.consume()
|
||||||
continue
|
|
||||||
|
|
||||||
if not re.match(self.PARSER_REGEX_TITLE, doc):
|
|
||||||
continue
|
|
||||||
|
|
||||||
if doc in self._ignore:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if self._is_ready(doc):
|
|
||||||
continue
|
|
||||||
|
|
||||||
self._render("Consuming {}".format(doc), 1)
|
|
||||||
|
|
||||||
pngs = self._get_greyscale(doc)
|
|
||||||
|
|
||||||
try:
|
|
||||||
text = self._get_ocr(pngs)
|
|
||||||
except OCRError:
|
|
||||||
self._ignore.append(doc)
|
|
||||||
self._render("OCR FAILURE: {}".format(doc), 0)
|
|
||||||
continue
|
|
||||||
|
|
||||||
self._store(text, doc)
|
|
||||||
self._cleanup(pngs, doc)
|
|
||||||
|
|
||||||
def _setup(self):
|
|
||||||
|
|
||||||
if not self.CONSUME:
|
|
||||||
raise CommandError(
|
|
||||||
"The CONSUMPTION_DIR settings variable does not appear to be "
|
|
||||||
"set."
|
|
||||||
)
|
|
||||||
|
|
||||||
if not os.path.exists(self.CONSUME):
|
|
||||||
raise CommandError("Consumption directory {} does not exist".format(
|
|
||||||
self.CONSUME))
|
|
||||||
|
|
||||||
for d in (self.SCRATCH, self.MEDIA_DOCS):
|
|
||||||
try:
|
|
||||||
os.makedirs(d)
|
|
||||||
except FileExistsError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _is_ready(self, doc):
|
|
||||||
"""
|
|
||||||
Detect whether `doc` is ready to consume or if it's still being written
|
|
||||||
to by the scanner.
|
|
||||||
"""
|
|
||||||
|
|
||||||
t = os.stat(doc).st_mtime
|
|
||||||
|
|
||||||
if self.stats.get(doc) == t:
|
|
||||||
del(self.stats[doc])
|
|
||||||
return True
|
|
||||||
|
|
||||||
self.stats[doc] = t
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _get_greyscale(self, doc):
|
|
||||||
|
|
||||||
self._render(" Generating greyscale image", 2)
|
|
||||||
|
|
||||||
i = random.randint(1000000, 9999999)
|
|
||||||
png = os.path.join(self.SCRATCH, "{}.png".format(i))
|
|
||||||
|
|
||||||
subprocess.Popen((
|
|
||||||
self.CONVERT, "-density", "300", "-depth", "8",
|
|
||||||
"-type", "grayscale", doc, png
|
|
||||||
)).wait()
|
|
||||||
|
|
||||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
|
||||||
|
|
||||||
def _get_ocr(self, pngs):
|
|
||||||
|
|
||||||
self._render(" OCRing the document", 2)
|
|
||||||
|
|
||||||
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
|
||||||
|
|
||||||
guessed_language = langdetect.detect(raw_text)
|
|
||||||
|
|
||||||
self._render(" Language detected: {}".format(guessed_language), 2)
|
|
||||||
|
|
||||||
if guessed_language not in ISO639:
|
|
||||||
self._render("Language detection failed!", 0)
|
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self._render(
|
|
||||||
"As FORGIVING_OCR is enabled, we're going to make the best "
|
|
||||||
"with what we have.",
|
|
||||||
1
|
|
||||||
)
|
|
||||||
return raw_text
|
|
||||||
raise OCRError
|
|
||||||
|
|
||||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
|
||||||
return raw_text
|
|
||||||
|
|
||||||
try:
|
|
||||||
return self._ocr(pngs, ISO639[guessed_language])
|
|
||||||
except pyocr.pyocr.tesseract.TesseractError:
|
|
||||||
if settings.FORGIVING_OCR:
|
|
||||||
self._render(
|
|
||||||
"OCR for {} failed, but we're going to stick with what "
|
|
||||||
"we've got since FORGIVING_OCR is enabled.".format(
|
|
||||||
guessed_language
|
|
||||||
),
|
|
||||||
0
|
|
||||||
)
|
|
||||||
return raw_text
|
|
||||||
raise OCRError
|
|
||||||
|
|
||||||
def _ocr(self, pngs, lang):
|
|
||||||
|
|
||||||
self._render(" Parsing for {}".format(lang), 2)
|
|
||||||
|
|
||||||
r = ""
|
|
||||||
for png in pngs:
|
|
||||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
|
||||||
self._render(" {}".format(f.filename), 3)
|
|
||||||
r += self.OCR.image_to_string(f, lang=lang)
|
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
|
||||||
return re.sub(r"\s+", " ", r)
|
|
||||||
|
|
||||||
def _store(self, text, doc):
|
|
||||||
|
|
||||||
sender, title, file_type = self._parse_file_name(doc)
|
|
||||||
|
|
||||||
lower_text = text.lower()
|
|
||||||
relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
|
|
||||||
|
|
||||||
stats = os.stat(doc)
|
|
||||||
|
|
||||||
self._render(" Saving record to database", 2)
|
|
||||||
|
|
||||||
document = Document.objects.create(
|
|
||||||
sender=sender,
|
|
||||||
title=title,
|
|
||||||
content=text,
|
|
||||||
file_type=file_type,
|
|
||||||
created=timezone.make_aware(
|
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
|
||||||
modified=timezone.make_aware(
|
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
|
||||||
)
|
|
||||||
|
|
||||||
if relevant_tags:
|
|
||||||
tag_names = ", ".join([t.slug for t in relevant_tags])
|
|
||||||
self._render(" Tagging with {}".format(tag_names), 2)
|
|
||||||
document.tags.add(*relevant_tags)
|
|
||||||
|
|
||||||
with open(doc, "rb") as unencrypted:
|
|
||||||
with open(document.source_path, "wb") as encrypted:
|
|
||||||
self._render(" Encrypting", 3)
|
|
||||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
|
||||||
|
|
||||||
def _parse_file_name(self, doc):
|
|
||||||
"""
|
|
||||||
We use a crude naming convention to make handling the sender and title
|
|
||||||
easier:
|
|
||||||
"<sender> - <title>.<suffix>"
|
|
||||||
"""
|
|
||||||
|
|
||||||
# First we attempt "<sender> - <title>.<suffix>"
|
|
||||||
m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
|
|
||||||
if m:
|
|
||||||
sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
|
|
||||||
sender, __ = Sender.objects.get_or_create(
|
|
||||||
name=sender_name, defaults={"slug": slugify(sender_name)})
|
|
||||||
return sender, title, file_type
|
|
||||||
|
|
||||||
# That didn't work, so we assume sender is None
|
|
||||||
m = re.match(self.PARSER_REGEX_TITLE, doc)
|
|
||||||
return None, m.group(1), m.group(2)
|
|
||||||
|
|
||||||
def _cleanup(self, pngs, doc):
|
|
||||||
|
|
||||||
png_glob = os.path.join(
|
|
||||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
|
||||||
|
|
||||||
for f in list(glob.glob(png_glob)) + [doc]:
|
|
||||||
self._render(" Deleting {}".format(f), 2)
|
|
||||||
os.unlink(f)
|
|
||||||
|
|
||||||
self._render("", 2)
|
|
||||||
|
|
||||||
def _render(self, text, verbosity):
|
def _render(self, text, verbosity):
|
||||||
if self.verbosity >= verbosity:
|
if self.verbosity >= verbosity:
|
||||||
|
@ -47,7 +47,7 @@ class Command(BaseCommand):
|
|||||||
self._render("Exporting: {}".format(target), 1)
|
self._render("Exporting: {}".format(target), 1)
|
||||||
|
|
||||||
with open(target, "wb") as f:
|
with open(target, "wb") as f:
|
||||||
f.write(GnuPG.decrypted(document.pdf))
|
f.write(GnuPG.decrypted(document.source_file))
|
||||||
t = int(time.mktime(document.created.timetuple()))
|
t = int(time.mktime(document.created.timetuple()))
|
||||||
os.utime(target, times=(t, t))
|
os.utime(target, times=(t, t))
|
||||||
|
|
||||||
|
@ -162,7 +162,21 @@ SCRATCH_DIR = "/tmp/paperless"
|
|||||||
# This is where Paperless will look for PDFs to index
|
# This is where Paperless will look for PDFs to index
|
||||||
CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
|
CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
|
||||||
|
|
||||||
# Set this and change the permissions on this file to 0600, or set it to
|
# If you want to use IMAP mail consumption, populate this with useful values.
|
||||||
# `None` and you'll be prompted for the passphrase at runtime. The default
|
# If you leave HOST set to None, we assume you're not going to use this feature.
|
||||||
# looks for an environment variable.
|
MAIL_CONSUMPTION = {
|
||||||
|
"HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
|
||||||
|
"PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
|
||||||
|
"USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
|
||||||
|
"PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
|
||||||
|
"USE_SSL": True, # If True, use SSL/TLS to connect
|
||||||
|
"INBOX": "INBOX" # The name of the inbox on the server
|
||||||
|
}
|
||||||
|
|
||||||
|
# This is used to encrypt the original documents and decrypt them later when you
|
||||||
|
# want to download them. Set it and change the permissions on this file to
|
||||||
|
# 0600, or set it to `None` and you'll be prompted for the passphrase at
|
||||||
|
# runtime. The default looks for an environment variable.
|
||||||
|
# DON'T FORGET TO SET THIS as leaving it blank may cause some strange things with
|
||||||
|
# GPG, including an interesting case where it may "encrypt" zero-byte files.
|
||||||
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user