Merge branch 'master' into issue/81

This commit is contained in:
Daniel Quinn
2016-03-25 20:56:30 +00:00
16 changed files with 598 additions and 167 deletions

View File

@@ -19,12 +19,11 @@ from PIL import Image
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log
from .models import Tag, Document, Log, FileInfo
from .languages import ISO639
from .signals import (
document_consumption_started, document_consumption_finished)
@@ -56,19 +55,6 @@ class Consumer(object):
# Value of the OCR_LANGUAGE Django setting.
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
# Matches a path ending in "/<title>.<suffix>", case-insensitively.
# Group 1 is the title, group 2 the suffix (pdf, jpg/jpeg, png, gif or tiff).
# The path must contain at least one "/" for the pattern to match.
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
# Matches a path ending in "/<correspondent> - <title>.<suffix>".
# Group 1 is the (non-empty) correspondent, group 2 the title and group 3
# the suffix.
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
# Matches a path ending in "/<correspondent> - <title> - <tags>.<suffix>".
# Group 1 is the correspondent, group 2 the title, group 3 the tags and
# group 4 the suffix.  The tags group accepts only letters, digits, "-" and
# "," and, like the correspondent and title groups here, may be empty.
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
def __init__(self):
self.logger = logging.getLogger(__name__)
@@ -107,7 +93,7 @@ class Consumer(object):
if not os.path.isfile(doc):
continue
if not re.match(self.REGEX_TITLE, doc):
if not re.match(FileInfo.REGEXES["title"], doc):
continue
if doc in self._ignore:
@@ -282,72 +268,20 @@ class Consumer(object):
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
def _guess_attributes_from_name(self, parseable):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title>.<suffix>"
"<title>.<suffix>"
"""
def get_correspondent(correspondent_name):
return Correspondent.objects.get_or_create(
name=correspondent_name,
defaults={"slug": slugify(correspondent_name)}
)[0]
def get_tags(tags):
r = []
for t in tags.split(","):
r.append(
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
return tuple(r)
def get_suffix(suffix):
suffix = suffix.lower()
if suffix == "jpeg":
return "jpg"
return suffix
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
if m:
return (
get_correspondent(m.group(1)),
m.group(2),
get_tags(m.group(3)),
get_suffix(m.group(4))
)
# Second attempt: "<correspondent> - <title>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
if m:
return (
get_correspondent(m.group(1)),
m.group(2),
(),
get_suffix(m.group(3))
)
# That didn't work, so we assume correspondent and tags are None
m = re.match(self.REGEX_TITLE, parseable)
return None, m.group(1), (), get_suffix(m.group(2))
def _store(self, text, doc, thumbnail):
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
file_info = FileInfo.from_path(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
stats = os.stat(doc)
self.log("debug", "Saving record to database")
document = Document.objects.create(
correspondent=sender,
title=title,
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
file_type=file_type,
file_type=file_info.extension,
created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(