mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-22 00:52:42 -05:00
Merge branch 'master' into issue/81
This commit is contained in:
@@ -19,12 +19,11 @@ from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.template.defaultfilters import slugify
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
from .models import Tag, Document, Log, FileInfo
|
||||
from .languages import ISO639
|
||||
from .signals import (
|
||||
document_consumption_started, document_consumption_finished)
|
||||
@@ -56,19 +55,6 @@ class Consumer(object):
|
||||
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
|
||||
REGEX_TITLE = re.compile(
|
||||
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
REGEX_CORRESPONDENT_TITLE = re.compile(
|
||||
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
|
||||
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
|
||||
def __init__(self):
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
@@ -107,7 +93,7 @@ class Consumer(object):
|
||||
if not os.path.isfile(doc):
|
||||
continue
|
||||
|
||||
if not re.match(self.REGEX_TITLE, doc):
|
||||
if not re.match(FileInfo.REGEXES["title"], doc):
|
||||
continue
|
||||
|
||||
if doc in self._ignore:
|
||||
@@ -282,72 +268,20 @@ class Consumer(object):
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return re.sub(r"\s+", " ", r)
|
||||
|
||||
def _guess_attributes_from_name(self, parseable):
    """
    Guess a document's attributes from the path of the file being consumed.

    We use a crude naming convention to make handling the correspondent,
    title, and tags easier:

      "<correspondent> - <title> - <tags>.<suffix>"
      "<correspondent> - <title>.<suffix>"
      "<title>.<suffix>"

    The patterns are tried from most to least specific and the first one
    that matches wins.  All of them expect at least one "/" in front of the
    file name, so ``parseable`` has to be a path rather than a bare name.

    Returns a 4-tuple ``(correspondent, title, tags, suffix)``:
    ``correspondent`` is a Correspondent (fetched or created by name) or
    None, ``title`` is a string, ``tags`` is a tuple of Tag objects
    (possibly empty) and ``suffix`` is the lower-cased file extension with
    "jpeg" folded into "jpg".

    Raises ValueError if ``parseable`` matches none of the patterns.
    """

    def get_correspondent(correspondent_name):
        # get_or_create() returns (object, created); only the object is
        # of interest here.
        return Correspondent.objects.get_or_create(
            name=correspondent_name,
            defaults={"slug": slugify(correspondent_name)}
        )[0]

    def get_tags(tags):
        r = []
        for t in tags.split(","):
            # An empty tags field or a stray comma ("a,,b") yields empty
            # fragments; skip them rather than creating a blank tag.
            if not t:
                continue
            # The file name patterns are case-insensitive, so look the tag
            # up by its lower-cased slug: "Foo" and "foo" are the same tag.
            r.append(
                Tag.objects.get_or_create(
                    slug=t.lower(), defaults={"name": t})[0])
        return tuple(r)

    def get_suffix(suffix):
        suffix = suffix.lower()
        if suffix == "jpeg":
            return "jpg"
        return suffix

    # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
    m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
    if m:
        return (
            get_correspondent(m.group(1)),
            m.group(2),
            get_tags(m.group(3)),
            get_suffix(m.group(4))
        )

    # Second attempt: "<correspondent> - <title>.<suffix>"
    m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
    if m:
        return (
            get_correspondent(m.group(1)),
            m.group(2),
            (),
            get_suffix(m.group(3))
        )

    # That didn't work, so we assume correspondent and tags are None
    m = re.match(self.REGEX_TITLE, parseable)
    if m is None:
        # Fail with a message that names the offending path instead of an
        # AttributeError on None.
        raise ValueError(
            "Cannot guess document attributes from {!r}".format(parseable))
    return None, m.group(1), (), get_suffix(m.group(2))
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
||||
file_info = FileInfo.from_path(doc)
|
||||
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
|
||||
|
||||
stats = os.stat(doc)
|
||||
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
document = Document.objects.create(
|
||||
correspondent=sender,
|
||||
title=title,
|
||||
correspondent=file_info.correspondent,
|
||||
title=file_info.title,
|
||||
content=text,
|
||||
file_type=file_type,
|
||||
file_type=file_info.extension,
|
||||
created=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||
modified=timezone.make_aware(
|
||||
|
Reference in New Issue
Block a user