Extract filename parsing into testable class

This commit is contained in:
Tikitu de Jager 2016-03-07 21:05:04 +02:00
parent 0b34894db9
commit 1f75af0137
2 changed files with 68 additions and 66 deletions

View File

@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log
from .models import Correspondent, Tag, Document, Log, FileInfo
from .languages import ISO639
@ -54,19 +54,6 @@ class Consumer(object):
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
# File-name patterns, all matched case-insensitively.  Each one expects a
# "/" somewhere before the file name and captures the suffix as its last
# group.

# "<title>.<suffix>"
REGEX_TITLE = re.compile(
    r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title>.<suffix>"
REGEX_CORRESPONDENT_TITLE = re.compile(
    r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title> - <tags>.<suffix>"; <tags> may only contain
# letters, digits, "-" and ",".
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
    r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
    re.IGNORECASE)
def __init__(self):
self.logger = logging.getLogger(__name__)
@ -105,7 +92,7 @@ class Consumer(object):
if not os.path.isfile(doc):
continue
if not re.match(self.REGEX_TITLE, doc):
if not re.match(FileInfo.REGEX_TITLE, doc):
continue
if doc in self._ignore:
@ -270,56 +257,8 @@ class Consumer(object):
return re.sub(r"\s+", " ", r)
def _guess_attributes_from_name(self, parseable):
    """
    Guess a document's attributes from its file path.

    The file-name convention is parsed by ``FileInfo.from_path``; this
    method only unpacks the result into the 4-tuple its callers expect:

      (correspondent, title, tags, suffix)
    """
    file_info = FileInfo.from_path(parseable)

    # NOTE(review): FileInfo.from_path builds its result with a
    # ``correspondent=`` keyword, while the attribute read here is
    # ``sender`` -- confirm FileInfo really exposes it under that name.
    return (
        file_info.sender,
        file_info.title,
        file_info.tags,
        file_info.suffix,
    )
def _store(self, text, doc, thumbnail):

View File

@ -23,9 +23,72 @@ class FileInfo(object):
self._file_mtime = file_mtime
self._path = path
# File-name patterns, all matched case-insensitively.  Each one expects a
# "/" somewhere before the file name and captures the suffix as its last
# group.

# "<title>.<suffix>"
REGEX_TITLE = re.compile(
    r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title>.<suffix>"
REGEX_CORRESPONDENT_TITLE = re.compile(
    r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title> - <tags>.<suffix>"; <tags> may only contain
# letters, digits, "-" and ",".
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
    r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
    re.IGNORECASE)
@classmethod
def from_path(cls, path):
    """
    Build an instance by parsing the file name at the end of ``path``.

    We use a crude naming convention to make handling the correspondent,
    title, and tags easier:

      "<correspondent> - <title> - <tags>.<suffix>"
      "<correspondent> - <title>.<suffix>"
      "<title>.<suffix>"

    The patterns are tried in that order, most specific first.  ``<tags>``
    is a comma-separated list; the suffix is lower-cased and "jpeg" is
    normalised to "jpg".

    Raises ValueError if ``path`` matches none of the patterns (every
    pattern requires a "/" before the file name and one of the supported
    suffixes).
    """

    def get_correspondent(correspondent_name):
        # Look the correspondent up by name; when get_or_create has to
        # create it, the slug defaults to the slugified name.
        return Correspondent.objects.get_or_create(
            name=correspondent_name,
            defaults={"slug": slugify(correspondent_name)}
        )[0]

    def get_tags(tags):
        # The tags pattern also accepts an empty string and stray commas;
        # skip the resulting empty names so that no Tag with an empty slug
        # is ever requested.
        return tuple(
            Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
            for t in tags.split(",")
            if t
        )

    def get_suffix(suffix):
        suffix = suffix.lower()
        return "jpg" if suffix == "jpeg" else suffix

    # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
    m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
    if m:
        return cls(
            title=m.group(2),
            correspondent=get_correspondent(m.group(1)),
            tags=get_tags(m.group(3)),
            suffix=get_suffix(m.group(4))
        )

    # Second attempt: "<correspondent> - <title>.<suffix>"
    m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
    if m:
        return cls(
            title=m.group(2),
            correspondent=get_correspondent(m.group(1)),
            tags=(),
            suffix=get_suffix(m.group(3))
        )

    # That didn't work, so we assume correspondent and tags are None
    m = re.match(cls.REGEX_TITLE, path)
    if m is None:
        # Fail with a clear message instead of an AttributeError raised
        # by calling .group() on None.
        raise ValueError(
            "Cannot parse document file name: {!r}".format(path))

    # Use cls (not a hard-coded class name) so subclasses get instances of
    # their own type, as in the two branches above.
    return cls(title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
@classmethod
def from_document(cls, document):