Extract filename parsing into testable class

This commit is contained in:
Tikitu de Jager 2016-03-07 21:05:04 +02:00
parent 0b34894db9
commit 1f75af0137
2 changed files with 68 additions and 66 deletions

View File

@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
from paperless.db import GnuPG from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log from .models import Correspondent, Tag, Document, Log, FileInfo
from .languages import ISO639 from .languages import ISO639
@ -54,19 +54,6 @@ class Consumer(object):
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
def __init__(self): def __init__(self):
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
@ -105,7 +92,7 @@ class Consumer(object):
if not os.path.isfile(doc): if not os.path.isfile(doc):
continue continue
if not re.match(self.REGEX_TITLE, doc): if not re.match(FileInfo.REGEX_TITLE, doc):
continue continue
if doc in self._ignore: if doc in self._ignore:
@ -270,56 +257,8 @@ class Consumer(object):
return re.sub(r"\s+", " ", r) return re.sub(r"\s+", " ", r)
def _guess_attributes_from_name(self, parseable):
    """
    Derive the document attributes encoded in a file path.

    The parsing itself is delegated to FileInfo.from_path; this method
    only unpacks the result into the 4-tuple
    (sender, title, tags, suffix).
    """
    # NOTE(review): FileInfo.from_path builds its instance with a
    # `correspondent` keyword, while this method reads `sender` --
    # confirm that FileInfo really exposes a `sender` attribute.
    info = FileInfo.from_path(parseable)
    return (info.sender, info.title, info.tags, info.suffix)
def _store(self, text, doc, thumbnail): def _store(self, text, doc, thumbnail):

View File

@ -23,9 +23,72 @@ class FileInfo(object):
self._file_mtime = file_mtime self._file_mtime = file_mtime
self._path = path self._path = path
# File-name patterns used by from_path, listed from least to most specific.
# All of them are case-insensitive, require at least one "/" before the
# file name, and only accept the suffixes pdf, jpg/jpeg, png, gif and tiff.

# "<title>.<suffix>" -> groups: (title, suffix)
REGEX_TITLE = re.compile(
    r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title>.<suffix>"
# -> groups: (correspondent, title, suffix)
REGEX_CORRESPONDENT_TITLE = re.compile(
    r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", re.IGNORECASE)

# "<correspondent> - <title> - <tags>.<suffix>"
# -> groups: (correspondent, title, tags, suffix); tags is a comma-separated
# run of letters, digits and dashes and may be empty.
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
    r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
    re.IGNORECASE)
@classmethod
def from_path(cls, path):
    """
    Build an instance by parsing the file name at the end of ``path``.

    We use a crude naming convention to make handling the correspondent,
    title, and tags easier:

      "<correspondent> - <title> - <tags>.<suffix>"
      "<correspondent> - <title>.<suffix>"
      "<title>.<suffix>"

    The patterns are tried in that order, most specific first.  The
    correspondent and every tag named in the file are looked up with
    ``get_or_create``, so parsing a path may create database rows.

    ``path`` must contain a "/" before the file name and end in one of
    the suffixes accepted by ``REGEX_TITLE``.

    Returns a new instance of ``cls``.  When only a title is found the
    correspondent is left at the constructor's default and ``tags`` is an
    empty tuple.

    Raises ValueError if ``path`` matches none of the patterns.
    """

    def get_correspondent(correspondent_name):
        # Look up by name; the slug is only supplied when creating.
        return Correspondent.objects.get_or_create(
            name=correspondent_name,
            defaults={"slug": slugify(correspondent_name)}
        )[0]

    def get_tags(tags):
        # The tags group may be empty, and "".split(",") yields [""], so
        # skip empty names instead of creating a Tag with an empty slug.
        return tuple(
            Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
            for t in tags.split(",")
            if t
        )

    def get_suffix(suffix):
        # Normalise case and fold "jpeg" into "jpg".
        suffix = suffix.lower()
        if suffix == "jpeg":
            return "jpg"
        return suffix

    # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
    m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
    if m:
        return cls(
            title=m.group(2),
            correspondent=get_correspondent(m.group(1)),
            tags=get_tags(m.group(3)),
            suffix=get_suffix(m.group(4))
        )

    # Second attempt: "<correspondent> - <title>.<suffix>"
    m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
    if m:
        return cls(
            title=m.group(2),
            correspondent=get_correspondent(m.group(1)),
            tags=(),
            suffix=get_suffix(m.group(3))
        )

    # That didn't work, so we assume correspondent and tags are None
    m = re.match(cls.REGEX_TITLE, path)
    if m is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "Cannot parse document attributes from path: {!r}".format(path))

    # Use cls (not a hard-coded FileInfo) so subclasses get their own type,
    # consistent with the two branches above.
    return cls(title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
@classmethod @classmethod
def from_document(cls, document): def from_document(cls, document):