mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-19 10:19:27 -05:00
Extract filename parsing into testable class
This commit is contained in:
parent
0b34894db9
commit
1f75af0137
@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
|
|||||||
|
|
||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
from .models import Correspondent, Tag, Document, Log
|
from .models import Correspondent, Tag, Document, Log, FileInfo
|
||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
@ -54,19 +54,6 @@ class Consumer(object):
|
|||||||
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
|
||||||
REGEX_TITLE = re.compile(
|
|
||||||
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
REGEX_CORRESPONDENT_TITLE = re.compile(
|
|
||||||
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
|
|
||||||
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
|
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
@ -105,7 +92,7 @@ class Consumer(object):
|
|||||||
if not os.path.isfile(doc):
|
if not os.path.isfile(doc):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not re.match(self.REGEX_TITLE, doc):
|
if not re.match(FileInfo.REGEX_TITLE, doc):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if doc in self._ignore:
|
if doc in self._ignore:
|
||||||
@ -270,56 +257,8 @@ class Consumer(object):
|
|||||||
return re.sub(r"\s+", " ", r)
|
return re.sub(r"\s+", " ", r)
|
||||||
|
|
||||||
def _guess_attributes_from_name(self, parseable):
|
def _guess_attributes_from_name(self, parseable):
|
||||||
"""
|
file_info = FileInfo.from_path(parseable)
|
||||||
We use a crude naming convention to make handling the correspondent,
|
return file_info.sender, file_info.title, file_info.tags, file_info.suffix
|
||||||
title, and tags easier:
|
|
||||||
"<correspondent> - <title> - <tags>.<suffix>"
|
|
||||||
"<correspondent> - <title>.<suffix>"
|
|
||||||
"<title>.<suffix>"
|
|
||||||
"""
|
|
||||||
|
|
||||||
def get_correspondent(correspondent_name):
|
|
||||||
return Correspondent.objects.get_or_create(
|
|
||||||
name=correspondent_name,
|
|
||||||
defaults={"slug": slugify(correspondent_name)}
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
def get_tags(tags):
|
|
||||||
r = []
|
|
||||||
for t in tags.split(","):
|
|
||||||
r.append(
|
|
||||||
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
|
|
||||||
return tuple(r)
|
|
||||||
|
|
||||||
def get_suffix(suffix):
|
|
||||||
suffix = suffix.lower()
|
|
||||||
if suffix == "jpeg":
|
|
||||||
return "jpg"
|
|
||||||
return suffix
|
|
||||||
|
|
||||||
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
|
|
||||||
m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
|
|
||||||
if m:
|
|
||||||
return (
|
|
||||||
get_correspondent(m.group(1)),
|
|
||||||
m.group(2),
|
|
||||||
get_tags(m.group(3)),
|
|
||||||
get_suffix(m.group(4))
|
|
||||||
)
|
|
||||||
|
|
||||||
# Second attempt: "<correspondent> - <title>.<suffix>"
|
|
||||||
m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
|
|
||||||
if m:
|
|
||||||
return (
|
|
||||||
get_correspondent(m.group(1)),
|
|
||||||
m.group(2),
|
|
||||||
(),
|
|
||||||
get_suffix(m.group(3))
|
|
||||||
)
|
|
||||||
|
|
||||||
# That didn't work, so we assume correspondent and tags are None
|
|
||||||
m = re.match(self.REGEX_TITLE, parseable)
|
|
||||||
return None, m.group(1), (), get_suffix(m.group(2))
|
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
|
@ -23,9 +23,72 @@ class FileInfo(object):
|
|||||||
self._file_mtime = file_mtime
|
self._file_mtime = file_mtime
|
||||||
self._path = path
|
self._path = path
|
||||||
|
|
||||||
|
REGEX_TITLE = re.compile(
|
||||||
|
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||||
|
flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
REGEX_CORRESPONDENT_TITLE = re.compile(
|
||||||
|
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||||
|
flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
|
||||||
|
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||||
|
flags=re.IGNORECASE
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_path(cls, path):
|
def from_path(cls, path):
|
||||||
pass
|
"""
|
||||||
|
We use a crude naming convention to make handling the correspondent,
|
||||||
|
title, and tags easier:
|
||||||
|
"<correspondent> - <title> - <tags>.<suffix>"
|
||||||
|
"<correspondent> - <title>.<suffix>"
|
||||||
|
"<title>.<suffix>"
|
||||||
|
"""
|
||||||
|
|
||||||
|
def get_correspondent(correspondent_name):
|
||||||
|
return Correspondent.objects.get_or_create(
|
||||||
|
name=correspondent_name,
|
||||||
|
defaults={"slug": slugify(correspondent_name)}
|
||||||
|
)[0]
|
||||||
|
|
||||||
|
def get_tags(tags):
|
||||||
|
r = []
|
||||||
|
for t in tags.split(","):
|
||||||
|
r.append(
|
||||||
|
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
|
||||||
|
return tuple(r)
|
||||||
|
|
||||||
|
def get_suffix(suffix):
|
||||||
|
suffix = suffix.lower()
|
||||||
|
if suffix == "jpeg":
|
||||||
|
return "jpg"
|
||||||
|
return suffix
|
||||||
|
|
||||||
|
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
|
||||||
|
m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path)
|
||||||
|
if m:
|
||||||
|
return cls(
|
||||||
|
title=m.group(2),
|
||||||
|
correspondent=get_correspondent(m.group(1)),
|
||||||
|
tags=get_tags(m.group(3)),
|
||||||
|
suffix=get_suffix(m.group(4))
|
||||||
|
)
|
||||||
|
|
||||||
|
# Second attempt: "<correspondent> - <title>.<suffix>"
|
||||||
|
m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path)
|
||||||
|
if m:
|
||||||
|
return cls(
|
||||||
|
title=m.group(2),
|
||||||
|
correspondent=get_correspondent(m.group(1)),
|
||||||
|
tags=(),
|
||||||
|
suffix=get_suffix(m.group(3))
|
||||||
|
)
|
||||||
|
|
||||||
|
# That didn't work, so we assume correspondent and tags are None
|
||||||
|
m = re.match(cls.REGEX_TITLE, path)
|
||||||
|
return FileInfo(
|
||||||
|
title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_document(cls, document):
|
def from_document(cls, document):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user