mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Extract filename parsing into testable class
This commit is contained in:
		| @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError | ||||
|  | ||||
| from paperless.db import GnuPG | ||||
|  | ||||
| from .models import Correspondent, Tag, Document, Log | ||||
| from .models import Correspondent, Tag, Document, Log, FileInfo | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| @@ -54,19 +54,6 @@ class Consumer(object): | ||||
|  | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     REGEX_CORRESPONDENT_TITLE = re.compile( | ||||
|         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( | ||||
|         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|  | ||||
|     def __init__(self): | ||||
|  | ||||
|         self.logger = logging.getLogger(__name__) | ||||
| @@ -105,7 +92,7 @@ class Consumer(object): | ||||
|             if not os.path.isfile(doc): | ||||
|                 continue | ||||
|  | ||||
|             if not re.match(self.REGEX_TITLE, doc): | ||||
|             if not re.match(FileInfo.REGEX_TITLE, doc): | ||||
|                 continue | ||||
|  | ||||
|             if doc in self._ignore: | ||||
| @@ -270,56 +257,8 @@ class Consumer(object): | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|  | ||||
|     def _guess_attributes_from_name(self, parseable): | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the correspondent, | ||||
|         title, and tags easier: | ||||
|           "<correspondent> - <title> - <tags>.<suffix>" | ||||
|           "<correspondent> - <title>.<suffix>" | ||||
|           "<title>.<suffix>" | ||||
|         """ | ||||
|  | ||||
|         def get_correspondent(correspondent_name): | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=correspondent_name, | ||||
|                 defaults={"slug": slugify(correspondent_name)} | ||||
|             )[0] | ||||
|  | ||||
|         def get_tags(tags): | ||||
|             r = [] | ||||
|             for t in tags.split(","): | ||||
|                 r.append( | ||||
|                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) | ||||
|             return tuple(r) | ||||
|  | ||||
|         def get_suffix(suffix): | ||||
|             suffix = suffix.lower() | ||||
|             if suffix == "jpeg": | ||||
|                 return "jpg" | ||||
|             return suffix | ||||
|  | ||||
|         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" | ||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) | ||||
|         if m: | ||||
|             return ( | ||||
|                 get_correspondent(m.group(1)), | ||||
|                 m.group(2), | ||||
|                 get_tags(m.group(3)), | ||||
|                 get_suffix(m.group(4)) | ||||
|             ) | ||||
|  | ||||
|         # Second attempt: "<correspondent> - <title>.<suffix>" | ||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) | ||||
|         if m: | ||||
|             return ( | ||||
|                 get_correspondent(m.group(1)), | ||||
|                 m.group(2), | ||||
|                 (), | ||||
|                 get_suffix(m.group(3)) | ||||
|             ) | ||||
|  | ||||
|         # That didn't work, so we assume correspondent and tags are None | ||||
|         m = re.match(self.REGEX_TITLE, parseable) | ||||
|         return None, m.group(1), (), get_suffix(m.group(2)) | ||||
|         file_info = FileInfo.from_path(parseable) | ||||
|         return file_info.sender, file_info.title, file_info.tags, file_info.suffix | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail): | ||||
|  | ||||
|   | ||||
| @@ -23,9 +23,72 @@ class FileInfo(object): | ||||
|         self._file_mtime = file_mtime | ||||
|         self._path = path | ||||
|  | ||||
|     # "<dirs>/<title>.<suffix>": group 1 is the title, group 2 the suffix. | ||||
|     # The leading "^.*/" means a path without any "/" does not match. | ||||
|     REGEX_TITLE = re.compile( | ||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     # "<dirs>/<correspondent> - <title>.<suffix>": groups 1-3 are the | ||||
|     # correspondent (at least one character), the title and the suffix. | ||||
|     REGEX_CORRESPONDENT_TITLE = re.compile( | ||||
|         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|     # "<dirs>/<correspondent> - <title> - <tags>.<suffix>": groups 1-4 are | ||||
|     # the correspondent, the title, the tags and the suffix. The tags group | ||||
|     # only accepts letters, digits, "-" and "," and may be empty. | ||||
|     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( | ||||
|         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||
|         flags=re.IGNORECASE | ||||
|     ) | ||||
|  | ||||
|     @classmethod | ||||
|     def from_path(cls, path): | ||||
|         pass | ||||
|         """ | ||||
|         We use a crude naming convention to make handling the correspondent, | ||||
|         title, and tags easier: | ||||
|           "<correspondent> - <title> - <tags>.<suffix>" | ||||
|           "<correspondent> - <title>.<suffix>" | ||||
|           "<title>.<suffix>" | ||||
|         """ | ||||
|  | ||||
|         def get_correspondent(correspondent_name): | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=correspondent_name, | ||||
|                 defaults={"slug": slugify(correspondent_name)} | ||||
|             )[0] | ||||
|  | ||||
|         def get_tags(tags): | ||||
|             r = [] | ||||
|             for t in tags.split(","): | ||||
|                 r.append( | ||||
|                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) | ||||
|             return tuple(r) | ||||
|  | ||||
|         def get_suffix(suffix): | ||||
|             suffix = suffix.lower() | ||||
|             if suffix == "jpeg": | ||||
|                 return "jpg" | ||||
|             return suffix | ||||
|  | ||||
|         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" | ||||
|         m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) | ||||
|         if m: | ||||
|             return cls( | ||||
|                 title=m.group(2), | ||||
|                 correspondent=get_correspondent(m.group(1)), | ||||
|                 tags=get_tags(m.group(3)), | ||||
|                 suffix=get_suffix(m.group(4)) | ||||
|             ) | ||||
|  | ||||
|         # Second attempt: "<correspondent> - <title>.<suffix>" | ||||
|         m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) | ||||
|         if m: | ||||
|             return cls( | ||||
|                 title=m.group(2), | ||||
|                 correspondent=get_correspondent(m.group(1)), | ||||
|                 tags=(), | ||||
|                 suffix=get_suffix(m.group(3)) | ||||
|             ) | ||||
|  | ||||
|         # That didn't work, so we assume correspondent and tags are None | ||||
|         m = re.match(cls.REGEX_TITLE, path) | ||||
|         return FileInfo( | ||||
|             title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) | ||||
|  | ||||
|     @classmethod | ||||
|     def from_document(cls, document): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Tikitu de Jager
					Tikitu de Jager