mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction
This commit is contained in:
		| @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError | |||||||
|  |  | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
|  |  | ||||||
| from .models import Correspondent, Tag, Document, Log | from .models import Correspondent, Tag, Document, Log, FileInfo | ||||||
| from .languages import ISO639 | from .languages import ISO639 | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -54,19 +54,6 @@ class Consumer(object): | |||||||
|  |  | ||||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
|  |  | ||||||
|     REGEX_TITLE = re.compile( |  | ||||||
|         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", |  | ||||||
|         flags=re.IGNORECASE |  | ||||||
|     ) |  | ||||||
|     REGEX_CORRESPONDENT_TITLE = re.compile( |  | ||||||
|         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", |  | ||||||
|         flags=re.IGNORECASE |  | ||||||
|     ) |  | ||||||
|     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( |  | ||||||
|         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", |  | ||||||
|         flags=re.IGNORECASE |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|  |  | ||||||
|         self.logger = logging.getLogger(__name__) |         self.logger = logging.getLogger(__name__) | ||||||
| @@ -105,7 +92,7 @@ class Consumer(object): | |||||||
|             if not os.path.isfile(doc): |             if not os.path.isfile(doc): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if not re.match(self.REGEX_TITLE, doc): |             if not re.match(FileInfo.REGEX_TITLE, doc): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if doc in self._ignore: |             if doc in self._ignore: | ||||||
| @@ -269,72 +256,20 @@ class Consumer(object): | |||||||
|         # Strip out excess white space to allow matching to go smoother |         # Strip out excess white space to allow matching to go smoother | ||||||
|         return re.sub(r"\s+", " ", r) |         return re.sub(r"\s+", " ", r) | ||||||
|  |  | ||||||
|     def _guess_attributes_from_name(self, parseable): |  | ||||||
|         """ |  | ||||||
|         We use a crude naming convention to make handling the correspondent, |  | ||||||
|         title, and tags easier: |  | ||||||
|           "<correspondent> - <title> - <tags>.<suffix>" |  | ||||||
|           "<correspondent> - <title>.<suffix>" |  | ||||||
|           "<title>.<suffix>" |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         def get_correspondent(correspondent_name): |  | ||||||
|             return Correspondent.objects.get_or_create( |  | ||||||
|                 name=correspondent_name, |  | ||||||
|                 defaults={"slug": slugify(correspondent_name)} |  | ||||||
|             )[0] |  | ||||||
|  |  | ||||||
|         def get_tags(tags): |  | ||||||
|             r = [] |  | ||||||
|             for t in tags.split(","): |  | ||||||
|                 r.append( |  | ||||||
|                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) |  | ||||||
|             return tuple(r) |  | ||||||
|  |  | ||||||
|         def get_suffix(suffix): |  | ||||||
|             suffix = suffix.lower() |  | ||||||
|             if suffix == "jpeg": |  | ||||||
|                 return "jpg" |  | ||||||
|             return suffix |  | ||||||
|  |  | ||||||
|         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" |  | ||||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) |  | ||||||
|         if m: |  | ||||||
|             return ( |  | ||||||
|                 get_correspondent(m.group(1)), |  | ||||||
|                 m.group(2), |  | ||||||
|                 get_tags(m.group(3)), |  | ||||||
|                 get_suffix(m.group(4)) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         # Second attempt: "<correspondent> - <title>.<suffix>" |  | ||||||
|         m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) |  | ||||||
|         if m: |  | ||||||
|             return ( |  | ||||||
|                 get_correspondent(m.group(1)), |  | ||||||
|                 m.group(2), |  | ||||||
|                 (), |  | ||||||
|                 get_suffix(m.group(3)) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|         # That didn't work, so we assume correspondent and tags are None |  | ||||||
|         m = re.match(self.REGEX_TITLE, parseable) |  | ||||||
|         return None, m.group(1), (), get_suffix(m.group(2)) |  | ||||||
|  |  | ||||||
|     def _store(self, text, doc, thumbnail): |     def _store(self, text, doc, thumbnail): | ||||||
|  |  | ||||||
|         sender, title, tags, file_type = self._guess_attributes_from_name(doc) |         file_info = FileInfo.from_path(doc) | ||||||
|         relevant_tags = set(list(Tag.match_all(text)) + list(tags)) |         relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) | ||||||
|  |  | ||||||
|         stats = os.stat(doc) |         stats = os.stat(doc) | ||||||
|  |  | ||||||
|         self.log("debug", "Saving record to database") |         self.log("debug", "Saving record to database") | ||||||
|  |  | ||||||
|         document = Document.objects.create( |         document = Document.objects.create( | ||||||
|             correspondent=sender, |             correspondent=file_info.correspondent, | ||||||
|             title=title, |             title=file_info.title, | ||||||
|             content=text, |             content=text, | ||||||
|             file_type=file_type, |             file_type=file_info.suffix, | ||||||
|             created=timezone.make_aware( |             created=timezone.make_aware( | ||||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), |                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||||
|             modified=timezone.make_aware( |             modified=timezone.make_aware( | ||||||
|   | |||||||
| @@ -12,6 +12,97 @@ from django.utils import timezone | |||||||
| from .managers import LogManager | from .managers import LogManager | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class FileInfo(object): | ||||||
|  |     def __init__(self, title, suffix, | ||||||
|  |                  correspondent=None, tags=None): | ||||||
|  |         self._title = title | ||||||
|  |         self._suffix = suffix | ||||||
|  |         self._correspondent = correspondent | ||||||
|  |         self._tags = tags | ||||||
|  |  | ||||||
|  |     REGEX_TITLE = re.compile( | ||||||
|  |         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|  |         flags=re.IGNORECASE | ||||||
|  |     ) | ||||||
|  |     REGEX_CORRESPONDENT_TITLE = re.compile( | ||||||
|  |         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|  |         flags=re.IGNORECASE | ||||||
|  |     ) | ||||||
|  |     REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( | ||||||
|  |         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", | ||||||
|  |         flags=re.IGNORECASE | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     @classmethod | ||||||
|  |     def from_path(cls, path): | ||||||
|  |         """ | ||||||
|  |         We use a crude naming convention to make handling the correspondent, | ||||||
|  |         title, and tags easier: | ||||||
|  |           "<correspondent> - <title> - <tags>.<suffix>" | ||||||
|  |           "<correspondent> - <title>.<suffix>" | ||||||
|  |           "<title>.<suffix>" | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         def get_correspondent(correspondent_name): | ||||||
|  |             return Correspondent.objects.get_or_create( | ||||||
|  |                 name=correspondent_name, | ||||||
|  |                 defaults={"slug": slugify(correspondent_name)} | ||||||
|  |             )[0] | ||||||
|  |  | ||||||
|  |         def get_tags(tags): | ||||||
|  |             r = [] | ||||||
|  |             for t in tags.split(","): | ||||||
|  |                 r.append( | ||||||
|  |                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) | ||||||
|  |             return tuple(r) | ||||||
|  |  | ||||||
|  |         def get_suffix(suffix): | ||||||
|  |             suffix = suffix.lower() | ||||||
|  |             if suffix == "jpeg": | ||||||
|  |                 return "jpg" | ||||||
|  |             return suffix | ||||||
|  |  | ||||||
|  |         # First attempt: "<correspondent> - <title> - <tags>.<suffix>" | ||||||
|  |         m = re.match(cls.REGEX_CORRESPONDENT_TITLE_TAGS, path) | ||||||
|  |         if m: | ||||||
|  |             return cls( | ||||||
|  |                 title=m.group(2), | ||||||
|  |                 correspondent=get_correspondent(m.group(1)), | ||||||
|  |                 tags=get_tags(m.group(3)), | ||||||
|  |                 suffix=get_suffix(m.group(4)) | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         # Second attempt: "<correspondent> - <title>.<suffix>" | ||||||
|  |         m = re.match(cls.REGEX_CORRESPONDENT_TITLE, path) | ||||||
|  |         if m: | ||||||
|  |             return cls( | ||||||
|  |                 title=m.group(2), | ||||||
|  |                 correspondent=get_correspondent(m.group(1)), | ||||||
|  |                 tags=(), | ||||||
|  |                 suffix=get_suffix(m.group(3)) | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         # That didn't work, so we assume correspondent and tags are None | ||||||
|  |         m = re.match(cls.REGEX_TITLE, path) | ||||||
|  |         return FileInfo( | ||||||
|  |             title=m.group(1), tags=(), suffix=get_suffix(m.group(2))) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def title(self): | ||||||
|  |         return self._title | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def correspondent(self): | ||||||
|  |         return self._correspondent | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def tags(self): | ||||||
|  |         return self._tags | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def suffix(self): | ||||||
|  |         return self._suffix | ||||||
|  |  | ||||||
| class SluggedModel(models.Model): | class SluggedModel(models.Model): | ||||||
|  |  | ||||||
|     name = models.CharField(max_length=128, unique=True) |     name = models.CharField(max_length=128, unique=True) | ||||||
|   | |||||||
| @@ -1,12 +1,11 @@ | |||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
|  |  | ||||||
| from ..consumer import Consumer | from ..models import FileInfo | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestAttachment(TestCase): | class TestAttachment(TestCase): | ||||||
|  |  | ||||||
|     TAGS = ("tag1", "tag2", "tag3") |     TAGS = ("tag1", "tag2", "tag3") | ||||||
|     CONSUMER = Consumer() |  | ||||||
|     SUFFIXES = ( |     SUFFIXES = ( | ||||||
|         "pdf", "png", "jpg", "jpeg", "gif", |         "pdf", "png", "jpg", "jpeg", "gif", | ||||||
|         "PDF", "PNG", "JPG", "JPEG", "GIF", |         "PDF", "PNG", "JPG", "JPEG", "GIF", | ||||||
| @@ -16,14 +15,14 @@ class TestAttachment(TestCase): | |||||||
|     def _test_guess_attributes_from_name(self, path, sender, title, tags): |     def _test_guess_attributes_from_name(self, path, sender, title, tags): | ||||||
|         for suffix in self.SUFFIXES: |         for suffix in self.SUFFIXES: | ||||||
|             f = path.format(suffix) |             f = path.format(suffix) | ||||||
|             results = self.CONSUMER._guess_attributes_from_name(f) |             file_info = FileInfo.from_path(f) | ||||||
|             self.assertEqual(results[0].name, sender, f) |             self.assertEqual(file_info.correspondent.name, sender, f) | ||||||
|             self.assertEqual(results[1], title, f) |             self.assertEqual(file_info.title, title, f) | ||||||
|             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) |             self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f) | ||||||
|             if suffix.lower() == "jpeg": |             if suffix.lower() == "jpeg": | ||||||
|                 self.assertEqual(results[3], "jpg", f) |                 self.assertEqual(file_info.suffix, "jpg", f) | ||||||
|             else: |             else: | ||||||
|                 self.assertEqual(results[3], suffix.lower(), f) |                 self.assertEqual(file_info.suffix, suffix.lower(), f) | ||||||
|  |  | ||||||
|     def test_guess_attributes_from_name0(self): |     def test_guess_attributes_from_name0(self): | ||||||
|         self._test_guess_attributes_from_name( |         self._test_guess_attributes_from_name( | ||||||
| @@ -92,3 +91,95 @@ class TestAttachment(TestCase): | |||||||
|             "Τιτλε", |             "Τιτλε", | ||||||
|             self.TAGS |             self.TAGS | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     def test_guess_attributes_from_name_when_correspondent_empty(self): | ||||||
|  |         self._test_guess_attributes_from_name( | ||||||
|  |             '/path/to/ - weird empty correspondent but should not break.{}', | ||||||
|  |             None, | ||||||
|  |             ' - weird empty correspondent but should not break', | ||||||
|  |             () | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_guess_attributes_from_name_when_title_starts_with_dash(self): | ||||||
|  |         self._test_guess_attributes_from_name( | ||||||
|  |             '/path/to/- weird but should not break.{}', | ||||||
|  |             None, | ||||||
|  |             '- weird but should not break', | ||||||
|  |             () | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_guess_attributes_from_name_when_title_ends_with_dash(self): | ||||||
|  |         self._test_guess_attributes_from_name( | ||||||
|  |             '/path/to/weird but should not break -.{}', | ||||||
|  |             None, | ||||||
|  |             'weird but should not break -', | ||||||
|  |             () | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_guess_attributes_from_name_when_title_is_empty(self): | ||||||
|  |         self._test_guess_attributes_from_name( | ||||||
|  |             '/path/to/weird correspondent but should not break - .{}', | ||||||
|  |             'weird correspondent but should not break', | ||||||
|  |             '', | ||||||
|  |             () | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Permutations(TestCase): | ||||||
|  |     valid_correspondents = ['timmy', 'Dr. McWheelie', | ||||||
|  |                             'Dash Gor-don', 'ο Θερμαστής'] | ||||||
|  |     valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', ''] | ||||||
|  |     valid_tags = ['tag', 'tig,tag', '-', '0,1,2', ''] | ||||||
|  |     valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif'] | ||||||
|  |  | ||||||
|  |     def _test_guessed_attributes( | ||||||
|  |             self, filename, title, suffix, correspondent=None, tags=None): | ||||||
|  |         file_info = FileInfo.from_path(filename) | ||||||
|  |  | ||||||
|  |         # Required | ||||||
|  |         self.assertEqual(file_info.title, title, filename) | ||||||
|  |         if suffix == 'jpeg': | ||||||
|  |             suffix = 'jpg' | ||||||
|  |         self.assertEqual(file_info.suffix, suffix, filename) | ||||||
|  |         # Optional | ||||||
|  |         if correspondent is None: | ||||||
|  |             self.assertEqual(file_info.correspondent, | ||||||
|  |                              correspondent, filename) | ||||||
|  |         else: | ||||||
|  |             self.assertEqual(file_info.correspondent.name, | ||||||
|  |                              correspondent, filename) | ||||||
|  |         if tags is None: | ||||||
|  |             self.assertEqual(file_info.tags, (), filename) | ||||||
|  |         else: | ||||||
|  |             self.assertEqual([t.slug for t in file_info.tags], | ||||||
|  |                              tags.split(','), | ||||||
|  |                              filename) | ||||||
|  |  | ||||||
|  |     def test_just_title(self): | ||||||
|  |         template = '/path/to/{title}.{suffix}' | ||||||
|  |         for title in self.valid_titles: | ||||||
|  |             for suffix in self.valid_suffixes: | ||||||
|  |                 spec = dict(title=title, suffix=suffix) | ||||||
|  |                 filename = template.format(**spec) | ||||||
|  |                 self._test_guessed_attributes(filename, **spec) | ||||||
|  |  | ||||||
|  |     def test_title_and_correspondent(self): | ||||||
|  |         template = '/path/to/{correspondent} - {title}.{suffix}' | ||||||
|  |         for correspondent in self.valid_correspondents: | ||||||
|  |             for title in self.valid_titles: | ||||||
|  |                 for suffix in self.valid_suffixes: | ||||||
|  |                     spec = dict(correspondent=correspondent, title=title, | ||||||
|  |                                 suffix=suffix) | ||||||
|  |                     filename = template.format(**spec) | ||||||
|  |                     self._test_guessed_attributes(filename, **spec) | ||||||
|  |  | ||||||
|  |     def test_title_and_correspondent_and_tags(self): | ||||||
|  |         template = '/path/to/{correspondent} - {title} - {tags}.{suffix}' | ||||||
|  |         for correspondent in self.valid_correspondents: | ||||||
|  |             for title in self.valid_titles: | ||||||
|  |                 for tags in self.valid_tags: | ||||||
|  |                     for suffix in self.valid_suffixes: | ||||||
|  |                         spec = dict(correspondent=correspondent, title=title, | ||||||
|  |                                     tags=tags, suffix=suffix) | ||||||
|  |                         filename = template.format(**spec) | ||||||
|  |                         self._test_guessed_attributes(filename, **spec) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn