mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge branch 'refactor-file-info-extraction' of https://github.com/tikitu/paperless into tikitu-refactor-file-info-extraction
This commit is contained in:
		@@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
 | 
			
		||||
 | 
			
		||||
from paperless.db import GnuPG
 | 
			
		||||
 | 
			
		||||
from .models import Correspondent, Tag, Document, Log
 | 
			
		||||
from .models import Correspondent, Tag, Document, Log, FileInfo
 | 
			
		||||
from .languages import ISO639
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -54,19 +54,6 @@ class Consumer(object):
 | 
			
		||||
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
 | 
			
		||||
    REGEX_TITLE = re.compile(
 | 
			
		||||
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
 | 
			
		||||
        flags=re.IGNORECASE
 | 
			
		||||
    )
 | 
			
		||||
    REGEX_CORRESPONDENT_TITLE = re.compile(
 | 
			
		||||
        r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
 | 
			
		||||
        flags=re.IGNORECASE
 | 
			
		||||
    )
 | 
			
		||||
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
 | 
			
		||||
        r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
 | 
			
		||||
        flags=re.IGNORECASE
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
 | 
			
		||||
        self.logger = logging.getLogger(__name__)
 | 
			
		||||
@@ -105,7 +92,7 @@ class Consumer(object):
 | 
			
		||||
            if not os.path.isfile(doc):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if not re.match(self.REGEX_TITLE, doc):
 | 
			
		||||
            if not re.match(FileInfo.REGEX_TITLE, doc):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if doc in self._ignore:
 | 
			
		||||
@@ -269,72 +256,20 @@ class Consumer(object):
 | 
			
		||||
        # Strip out excess white space to allow matching to go smoother
 | 
			
		||||
        return re.sub(r"\s+", " ", r)
 | 
			
		||||
 | 
			
		||||
    def _guess_attributes_from_name(self, parseable):
        """
        Derive document attributes from a file path.

        The path is tried against three naming conventions, from the most
        to the least specific:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        Returns a 4-tuple ``(correspondent, title, tags, suffix)``.  The
        correspondent is None and the tags are ``()`` whenever the matching
        convention does not carry them.  Correspondent and Tag rows are
        fetched, or created when missing, through ``get_or_create``.
        """

        def normalise_suffix(raw):
            # Suffixes are lower-cased and "jpeg" is folded into "jpg".
            lowered = raw.lower()
            return "jpg" if lowered == "jpeg" else lowered

        def lookup_correspondent(name):
            correspondent, _ = Correspondent.objects.get_or_create(
                name=name,
                defaults={"slug": slugify(name)}
            )
            return correspondent

        def lookup_tags(joined):
            # One Tag per comma-separated slug, in file-name order.
            return tuple(
                Tag.objects.get_or_create(slug=slug, defaults={"name": slug})[0]
                for slug in joined.split(",")
            )

        # Most specific convention: correspondent, title and tags.
        found = self.REGEX_CORRESPONDENT_TITLE_TAGS.match(parseable)
        if found:
            name, title, joined, extension = found.groups()
            return (
                lookup_correspondent(name),
                title,
                lookup_tags(joined),
                normalise_suffix(extension)
            )

        # Next: correspondent and title only.
        found = self.REGEX_CORRESPONDENT_TITLE.match(parseable)
        if found:
            name, title, extension = found.groups()
            return (
                lookup_correspondent(name),
                title,
                (),
                normalise_suffix(extension)
            )

        # Fallback: a bare title, with neither correspondent nor tags.
        found = self.REGEX_TITLE.match(parseable)
        return None, found.group(1), (), normalise_suffix(found.group(2))
 | 
			
		||||
 | 
			
		||||
    def _store(self, text, doc, thumbnail):
 | 
			
		||||
 | 
			
		||||
        sender, title, tags, file_type = self._guess_attributes_from_name(doc)
 | 
			
		||||
        relevant_tags = set(list(Tag.match_all(text)) + list(tags))
 | 
			
		||||
        file_info = FileInfo.from_path(doc)
 | 
			
		||||
        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
 | 
			
		||||
 | 
			
		||||
        stats = os.stat(doc)
 | 
			
		||||
 | 
			
		||||
        self.log("debug", "Saving record to database")
 | 
			
		||||
 | 
			
		||||
        document = Document.objects.create(
 | 
			
		||||
            correspondent=sender,
 | 
			
		||||
            title=title,
 | 
			
		||||
            correspondent=file_info.correspondent,
 | 
			
		||||
            title=file_info.title,
 | 
			
		||||
            content=text,
 | 
			
		||||
            file_type=file_type,
 | 
			
		||||
            file_type=file_info.suffix,
 | 
			
		||||
            created=timezone.make_aware(
 | 
			
		||||
                datetime.datetime.fromtimestamp(stats.st_mtime)),
 | 
			
		||||
            modified=timezone.make_aware(
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,97 @@ from django.utils import timezone
 | 
			
		||||
from .managers import LogManager
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FileInfo(object):
    """
    Attributes of a document guessed from its file name.

    Instances expose four read-only properties: ``title``, ``suffix``,
    ``correspondent`` and ``tags``.  Use ``FileInfo.from_path()`` to build
    one from a path.
    """

    def __init__(self, title, suffix,
                 correspondent=None, tags=None):
        self._title = title
        self._suffix = suffix
        self._correspondent = correspondent
        self._tags = tags

    # All three patterns expect a path containing at least one "/"; the
    # greedy "^.*/" prefix consumes the directory part.
    REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    # The correspondent and title groups exclude "/".  With a plain "." the
    # engine could backtrack the "^.*/" prefix and pull directory names into
    # the correspondent, e.g. "/path/to/ - x.pdf" gave correspondent "to/".
    REGEX_CORRESPONDENT_TITLE = re.compile(
        r"^.*/([^/]+) - ([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
        r"^.*/([^/]*) - ([^/]*) - ([a-z0-9\-,]*)"
        r"\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

    @classmethod
    def from_path(cls, path):
        """
        Build a FileInfo from ``path``.

        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        The conventions are tried in that order and the first match wins.
        Only the final path component is interpreted; directory names never
        contribute to the correspondent or the title.

        Correspondent and Tag rows named in the file name are fetched, or
        created when missing, through ``get_or_create``.  The suffix is
        lower-cased and "jpeg" is normalised to "jpg".

        Raises AttributeError if ``path`` does not match even REGEX_TITLE
        (the match object is then None).
        """

        def get_correspondent(correspondent_name):
            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]

        def get_tags(tags):
            # Note: an empty tags segment splits to [""], i.e. one tag whose
            # slug is the empty string.
            return tuple(
                Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]
                for t in tags.split(",")
            )

        def get_suffix(suffix):
            suffix = suffix.lower()
            if suffix == "jpeg":
                return "jpg"
            return suffix

        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
        m = cls.REGEX_CORRESPONDENT_TITLE_TAGS.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=get_tags(m.group(3)),
                suffix=get_suffix(m.group(4))
            )

        # Second attempt: "<correspondent> - <title>.<suffix>"
        m = cls.REGEX_CORRESPONDENT_TITLE.match(path)
        if m:
            return cls(
                title=m.group(2),
                correspondent=get_correspondent(m.group(1)),
                tags=(),
                suffix=get_suffix(m.group(3))
            )

        # That didn't work, so we assume correspondent and tags are None.
        # Built through cls, like the branches above, so subclasses get
        # instances of their own type.
        m = cls.REGEX_TITLE.match(path)
        return cls(
            title=m.group(1), tags=(), suffix=get_suffix(m.group(2)))

    @property
    def title(self):
        return self._title

    @property
    def correspondent(self):
        return self._correspondent

    @property
    def tags(self):
        return self._tags

    @property
    def suffix(self):
        return self._suffix
 | 
			
		||||
 | 
			
		||||
class SluggedModel(models.Model):
 | 
			
		||||
 | 
			
		||||
    name = models.CharField(max_length=128, unique=True)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,12 +1,11 @@
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
 | 
			
		||||
from ..consumer import Consumer
 | 
			
		||||
from ..models import FileInfo
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestAttachment(TestCase):
 | 
			
		||||
 | 
			
		||||
    TAGS = ("tag1", "tag2", "tag3")
 | 
			
		||||
    CONSUMER = Consumer()
 | 
			
		||||
    SUFFIXES = (
 | 
			
		||||
        "pdf", "png", "jpg", "jpeg", "gif",
 | 
			
		||||
        "PDF", "PNG", "JPG", "JPEG", "GIF",
 | 
			
		||||
@@ -16,14 +15,14 @@ class TestAttachment(TestCase):
 | 
			
		||||
    def _test_guess_attributes_from_name(self, path, sender, title, tags):
 | 
			
		||||
        for suffix in self.SUFFIXES:
 | 
			
		||||
            f = path.format(suffix)
 | 
			
		||||
            results = self.CONSUMER._guess_attributes_from_name(f)
 | 
			
		||||
            self.assertEqual(results[0].name, sender, f)
 | 
			
		||||
            self.assertEqual(results[1], title, f)
 | 
			
		||||
            self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
 | 
			
		||||
            file_info = FileInfo.from_path(f)
 | 
			
		||||
            self.assertEqual(file_info.correspondent.name, sender, f)
 | 
			
		||||
            self.assertEqual(file_info.title, title, f)
 | 
			
		||||
            self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
 | 
			
		||||
            if suffix.lower() == "jpeg":
 | 
			
		||||
                self.assertEqual(results[3], "jpg", f)
 | 
			
		||||
                self.assertEqual(file_info.suffix, "jpg", f)
 | 
			
		||||
            else:
 | 
			
		||||
                self.assertEqual(results[3], suffix.lower(), f)
 | 
			
		||||
                self.assertEqual(file_info.suffix, suffix.lower(), f)
 | 
			
		||||
 | 
			
		||||
    def test_guess_attributes_from_name0(self):
 | 
			
		||||
        self._test_guess_attributes_from_name(
 | 
			
		||||
@@ -92,3 +91,95 @@ class TestAttachment(TestCase):
 | 
			
		||||
            "Τιτλε",
 | 
			
		||||
            self.TAGS
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def test_guess_attributes_from_name_when_correspondent_empty(self):
        # Nothing precedes the " - " separator: no correspondent is expected
        # and the whole stem should come back as the title.
        name = '/path/to/ - weird empty correspondent but should not break.{}'
        expected_title = ' - weird empty correspondent but should not break'
        self._test_guess_attributes_from_name(name, None, expected_title, ())
 | 
			
		||||
 | 
			
		||||
    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        # A leading "- " lacks the space that opens the " - " separator, so
        # no correspondent is expected and the stem is the title.
        name = '/path/to/- weird but should not break.{}'
        expected_title = '- weird but should not break'
        self._test_guess_attributes_from_name(name, None, expected_title, ())
 | 
			
		||||
 | 
			
		||||
    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        # A trailing " -" lacks the space that closes the " - " separator,
        # so no correspondent is expected and the stem is the title.
        name = '/path/to/weird but should not break -.{}'
        expected_title = 'weird but should not break -'
        self._test_guess_attributes_from_name(name, None, expected_title, ())
 | 
			
		||||
 | 
			
		||||
    def test_guess_attributes_from_name_when_title_is_empty(self):
        # The separator is present but nothing follows it: the correspondent
        # is expected to be picked up together with an empty title.
        name = '/path/to/weird correspondent but should not break - .{}'
        expected_sender = 'weird correspondent but should not break'
        self._test_guess_attributes_from_name(name, expected_sender, '', ())
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Permutations(TestCase):
    """
    Combine sample correspondents, titles, tags and suffixes into file
    names and check what FileInfo.from_path() extracts from each of them.
    """

    valid_correspondents = ['timmy', 'Dr. McWheelie',
                            'Dash Gor-don', 'ο Θερμαστής']
    valid_titles = ['title', 'Title w Spaces', 'Title a-dash', 'Τίτλος', '']
    valid_tags = ['tag', 'tig,tag', '-', '0,1,2', '']
    valid_suffixes = ['pdf', 'png', 'jpg', 'jpeg', 'gif']

    def _test_guessed_attributes(
            self, filename, title, suffix, correspondent=None, tags=None):
        info = FileInfo.from_path(filename)

        # Title and suffix are always present; "jpeg" is reported as "jpg".
        expected_suffix = 'jpg' if suffix == 'jpeg' else suffix
        self.assertEqual(info.title, title, filename)
        self.assertEqual(info.suffix, expected_suffix, filename)

        # The correspondent is optional.
        if correspondent is not None:
            self.assertEqual(info.correspondent.name,
                             correspondent, filename)
        else:
            self.assertEqual(info.correspondent, correspondent, filename)

        # Tags are optional too; when given they are compared by slug.
        if tags is not None:
            slugs = [t.slug for t in info.tags]
            self.assertEqual(slugs, tags.split(','), filename)
        else:
            self.assertEqual(info.tags, (), filename)

    def _check(self, template, **spec):
        # Render the file name from the spec, then verify it parses back
        # into that same spec.
        self._test_guessed_attributes(template.format(**spec), **spec)

    def test_just_title(self):
        for title in self.valid_titles:
            for suffix in self.valid_suffixes:
                self._check(
                    '/path/to/{title}.{suffix}',
                    title=title, suffix=suffix)

    def test_title_and_correspondent(self):
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for suffix in self.valid_suffixes:
                    self._check(
                        '/path/to/{correspondent} - {title}.{suffix}',
                        correspondent=correspondent, title=title,
                        suffix=suffix)

    def test_title_and_correspondent_and_tags(self):
        template = '/path/to/{correspondent} - {title} - {tags}.{suffix}'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for suffix in self.valid_suffixes:
                        self._check(
                            template,
                            correspondent=correspondent, title=title,
                            tags=tags, suffix=suffix)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user