mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	first stab at text consumer
This commit is contained in:
		@@ -67,6 +67,7 @@ INSTALLED_APPS = [
 | 
			
		||||
    "documents.apps.DocumentsConfig",
 | 
			
		||||
    "reminders.apps.RemindersConfig",
 | 
			
		||||
    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
			
		||||
    "paperless_text.apps.PaperlessTextConfig",
 | 
			
		||||
 | 
			
		||||
    "flat_responsive",  # TODO: Remove as of Django 2.x
 | 
			
		||||
    "django.contrib.admin",
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,16 @@
 | 
			
		||||
from django.apps import AppConfig
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PaperlessTextConfig(AppConfig):
    """App configuration for the ``paperless_text`` application."""

    name = "paperless_text"

    def ready(self):
        """
        Connect ``ConsumerDeclaration.handle`` to the
        ``document_consumer_declaration`` signal, then run the base
        ``AppConfig.ready``.
        """
        # Both imports are deliberately local to ready(); presumably they
        # cannot be resolved before the app registry is populated.
        from documents.signals import (
            document_consumer_declaration as declaration_signal
        )
        from .signals import ConsumerDeclaration as declaration

        declaration_signal.connect(declaration.handle)

        super().ready()
 | 
			
		||||
							
								
								
									
										113
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										113
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,113 @@
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import subprocess
 | 
			
		||||
 | 
			
		||||
import dateparser
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt or .md)
    """

    CONVERT = settings.CONVERT_BINARY
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    # Upper bounds on how much of the document is handed to `convert` for
    # the thumbnail. The whole excerpt travels as a single command-line
    # argument, so it has to stay well below the operating system's
    # argument-size limit.
    THUMBNAIL_MAX_LINES = 50
    THUMBNAIL_MAX_LINE_LENGTH = 200

    # This regular expression will try to find dates in the document at
    # hand and will match the following formats:
    # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
    # - MONTH ZZZZ, with ZZZZ being 4 digits
    # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
    DATE_REGEX = re.compile(
        r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
        r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{4})\b')

    def __init__(self, path):
        super().__init__(path)
        # Cache for the document contents; filled on first get_text() call.
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Returns the path of the generated PNG inside ``self.tempdir``.
        Raises ParseError if the document cannot be read or if the convert
        binary exits with a non-zero status.
        """

        out_path = os.path.join(self.tempdir, "convert-txt.png")

        command = [
            self.CONVERT,
            "-size", "500x647",
            "xc:white",
            "-pointsize", "12",
            "-fill", "black",
        ]

        # The text is read here in Python. run_convert() starts the binary
        # without a shell, so a "$(cat file)" substitution would never be
        # expanded and would reach convert as a literal string.
        excerpt = self._thumbnail_text()
        if excerpt.strip():
            command += ["-draw", 'text 0,12 "{}"'.format(excerpt)]
        # An empty document simply yields a blank white page.

        command.append(out_path)
        run_convert(*command)

        return out_path

    def _thumbnail_text(self):
        """
        Return the leading part of the document, bounded in size and escaped
        for embedding in a double-quoted ImageMagick draw string.
        """
        lines = self.get_text().splitlines()[:self.THUMBNAIL_MAX_LINES]
        excerpt = "\n".join(
            line[:self.THUMBNAIL_MAX_LINE_LENGTH] for line in lines)

        # NUL bytes cannot be passed in a process argument.
        excerpt = excerpt.replace("\x00", "")

        # Backslash-escape backslashes and double quotes so the text cannot
        # terminate the quoted draw string early.
        # NOTE(review): relies on ImageMagick honouring backslash escapes
        # inside quoted draw strings -- confirm against the installed version.
        return excerpt.replace("\\", "\\\\").replace('"', '\\"')

    def get_text(self):
        """
        Return the contents of the document as a string.

        The file is read once and the result is cached on the instance.
        Raises ParseError if the file cannot be opened or decoded, which is
        the exception get_date() expects from this method.
        """

        if self._text is not None:
            return self._text

        try:
            with open(self.document_path, 'r') as f:
                self._text = f.read()
        except (OSError, UnicodeDecodeError) as e:
            raise ParseError(
                "Could not read {}: {}".format(self.document_path, e)
            ) from e

        return self._text

    def get_date(self):
        """
        Return the first date found in the document text, or None.

        Every DATE_REGEX match is handed to dateparser in order of
        appearance; the first one that parses wins. None is returned when
        the text cannot be read or when no match parses to a date.
        """

        try:
            text = self.get_text()
        except ParseError:
            return None

        date = None
        datestring = None

        # Iterate through all regex matches and try to parse the date
        for m in self.DATE_REGEX.finditer(text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                             " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
    """
    Run the given command line and wait for it to finish.

    ``args`` is the full argument vector, program first. The child gets a
    copy of the current environment, with MAGICK_MEMORY_LIMIT and
    MAGICK_TMPDIR added when the corresponding settings are truthy.

    Raises ParseError if the process exits with a non-zero status.
    """
    child_env = dict(os.environ)

    # Environment variable -> configured value; falsy values are skipped.
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            child_env[variable] = value

    process = subprocess.Popen(args, env=child_env)
    exit_status = process.wait()
    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))
 | 
			
		||||
							
								
								
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,23 @@
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .parsers import TextDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ConsumerDeclaration:
    """
    Offers TextDocumentParser as the parser for ``.txt`` and ``.md`` files.

    ``handle`` is the signal receiver; it returns ``test``, which is then
    called with a document path to decide whether this parser applies.
    """

    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    MATCHING_FILES = re.compile(r"^.*\.(txt|md)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Signal receiver: return the callable that tests a document."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for ``doc`` or None.

        ``doc`` is the document path as a string; it is lower-cased before
        matching, so the extension check is case-insensitive. On a match the
        result is a dict with the keys ``parser`` and ``weight``.
        """

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": TextDocumentParser,
                "weight": 10
            }

        return None
 | 
			
		||||
		Reference in New Issue
	
	Block a user