mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Introducing language detection
This commit is contained in:
		@@ -1,6 +1,6 @@
 | 
				
			|||||||
import datetime
 | 
					import datetime
 | 
				
			||||||
import glob
 | 
					import glob
 | 
				
			||||||
import gnupg
 | 
					import langdetect
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
@@ -44,9 +44,10 @@ class Command(BaseCommand):
 | 
				
			|||||||
    PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
 | 
					    PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.verbosity = 0
 | 
					        self.verbosity = 0
 | 
				
			||||||
        self.stats = {}
 | 
					        self.stats = {}
 | 
				
			||||||
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
 | 
					
 | 
				
			||||||
        BaseCommand.__init__(self, *args, **kwargs)
 | 
					        BaseCommand.__init__(self, *args, **kwargs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def handle(self, *args, **options):
 | 
					    def handle(self, *args, **options):
 | 
				
			||||||
@@ -132,6 +133,16 @@ class Command(BaseCommand):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self._render("  OCRing the PDF", 2)
 | 
					        self._render("  OCRing the PDF", 2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        raw_text = self._ocr(pngs, self.OCR_LANG)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        guessed_language = langdetect.detect(raw_text)
 | 
				
			||||||
 | 
					        if guessed_language == self.OCR_LANG:
 | 
				
			||||||
 | 
					            return raw_text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return self._ocr(pngs, guessed_language)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _ocr(self, pngs, lang):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        r = ""
 | 
					        r = ""
 | 
				
			||||||
        for png in pngs:
 | 
					        for png in pngs:
 | 
				
			||||||
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
 | 
					            with Image.open(os.path.join(self.SCRATCH, png)) as f:
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user