mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Added language detection
This commit is contained in:
		| @@ -16,10 +16,15 @@ from django.core.management.base import BaseCommand, CommandError | |||||||
| from django.template.defaultfilters import slugify | from django.template.defaultfilters import slugify | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
| from documents.models import Document, Sender | from ...languages import ISO639 | ||||||
|  | from ...models import Document, Sender | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OCRError(BaseException): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
| class Command(BaseCommand): | class Command(BaseCommand): | ||||||
|     """ |     """ | ||||||
|     Loop over every file found in CONSUMPTION_DIR and: |     Loop over every file found in CONSUMPTION_DIR and: | ||||||
| @@ -37,7 +42,7 @@ class Command(BaseCommand): | |||||||
|     CONSUME = settings.CONSUMPTION_DIR |     CONSUME = settings.CONSUMPTION_DIR | ||||||
|  |  | ||||||
|     OCR = pyocr.get_available_tools()[0] |     OCR = pyocr.get_available_tools()[0] | ||||||
|     OCR_LANG = settings.TESSERACT_LANGUAGE |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
|     MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") |     MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") | ||||||
|  |  | ||||||
|     PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") |     PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") | ||||||
| @@ -47,6 +52,7 @@ class Command(BaseCommand): | |||||||
|  |  | ||||||
|         self.verbosity = 0 |         self.verbosity = 0 | ||||||
|         self.stats = {} |         self.stats = {} | ||||||
|  |         self._ignore = [] | ||||||
|  |  | ||||||
|         BaseCommand.__init__(self, *args, **kwargs) |         BaseCommand.__init__(self, *args, **kwargs) | ||||||
|  |  | ||||||
| @@ -81,13 +87,22 @@ class Command(BaseCommand): | |||||||
|             if not re.match(self.PARSER_REGEX_TITLE, pdf): |             if not re.match(self.PARSER_REGEX_TITLE, pdf): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|  |             if pdf in self._ignore: | ||||||
|  |                 continue | ||||||
|  |  | ||||||
|             if self._is_ready(pdf): |             if self._is_ready(pdf): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             self._render("Consuming {}".format(pdf), 1) |             self._render("Consuming {}".format(pdf), 1) | ||||||
|  |  | ||||||
|             pngs = self._get_greyscale(pdf) |             pngs = self._get_greyscale(pdf) | ||||||
|  |  | ||||||
|  |             try: | ||||||
|                 text = self._get_ocr(pngs) |                 text = self._get_ocr(pngs) | ||||||
|  |             except OCRError: | ||||||
|  |                 self._ignore.append(pdf) | ||||||
|  |                 self._render("OCR FAILURE: {}".format(pdf), 0) | ||||||
|  |                 continue | ||||||
|  |  | ||||||
|             self._store(text, pdf) |             self._store(text, pdf) | ||||||
|             self._cleanup(pngs, pdf) |             self._cleanup(pngs, pdf) | ||||||
| @@ -131,23 +146,51 @@ class Command(BaseCommand): | |||||||
|  |  | ||||||
|     def _get_ocr(self, pngs): |     def _get_ocr(self, pngs): | ||||||
|  |  | ||||||
|         self._render("  OCRing the PDF", 2) |         self._render("  OCRing the PDF", 1) | ||||||
|  |  | ||||||
|         raw_text = self._ocr(pngs, self.OCR_LANG) |         raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) | ||||||
|  |  | ||||||
|         guessed_language = langdetect.detect(raw_text) |         guessed_language = langdetect.detect(raw_text) | ||||||
|         if guessed_language == self.OCR_LANG: |  | ||||||
|  |         self._render("    Language detected: {}".format(guessed_language), 2) | ||||||
|  |  | ||||||
|  |         if guessed_language not in ISO639: | ||||||
|  |             self._render("Language detection failed!", 0) | ||||||
|  |             if settings.FORGIVING_OCR: | ||||||
|  |                 self._render( | ||||||
|  |                     "As FORGIVING_OCR is enabled, we're going to make the best " | ||||||
|  |                     "with what we have.", | ||||||
|  |                     1 | ||||||
|  |                 ) | ||||||
|  |                 return raw_text | ||||||
|  |             raise OCRError | ||||||
|  |  | ||||||
|  |         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||||
|             return raw_text |             return raw_text | ||||||
|  |  | ||||||
|         return self._ocr(pngs, guessed_language) |         try: | ||||||
|  |             return self._ocr(pngs, ISO639[guessed_language]) | ||||||
|  |         except pyocr.pyocr.tesseract.TesseractError: | ||||||
|  |             if settings.FORGIVING_OCR: | ||||||
|  |                 self._render( | ||||||
|  |                     "OCR for {} failed, but we're going to stick with what " | ||||||
|  |                     "we've got since FORGIVING_OCR is enabled.".format( | ||||||
|  |                         guessed_language | ||||||
|  |                     ), | ||||||
|  |                     0 | ||||||
|  |                 ) | ||||||
|  |                 return raw_text | ||||||
|  |             raise OCRError | ||||||
|  |  | ||||||
|     def _ocr(self, pngs, lang): |     def _ocr(self, pngs, lang): | ||||||
|  |  | ||||||
|  |         self._render("    Parsing for {}".format(lang), 2) | ||||||
|  |  | ||||||
|         r = "" |         r = "" | ||||||
|         for png in pngs: |         for png in pngs: | ||||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: |             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||||
|                 self._render("    {}".format(f.filename), 3) |                 self._render("    {}".format(f.filename), 3) | ||||||
|                 r += self.OCR.image_to_string(f, lang=self.OCR_LANG) |                 r += self.OCR.image_to_string(f, lang=lang) | ||||||
|                 r += "\n\n\n\n\n\n\n\n" |                 r += "\n\n\n\n\n\n\n\n" | ||||||
|  |  | ||||||
|         return r |         return r | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn