mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-01 04:06:16 -05:00 
			
		
		
		
	feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it
all into its own app, `paperless_tesseract`.  This new app should serve
as a sample of how to create one's own consumer for different file
types.
Documentation for how to do this isn't ready yet, but for the impatient:
* Create a new app
    * containing a `parsers.py` for your parser modelled after
      `paperless_tesseract.parsers.RasterisedDocumentParser`
    * containing a `signals.py` with a handler modelled after
      `paperless_tesseract.signals.ConsumerDeclaration`
    * connect the signal handler to
      `documents.signals.document_consumer_declaration` in
      `your_app.apps`
* Install the app into Paperless by declaring
  `PAPERLESS_INSTALLED_APPS=your_app`.  Additional apps should be
  separated with commas.
* Restart the consumer
			
			
This commit is contained in:
		| @@ -1,35 +1,21 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| import shutil | ||||
| import hashlib | ||||
| import logging | ||||
| import datetime | ||||
| import tempfile | ||||
| import itertools | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import pyocr | ||||
| import langdetect | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| from pyocr.tesseract import TesseractError | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from .models import Tag, Document, FileInfo | ||||
| from .models import Document, FileInfo, Tag | ||||
| from .parsers import ParseError | ||||
| from .signals import ( | ||||
|     document_consumption_started, | ||||
|     document_consumption_finished | ||||
|     document_consumer_declaration, | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| ) | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
class OCRError(Exception):
    """Raised when OCR of a document fails and no text can be recovered."""
|  | ||||
|  | ||||
| class ConsumerError(Exception): | ||||
| @@ -47,13 +33,7 @@ class Consumer(object): | ||||
|     """ | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||
|  | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def __init__(self): | ||||
|  | ||||
| @@ -78,6 +58,16 @@ class Consumer(object): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||
|  | ||||
|         self.parsers = [] | ||||
|         for response in document_consumer_declaration.send(self): | ||||
|             self.parsers.append(response[1]) | ||||
|  | ||||
|         if not self.parsers: | ||||
|             raise ConsumerError( | ||||
|                 "No parsers could be found, not even the default.  " | ||||
|                 "This is a problem." | ||||
|             ) | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
| @@ -109,6 +99,13 @@ class Consumer(object): | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             parser_class = self._get_parser_class(doc) | ||||
|             if not parser_class: | ||||
|                 self.log( | ||||
|                     "info", "No parsers could be found for {}".format(doc)) | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             self.logging_group = uuid.uuid4() | ||||
|  | ||||
|             self.log("info", "Consuming {}".format(doc)) | ||||
| @@ -119,25 +116,26 @@ class Consumer(object): | ||||
|                 logging_group=self.logging_group | ||||
|             ) | ||||
|  | ||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||
|             imgs = self._get_greyscale(tempdir, doc) | ||||
|             thumbnail = self._get_thumbnail(tempdir, doc) | ||||
|             parsed_document = parser_class(doc) | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|  | ||||
|             try: | ||||
|  | ||||
|                 document = self._store(self._get_ocr(imgs), doc, thumbnail) | ||||
|  | ||||
|             except OCRError as e: | ||||
|                 document = self._store( | ||||
|                     parsed_document.get_text(), | ||||
|                     doc, | ||||
|                     thumbnail | ||||
|                 ) | ||||
|             except ParseError as e: | ||||
|  | ||||
|                 self._ignore.append(doc) | ||||
|                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|                 parsed_document.cleanup() | ||||
|  | ||||
|                 continue | ||||
|  | ||||
|             else: | ||||
|  | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 parsed_document.cleanup() | ||||
|                 self._cleanup_doc(doc) | ||||
|  | ||||
|                 self.log( | ||||
| @@ -151,142 +149,20 @@ class Consumer(object): | ||||
|                     logging_group=self.logging_group | ||||
|                 ) | ||||
|  | ||||
|     def _get_greyscale(self, tempdir, doc): | ||||
|     def _get_parser_class(self, doc): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         Determine the appropriate parser class based on the file | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating greyscale image from {}".format(doc)) | ||||
|         options = [] | ||||
|         for parser in self.parsers: | ||||
|             result = parser(doc) | ||||
|             if result: | ||||
|                 options.append(result) | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-density", str(self.DENSITY), | ||||
|             "-depth", "8", | ||||
|             "-type", "grayscale", | ||||
|             doc, pnm, | ||||
|         ) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) | ||||
|  | ||||
|         # Return list of converted images, processed with unpaper | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".unpaper.pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _get_thumbnail(self, tempdir, doc): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating the thumbnail") | ||||
|  | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-scale", "500x5000", | ||||
|             "-alpha", "remove", | ||||
|             doc, os.path.join(tempdir, "convert-%04d.png") | ||||
|         ) | ||||
|  | ||||
|         return os.path.join(tempdir, "convert-0000.png") | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             self.log("debug", "Language detected: {}".format(guess)) | ||||
|             return guess | ||||
|         except Exception as e: | ||||
|             self.log("warning", "Language detection error: {}".format(e)) | ||||
|  | ||||
|     def _get_ocr(self, imgs): | ||||
|         """ | ||||
|         Attempts to do the best job possible OCR'ing the document based on | ||||
|         simple language detection trial & error. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             raise OCRError("No images found") | ||||
|  | ||||
|         self.log("info", "OCRing the document") | ||||
|  | ||||
|         # Since the division gets rounded down by int, this calculation works | ||||
|         # for every edge-case, i.e. 1 | ||||
|         middle = int(len(imgs) / 2) | ||||
|         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
|         guessed_language = self._guess_language(raw_text) | ||||
|  | ||||
|         if not guessed_language or guessed_language not in ISO639: | ||||
|             self.log("warning", "Language detection failed!") | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "As FORGIVING_OCR is enabled, we're going to make the " | ||||
|                     "best with what we have." | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError("Language detection failed") | ||||
|  | ||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(imgs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "OCR for {} failed, but we're going to stick with what " | ||||
|                     "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                         guessed_language | ||||
|                     ) | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError( | ||||
|                 "The guessed language is not available in this instance of " | ||||
|                 "Tesseract." | ||||
|             ) | ||||
|  | ||||
|     def _assemble_ocr_sections(self, imgs, middle, text): | ||||
|         """ | ||||
|         Given a `middle` value and the text that middle page represents, we OCR | ||||
|         the remainder of the document and return the whole thing. | ||||
|         """ | ||||
|         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text | ||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||
|         return text | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|         """ | ||||
|         Performs a single OCR attempt. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             return "" | ||||
|  | ||||
|         self.log("info", "Parsing for {}".format(lang)) | ||||
|  | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             r = " ".join(r) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return strip_excess_whitespace(r) | ||||
|         # Return the parser with the highest weight. | ||||
|         return sorted( | ||||
|             options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail): | ||||
|  | ||||
| @@ -332,10 +208,6 @@ class Consumer(object): | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _cleanup_tempdir(self, d): | ||||
|         self.log("debug", "Deleting directory {}".format(d)) | ||||
|         shutil.rmtree(d) | ||||
|  | ||||
|     def _cleanup_doc(self, doc): | ||||
|         self.log("debug", "Deleting document {}".format(doc)) | ||||
|         os.unlink(doc) | ||||
| @@ -361,41 +233,3 @@ class Consumer(object): | ||||
|         with open(doc, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         return Document.objects.filter(checksum=checksum).exists() | ||||
|  | ||||
|  | ||||
def strip_excess_whitespace(text):
    """
    Normalise OCR output so that matching goes smoother.

    Collapses runs of horizontal whitespace to a single space, drops
    whitespace immediately following a line break, and strips whitespace
    trailing the end of the string.  Line breaks themselves are preserved.
    """
    result = re.sub(r"([^\S\r\n]+)", " ", text)
    result = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', result)
    return re.sub("([^\S\n\r]+)$", '', result)
|  | ||||
|  | ||||
def image_to_string(args):
    """
    Pool worker: OCR a single image and return the recognised text.

    `args` is a (image-file-name, language) pair; the image is resolved
    relative to Consumer.SCRATCH.  If the OCR tool reports orientation
    support, the page is rotated upright first; orientation-detection
    errors are swallowed and the image is OCR'd as-is.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    path = os.path.join(Consumer.SCRATCH, img)
    with Image.open(path) as image:
        if tool.can_detect_orientation():
            try:
                result = tool.detect_orientation(image, lang=lang)
                image = image.rotate(result["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Pages with no text make pyocr raise; skip rotation then.
                pass
        return tool.image_to_string(image, lang=lang)
|  | ||||
|  | ||||
def run_unpaper(args):
    """
    Pool worker: clean up a single .pnm page with unpaper.

    `args` is a (unpaper-binary, pnm-path) pair; the cleaned image is
    written next to the input as *.unpaper.pnm.
    """
    unpaper, pnm = args
    output = pnm.replace(".pnm", ".unpaper.pnm")
    subprocess.Popen((unpaper, pnm, output)).wait()
|  | ||||
|  | ||||
def run_convert(*args):
    """
    Run ImageMagick's convert with the given command line and wait for it.

    Memory and temp-dir limits from the Django settings are passed to
    convert through its MAGICK_* environment variables when configured.
    """
    environment = dict(os.environ)
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            environment[variable] = value

    subprocess.Popen(args, env=environment).wait()
|   | ||||
| @@ -1,194 +0,0 @@ | ||||
# Thanks to the Library of Congress and some creative use of sed and awk:
# http://www.loc.gov/standards/iso639-2/php/English_list.php

# Maps ISO 639-1 two-letter codes (as produced by langdetect) to the
# three-letter codes Tesseract uses to name its language data files.
ISO639 = {

    "aa": "aar",
    "ab": "abk",
    "ae": "ave",
    "af": "afr",
    "ak": "aka",
    "am": "amh",
    "an": "arg",
    "ar": "ara",
    "as": "asm",
    "av": "ava",
    "ay": "aym",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bh": "bih",
    "bi": "bis",
    "bm": "bam",
    "bn": "ben",
    "bo": "bod",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "ce": "che",
    "ch": "cha",
    "co": "cos",
    "cr": "cre",
    "cs": "ces",
    "cu": "chu",
    "cv": "chv",
    "cy": "cym",
    "da": "dan",
    "de": "deu",
    "dv": "div",
    "dz": "dzo",
    "ee": "ewe",
    "el": "ell",
    "en": "eng",
    "eo": "epo",
    "es": "spa",
    "et": "est",
    "eu": "eus",
    "fa": "fas",
    "ff": "ful",
    "fi": "fin",
    "fj": "fij",
    "fo": "fao",
    "fr": "fra",
    "fy": "fry",
    "ga": "gle",
    "gd": "gla",
    "gl": "glg",
    "gn": "grn",
    "gu": "guj",
    "gv": "glv",
    "ha": "hau",
    "he": "heb",
    "hi": "hin",
    "ho": "hmo",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "hye",
    "hz": "her",
    "ia": "ina",
    "id": "ind",
    "ie": "ile",
    "ig": "ibo",
    "ii": "iii",
    "ik": "ipk",
    "io": "ido",
    "is": "isl",
    "it": "ita",
    "iu": "iku",
    "ja": "jpn",
    "jv": "jav",
    "ka": "kat",
    "kg": "kon",
    "ki": "kik",
    "kj": "kua",
    "kk": "kaz",
    "kl": "kal",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "kr": "kau",
    "ks": "kas",
    "ku": "kur",
    "kv": "kom",
    "kw": "cor",
    "ky": "kir",
    "la": "lat",
    "lb": "ltz",
    "lg": "lug",
    "li": "lim",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lu": "lub",
    "lv": "lav",
    "mg": "mlg",
    "mh": "mah",
    "mi": "mri",
    "mk": "mkd",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "msa",
    "mt": "mlt",
    "my": "mya",
    "na": "nau",
    "nb": "nob",
    "nd": "nde",
    "ne": "nep",
    "ng": "ndo",
    "nl": "nld",
    "no": "nor",
    "nr": "nbl",
    "nv": "nav",
    "ny": "nya",
    "oc": "oci",
    "oj": "oji",
    "om": "orm",
    "or": "ori",
    "os": "oss",
    "pa": "pan",
    "pi": "pli",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "qu": "que",
    "rm": "roh",
    "rn": "run",
    "ro": "ron",
    "ru": "rus",
    "rw": "kin",
    "sa": "san",
    "sc": "srd",
    "sd": "snd",
    "se": "sme",
    "sg": "sag",
    "si": "sin",
    "sk": "slk",
    "sl": "slv",
    "sm": "smo",
    "sn": "sna",
    "so": "som",
    "sq": "sqi",
    "sr": "srp",
    "ss": "ssw",
    "st": "sot",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "ti": "tir",
    "tk": "tuk",
    "tl": "tgl",
    "tn": "tsn",
    "to": "ton",
    "tr": "tur",
    "ts": "tso",
    "tt": "tat",
    "tw": "twi",
    "ty": "tah",
    "ug": "uig",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "ve": "ven",
    "vi": "vie",
    "vo": "vol",
    "wa": "wln",
    "wo": "wol",
    "xh": "xho",
    "yi": "yid",
    "yo": "yor",
    "za": "zha",

    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I
    # have no idea which one is better, so I just picked the bigger file.
    "zh": "chi_tra",

    "zu": "zul"

}
							
								
								
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| import logging | ||||
| import shutil | ||||
| import tempfile | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
|  | ||||
class ParseError(Exception):
    """Raised by a parser when a document cannot be parsed."""
|  | ||||
|  | ||||
class DocumentParser(object):
    """
    Base class for pluggable document parsers.

    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    # Intermediate files are created beneath the configured scratch dir.
    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        # Path of the document file being consumed.
        self.document_path = path
        # Private working directory, removed again by cleanup().
        self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        self.logger = logging.getLogger(__name__)
        self.logging_group = None

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        Subclasses must implement this.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Returns the text from the document and only the text.  Subclasses
        must implement this.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """Log `message` at `level`, tagged with the current logging group."""
        record_extra = {"group": self.logging_group}
        getattr(self.logger, level)(message, extra=record_extra)

    def cleanup(self):
        """Delete the private working directory and everything in it."""
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
| @@ -2,3 +2,4 @@ from django.dispatch import Signal | ||||
|  | ||||
# Sent by the consumer when it begins/finishes processing a single file.
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
# Sent by the consumer at startup; installed apps connect to this to declare
# the document parsers they provide (see documents.consumer.Consumer).
document_consumer_declaration = Signal(providing_args=[])
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| from subprocess import Popen | ||||
|  | ||||
| from django.conf import settings | ||||
|   | ||||
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 32 KiB | 
| @@ -1,13 +1,6 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..models import FileInfo | ||||
| from ..consumer import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class TestAttributes(TestCase): | ||||
| @@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase): | ||||
|                             template.format(**spec), **spec) | ||||
|  | ||||
|  | ||||
class FakeTesseract(object):
    """
    Minimal stand-in for a pyocr OCR tool, used by TestOCR below.
    """

    @staticmethod
    def can_detect_orientation():
        # Claim orientation support so the code under test takes the
        # orientation-detection path.
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        # Reproduce pyocr's behaviour on a page with no text.
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"
|  | ||||
|  | ||||
class FakePyOcr(object):
    """
    Stand-in for the pyocr module that always offers FakeTesseract as the
    only available OCR tool.
    """

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]
|  | ||||
|  | ||||
class TestOCR(TestCase):
    """
    Tests for the OCR helper functions in documents.consumer.
    """

    # (source, expected) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                # Fixed typo in the failure message: was
                # "strip_exceess_whitespace".
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        image_to_string(["no-text.png", "en"])
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn