From 63de2ca1b0e49547d242960454c70a1d637c63b3 Mon Sep 17 00:00:00 2001 From: Aleksandr Bogdanov Date: Wed, 12 Oct 2016 01:46:34 +0200 Subject: [PATCH] Collapsing excess whitespace after OCR --- src/documents/consumer.py | 9 ++++++++- src/documents/tests/test_consumer.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 411e9f2db..cdd566b76 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -283,7 +283,7 @@ class Consumer(object): r = " ".join(r) # Strip out excess white space to allow matching to go smoother - return re.sub(r"\s+", " ", r) + return strip_excess_whitespace(r) def _store(self, text, doc, thumbnail): @@ -360,6 +360,13 @@ class Consumer(object): return Document.objects.filter(checksum=checksum).exists() +def strip_excess_whitespace(text): + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) + no_leading_whitespace = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) + return no_trailing_whitespace + + def image_to_string(args): img, lang = args ocr = pyocr.get_available_tools()[0] diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 7e303c5da..3a7f3460a 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,5 +1,6 @@ from django.test import TestCase +from ..consumer import strip_excess_whitespace from ..models import FileInfo @@ -301,3 +302,16 @@ class Permutations(TestCase): } self._test_guessed_attributes( template.format(**spec), **spec) + + +class TestOCR(TestCase): + text_cases = [ + ("simple string", "simple string"), + ("simple newline\n testing string", "simple newline\ntesting string"), + ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце") + ] + + def test_strip_excess_whitespace(self): + for source, result in self.text_cases: + 
actual_result = strip_excess_whitespace(source) + assert result == actual_result, "strip_excess_whitespace(%s) != '%s', but '%s'" % (source, result, actual_result)