From 63de2ca1b0e49547d242960454c70a1d637c63b3 Mon Sep 17 00:00:00 2001 From: Aleksandr Bogdanov Date: Wed, 12 Oct 2016 01:46:34 +0200 Subject: [PATCH 1/2] Collapsing excess whitespace after OCR --- src/documents/consumer.py | 9 ++++++++- src/documents/tests/test_consumer.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 411e9f2db..cdd566b76 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -283,7 +283,7 @@ class Consumer(object): r = " ".join(r) # Strip out excess white space to allow matching to go smoother - return re.sub(r"\s+", " ", r) + return strip_excess_whitespace(r) def _store(self, text, doc, thumbnail): @@ -360,6 +360,13 @@ class Consumer(object): return Document.objects.filter(checksum=checksum).exists() +def strip_excess_whitespace(text): + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) + no_leading_whitespace = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) + return no_trailing_whitespace + + def image_to_string(args): img, lang = args ocr = pyocr.get_available_tools()[0] diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 7e303c5da..3a7f3460a 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,5 +1,6 @@ from django.test import TestCase +from ..consumer import strip_excess_whitespace from ..models import FileInfo @@ -301,3 +302,16 @@ class Permutations(TestCase): } self._test_guessed_attributes( template.format(**spec), **spec) + + +class TestOCR(TestCase): + text_cases = [ + ("simple string", "simple string"), + ("simple newline\n testing string", "simple newline\ntesting string"), + ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце") + ] + + def test_strip_excess_whitespace(self): + for source, result in self.text_cases: + actual_result = strip_excess_whitespace(source) + assert result == actual_result, "strip_exceess_whitespace(%s) != '%s', but '%s'" % (source, result, actual_result) From 8e584068816e0d1eac4b4c633750dd8baf982cf5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 26 Oct 2016 09:32:59 +0000 Subject: [PATCH 2/2] pep8 corrections --- src/documents/consumer.py | 3 ++- src/documents/mail.py | 2 +- .../management/commands/document_consumer.py | 1 - src/documents/tests/test_consumer.py | 23 +++++++++++++++---- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index cdd566b76..72debcd3e 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -362,7 +362,8 @@ class Consumer(object): def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) - no_leading_whitespace = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_leading_whitespace = re.sub( + "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) return no_trailing_whitespace diff --git a/src/documents/mail.py b/src/documents/mail.py index 8782e274c..012ee0cf9 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -14,7 +14,7 @@ from dateutil import parser from django.conf import settings from .consumer import Consumer -from .models import Correspondent, Log +from .models import Correspondent class MailFetcherError(Exception): diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 0acdaeeb0..a03845bd3 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -6,7 +6,6 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...models import Log from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 3a7f3460a..18bbab50f 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -134,7 +134,7 @@ class TestAttributes(TestCase): ) -class Permutations(TestCase): +class TestFieldPermutations(TestCase): valid_dates = ( "20150102030405Z", @@ -305,13 +305,28 @@ class Permutations(TestCase): class TestOCR(TestCase): + text_cases = [ ("simple string", "simple string"), - ("simple newline\n testing string", "simple newline\ntesting string"), - ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце") + ( + "simple newline\n testing string", + "simple newline\ntesting string" + ), + ( + "utf-8 строка с пробелами в конце ", + "utf-8 строка с пробелами в конце" + ) ] def test_strip_excess_whitespace(self): for source, result in self.text_cases: actual_result = strip_excess_whitespace(source) - assert result == actual_result, "strip_exceess_whitespace(%s) != '%s', but '%s'" % (source, result, actual_result) + self.assertEqual( + result, + actual_result, + "strip_exceess_whitespace({}) != '{}', but '{}'".format( + source, + result, + actual_result + ) + )