def strip_excess_whitespace(text):
    """Normalise horizontal whitespace in *text* while keeping line breaks.

    Three passes are applied in order:

    1. Every run of whitespace other than CR/LF becomes a single space.
    2. Whitespace that directly follows one or more CR/LF characters is
       removed, so continuation lines no longer start with blanks.
    3. Whitespace at the very end of the text (or just before a final
       newline) is removed.

    Line-break characters themselves are never altered.  Note that
    whitespace at the end of *interior* lines and at the very start of the
    text is only collapsed to one space, not removed.

    :param text: the string to clean up.
    :return: the cleaned-up string.
    """
    # All patterns are raw strings: "\S" is not a valid *string* escape and
    # triggers a DeprecationWarning/SyntaxWarning when written without r"".
    # [^\S\r\n] reads as "any whitespace character except CR and LF".
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
    # No re.MULTILINE here, so "$" anchors only at the end of the text.
    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+$", "", no_leading_whitespace)
    return no_trailing_whitespace
class TestOCR(TestCase):
    """Tests for the text clean-up applied to OCR output."""

    # (input, expected output) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Each input in ``text_cases`` must normalise to its expected value."""
        for source, result in self.text_cases:
            # subTest keeps the loop going after a failure and labels the
            # failing input, so one bad case does not hide the others.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )