Merge branch 'synchrone-no_stripping_newlines'

This commit is contained in:
Daniel Quinn 2016-10-26 09:33:07 +00:00
commit 77fda752ae
4 changed files with 40 additions and 4 deletions

View File

@ -283,7 +283,7 @@ class Consumer(object):
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
return strip_excess_whitespace(r)
def _store(self, text, doc, thumbnail):
@ -360,6 +360,14 @@ class Consumer(object):
return Document.objects.filter(checksum=checksum).exists()
def strip_excess_whitespace(text):
    """
    Collapse horizontal whitespace in ``text`` while keeping line breaks.

    Three substitutions are applied in order:

    1. every run of whitespace other than CR/LF becomes a single space;
    2. whitespace that directly follows a run of line breaks is removed,
       so continuation lines do not start with a space;
    3. whitespace at the end of the text is removed.

    Whitespace at the very start of the text, and whitespace sitting just
    before an interior line break, is reduced to one space but not removed:
    the final pattern is applied without ``re.MULTILINE``.

    :param text: the string to clean up
    :return: the cleaned-up string
    """
    # All patterns are raw strings: "\S" in a plain literal is an invalid
    # escape sequence and triggers a warning on modern Python versions.
    # "[^\S\r\n]" reads as "whitespace, but not a carriage return/newline".
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)

    # Keep the line breaks (group 1) and drop the indentation after them.
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)

    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+$", "", no_leading_whitespace)

    return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]

View File

@ -14,7 +14,7 @@ from dateutil import parser
from django.conf import settings
from .consumer import Consumer
from .models import Correspondent, Log
from .models import Correspondent
class MailFetcherError(Exception):

View File

@ -6,7 +6,6 @@ import time
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from ...models import Log
from ...consumer import Consumer, ConsumerError
from ...mail import MailFetcher, MailFetcherError

View File

@ -1,5 +1,6 @@
from django.test import TestCase
from ..consumer import strip_excess_whitespace
from ..models import FileInfo
@ -133,7 +134,7 @@ class TestAttributes(TestCase):
)
class Permutations(TestCase):
class TestFieldPermutations(TestCase):
valid_dates = (
"20150102030405Z",
@ -301,3 +302,31 @@ class Permutations(TestCase):
}
self._test_guessed_attributes(
template.format(**spec), **spec)
class TestOCR(TestCase):
    """Checks for the text clean-up helpers applied to OCR output."""

    # (input, expected output) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Every entry in ``text_cases`` must map to its expected output."""
        for source, result in self.text_cases:
            # subTest keeps iterating after a failure, so every broken
            # case is reported instead of only the first one.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )