Merge branch 'synchrone-no_stripping_newlines'

This commit is contained in:
Daniel Quinn 2016-10-26 09:33:07 +00:00
commit 77fda752ae
4 changed files with 40 additions and 4 deletions

View File

@ -283,7 +283,7 @@ class Consumer(object):
r = " ".join(r) r = " ".join(r)
# Strip out excess white space to allow matching to go smoother # Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r) return strip_excess_whitespace(r)
def _store(self, text, doc, thumbnail): def _store(self, text, doc, thumbnail):
@ -360,6 +360,14 @@ class Consumer(object):
return Document.objects.filter(checksum=checksum).exists() return Document.objects.filter(checksum=checksum).exists()
def strip_excess_whitespace(text):
    """
    Collapse horizontal whitespace in ``text`` while preserving line breaks.

    Three passes are applied in order:

    1. Every run of whitespace other than ``\\r``/``\\n`` becomes one space.
    2. Whitespace directly following a run of line breaks is removed, so
       continuation lines do not start with a space.
    3. Whitespace at the very end of the text (or immediately before a
       final trailing newline) is removed.

    Whitespace that precedes an interior line break, and whitespace at the
    very start of the text, is only collapsed to a single space, not removed.

    :param text: the string to normalise
    :return: the normalised string
    """
    # ``[^\S\r\n]`` reads as "whitespace, except CR and LF".  The patterns are
    # raw strings so that ``\S`` reaches the regex engine untouched instead of
    # being parsed as an (invalid) string escape.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
    # No MULTILINE flag: ``$`` only anchors at the end of the whole text.
    no_trailing_whitespace = re.sub(r"[^\S\n\r]+$", "", no_leading_whitespace)
    return no_trailing_whitespace
def image_to_string(args): def image_to_string(args):
img, lang = args img, lang = args
ocr = pyocr.get_available_tools()[0] ocr = pyocr.get_available_tools()[0]

View File

@ -14,7 +14,7 @@ from dateutil import parser
from django.conf import settings from django.conf import settings
from .consumer import Consumer from .consumer import Consumer
from .models import Correspondent, Log from .models import Correspondent
class MailFetcherError(Exception): class MailFetcherError(Exception):

View File

@ -6,7 +6,6 @@ import time
from django.conf import settings from django.conf import settings
from django.core.management.base import BaseCommand, CommandError from django.core.management.base import BaseCommand, CommandError
from ...models import Log
from ...consumer import Consumer, ConsumerError from ...consumer import Consumer, ConsumerError
from ...mail import MailFetcher, MailFetcherError from ...mail import MailFetcher, MailFetcherError

View File

@ -1,5 +1,6 @@
from django.test import TestCase from django.test import TestCase
from ..consumer import strip_excess_whitespace
from ..models import FileInfo from ..models import FileInfo
@ -133,7 +134,7 @@ class TestAttributes(TestCase):
) )
class Permutations(TestCase): class TestFieldPermutations(TestCase):
valid_dates = ( valid_dates = (
"20150102030405Z", "20150102030405Z",
@ -301,3 +302,31 @@ class Permutations(TestCase):
} }
self._test_guessed_attributes( self._test_guessed_attributes(
template.format(**spec), **spec) template.format(**spec), **spec)
class TestOCR(TestCase):
    """Exercise ``strip_excess_whitespace`` against known input/output pairs."""

    # Each entry is (source text, expected text after normalisation).
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Every source string must normalise to its expected counterpart."""
        for source, result in self.text_cases:
            # subTest keeps going after a failure, so one run reports every
            # broken pair rather than only the first.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )