Collapsing excess whitespace after OCR

This commit is contained in:
Aleksandr Bogdanov 2016-10-12 01:46:34 +02:00
parent 14811a4a49
commit 63de2ca1b0
2 changed files with 22 additions and 1 deletions

View File

@ -283,7 +283,7 @@ class Consumer(object):
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
return strip_excess_whitespace(r)
def _store(self, text, doc, thumbnail):
@ -360,6 +360,13 @@ class Consumer(object):
return Document.objects.filter(checksum=checksum).exists()
def strip_excess_whitespace(text):
    """Normalise horizontal whitespace in *text* while keeping line breaks.

    Three passes are applied, in order:

    1. Every run of non-newline whitespace (spaces, tabs, ...) is collapsed
       into a single space.
    2. Whitespace at the start of each line (including the first one) is
       removed.
    3. Whitespace at the end of each line (including the last one) is
       removed.

    Both ``\\n`` and ``\\r`` are treated as line breaks and are never
    altered, so the number and kind of line breaks is preserved.

    :param text: the string to clean up.
    :return: the cleaned-up string.
    """
    # ``[^\S\r\n]`` means "whitespace, but not a line-break character".
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    # Anchor on start-of-text OR a line-break character so the first line
    # is handled the same way as every following line.
    no_leading_whitespace = re.sub(
        r"(^|[\r\n])[^\S\r\n]+", r"\1", collapsed_spaces)
    # A lookahead is used instead of ``$`` so that whitespace is dropped
    # before *every* line break (``\r`` as well as ``\n``), not only at the
    # very end of the text.
    no_trailing_whitespace = re.sub(
        r"[^\S\r\n]+(?=[\r\n]|\Z)", "", no_leading_whitespace)
    return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]

View File

@ -1,5 +1,6 @@
from django.test import TestCase
from ..consumer import strip_excess_whitespace
from ..models import FileInfo
@ -301,3 +302,16 @@ class Permutations(TestCase):
}
self._test_guessed_attributes(
template.format(**spec), **spec)
class TestOCR(TestCase):
    """Checks for the text clean-up applied to OCR output."""

    # (input text, expected output of strip_excess_whitespace) pairs.
    text_cases = [
        ("simple string", "simple string"),
        ("simple newline\n testing string", "simple newline\ntesting string"),
        ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце")
    ]

    def test_strip_excess_whitespace(self):
        """Every input in ``text_cases`` must be cleaned to its expected form."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            # assertEqual (rather than a bare ``assert``) still runs under
            # ``python -O`` and reports through the unittest machinery;
            # %r makes whitespace differences visible in the failure message.
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace(%r) != %r, but %r"
                % (source, result, actual_result)
            )