mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Collapsing excess whitespace after OCR
This commit is contained in:
parent
14811a4a49
commit
63de2ca1b0
@ -283,7 +283,7 @@ class Consumer(object):
|
||||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return re.sub(r"\s+", " ", r)
|
||||
return strip_excess_whitespace(r)
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
@ -360,6 +360,13 @@ class Consumer(object):
|
||||
return Document.objects.filter(checksum=checksum).exists()
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
    """
    Tidy up the horizontal whitespace in ``text`` so that later matching
    against it goes smoother.

    Line breaks (``\\n`` and ``\\r``) are never added or removed.  Within
    each line:

    * every run of non-newline whitespace is collapsed to a single space;
    * whitespace at the start of the line is removed, including on the
      very first line of the text;
    * whitespace at the end of the line is removed, including on the very
      last line of the text.

    :param text: the string to clean up.
    :return: the cleaned-up string.
    """
    # "[^\S\r\n]" reads as "whitespace, except CR and LF".  The patterns are
    # raw strings so that "\S" reaches the regex engine untouched instead of
    # being treated as an (invalid) string escape.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)

    # Leading whitespace: at the very start of the text, or directly after a
    # line break.  The look-behind leaves the line break itself in place.
    no_leading_whitespace = re.sub(
        r"(?:\A|(?<=[\r\n]))[^\S\r\n]+", "", collapsed_spaces
    )

    # Trailing whitespace: directly before a line break, or at the very end
    # of the text.
    no_trailing_whitespace = re.sub(
        r"[^\S\r\n]+(?=[\r\n]|\Z)", "", no_leading_whitespace
    )

    return no_trailing_whitespace
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
img, lang = args
|
||||
ocr = pyocr.get_available_tools()[0]
|
||||
|
@ -1,5 +1,6 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..consumer import strip_excess_whitespace
|
||||
from ..models import FileInfo
|
||||
|
||||
|
||||
@ -301,3 +302,16 @@ class Permutations(TestCase):
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class TestOCR(TestCase):
    """Tests for the whitespace clean-up applied to text after OCR."""

    # Pairs of (source text, text expected back from
    # strip_excess_whitespace).
    text_cases = [
        ("simple string", "simple string"),
        ("simple newline\n testing string", "simple newline\ntesting string"),
        ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце")
    ]

    def test_strip_excess_whitespace(self):
        """Every source string must come back as its expected counterpart."""
        for source, result in self.text_cases:
            # subTest lets the loop carry on after a failure, so every
            # failing pair is reported rather than only the first one.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                # assertEqual, unlike a bare assert, still runs under -O.
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace(%s) != '%s', but '%s'"
                    % (source, result, actual_result)
                )
|
||||
|
Loading…
x
Reference in New Issue
Block a user