mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Collapsing excess whitespace after OCR
This commit is contained in:
parent
14811a4a49
commit
63de2ca1b0
@ -283,7 +283,7 @@ class Consumer(object):
|
|||||||
r = " ".join(r)
|
r = " ".join(r)
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return strip_excess_whitespace(r)
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
@ -360,6 +360,13 @@ class Consumer(object):
|
|||||||
return Document.objects.filter(checksum=checksum).exists()
|
return Document.objects.filter(checksum=checksum).exists()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_excess_whitespace(text):
    """Collapse excess horizontal whitespace while keeping line breaks.

    Three passes are applied in order:

    1. Every run of whitespace other than carriage return / line feed is
       replaced by a single space.
    2. Whitespace that directly follows a run of line breaks is removed,
       so continuation lines do not start with a space.
    3. Whitespace at the very end of the text (or just before a single
       final line feed) is removed.

    Whitespace at the very start of the text, and whitespace that precedes
    an interior line break, is only collapsed to one space, not removed.

    :param text: the string to clean up
    :return: the cleaned-up string
    """
    # Raw strings throughout: ``\S`` is not a valid *string* escape, so a
    # plain literal triggers an invalid-escape warning on modern Python.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
    # No MULTILINE flag on purpose: ``$`` anchors only at the end of the
    # whole text, matching the original behaviour.
    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+$", "", no_leading_whitespace)
    return no_trailing_whitespace
|
||||||
|
|
||||||
|
|
||||||
def image_to_string(args):
|
def image_to_string(args):
|
||||||
img, lang = args
|
img, lang = args
|
||||||
ocr = pyocr.get_available_tools()[0]
|
ocr = pyocr.get_available_tools()[0]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
|
from ..consumer import strip_excess_whitespace
|
||||||
from ..models import FileInfo
|
from ..models import FileInfo
|
||||||
|
|
||||||
|
|
||||||
@ -301,3 +302,16 @@ class Permutations(TestCase):
|
|||||||
}
|
}
|
||||||
self._test_guessed_attributes(
|
self._test_guessed_attributes(
|
||||||
template.format(**spec), **spec)
|
template.format(**spec), **spec)
|
||||||
|
|
||||||
|
|
||||||
|
class TestOCR(TestCase):
    """Tests for the whitespace clean-up applied to OCR output."""

    # (input, expected output) pairs fed to strip_excess_whitespace.
    text_cases = [
        ("simple string", "simple string"),
        ("simple newline\n testing string", "simple newline\ntesting string"),
        ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце")
    ]

    def test_strip_excess_whitespace(self):
        """Each input in ``text_cases`` must clean up to its expected value."""
        for source, result in self.text_cases:
            # subTest reports every failing case instead of stopping at the
            # first one; assertEqual is not stripped under ``python -O``.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace(%s) != '%s', but '%s'"
                    % (source, result, actual_result)
                )
|
||||||
|
Loading…
x
Reference in New Issue
Block a user