Collapsing excess whitespace after OCR

2026-01-18 22:14:22 -06:00 · 2016-10-12 01:46:34 +02:00
parent 14811a4a49
commit 63de2ca1b0
2 changed files with 22 additions and 1 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -283,7 +283,7 @@ class Consumer(object):
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
-        return re.sub(r"\s+", " ", r)
+        return strip_excess_whitespace(r)

    def _store(self, text, doc, thumbnail):

@@ -360,6 +360,13 @@ class Consumer(object):
        return Document.objects.filter(checksum=checksum).exists()


+def strip_excess_whitespace(text):
+    """Collapse horizontal whitespace in *text* while keeping line breaks.
+
+    Three passes are applied in order:
+
+    1. Every run of whitespace other than carriage returns and line feeds
+       is replaced by a single space.
+    2. Whitespace directly following a run of line breaks is removed, so
+       each line after the first starts at its first non-blank character.
+    3. Whitespace at the very end of the text is removed.
+
+    Line-break characters are never altered.  Whitespace at the start of
+    the first line, and whitespace before an interior line break, is only
+    collapsed to one space rather than removed.
+
+    :param text: the string to normalise.
+    :return: the normalised string.
+    """
+    # ``[^\S\r\n]`` means "whitespace that is not a line break".  Raw
+    # strings hand the backslashes to the regex engine untouched instead of
+    # relying on Python passing the unknown ``\S`` escape through, which
+    # newer interpreters warn about.
+    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
+    no_leading_whitespace = re.sub(
+        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
+    # No re.MULTILINE here: ``$`` only matches at the end of the text (or
+    # just before one final line feed), not at the end of every line.
+    no_trailing_whitespace = re.sub(
+        r"[^\S\n\r]+$", "", no_leading_whitespace)
+    return no_trailing_whitespace
+
+
 def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]