mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'synchrone-no_stripping_newlines'
This commit is contained in:
commit
77fda752ae
@ -283,7 +283,7 @@ class Consumer(object):
|
||||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return re.sub(r"\s+", " ", r)
|
||||
return strip_excess_whitespace(r)
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
@ -360,6 +360,14 @@ class Consumer(object):
|
||||
return Document.objects.filter(checksum=checksum).exists()
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
    """
    Collapse runs of horizontal whitespace while keeping line breaks.

    Three passes are applied in order:

    1. every run of whitespace that is not a carriage return or line feed
       is replaced by a single space;
    2. whitespace that directly follows a run of line breaks is dropped,
       so continuation lines do not start with a space;
    3. whitespace at the very end of the text is dropped.

    Only the end of the whole text is trimmed (the pattern is not compiled
    in multi-line mode), and whitespace at the very start of the text is
    collapsed to one space rather than removed.

    :param text: the string to clean up
    :return: the cleaned-up string
    """
    # All patterns are raw strings: "\S" is not a valid *string* escape, so
    # a plain literal only works by accident and triggers an invalid escape
    # sequence warning on current Python versions.
    #
    # "[^\S\r\n]" reads as "whitespace, except CR and LF".
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)

    # Keep the line breaks (group 1), discard the indentation after them.
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)

    return re.sub(r"[^\S\n\r]+$", "", no_leading_whitespace)
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
img, lang = args
|
||||
ocr = pyocr.get_available_tools()[0]
|
||||
|
@ -14,7 +14,7 @@ from dateutil import parser
|
||||
from django.conf import settings
|
||||
|
||||
from .consumer import Consumer
|
||||
from .models import Correspondent, Log
|
||||
from .models import Correspondent
|
||||
|
||||
|
||||
class MailFetcherError(Exception):
|
||||
|
@ -6,7 +6,6 @@ import time
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from ...models import Log
|
||||
from ...consumer import Consumer, ConsumerError
|
||||
from ...mail import MailFetcher, MailFetcherError
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..consumer import strip_excess_whitespace
|
||||
from ..models import FileInfo
|
||||
|
||||
|
||||
@ -133,7 +134,7 @@ class TestAttributes(TestCase):
|
||||
)
|
||||
|
||||
|
||||
class Permutations(TestCase):
|
||||
class TestFieldPermutations(TestCase):
|
||||
|
||||
valid_dates = (
|
||||
"20150102030405Z",
|
||||
@ -301,3 +302,31 @@ class Permutations(TestCase):
|
||||
}
|
||||
self._test_guessed_attributes(
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class TestOCR(TestCase):
    """Tests for ``strip_excess_whitespace``."""

    # Pairs of (raw input, expected cleaned-up output).
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Run every entry of ``text_cases`` through the helper."""
        for source, result in self.text_cases:
            # subTest reports each failing case separately instead of
            # stopping at the first mismatch.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )
|
||||
|
Loading…
x
Reference in New Issue
Block a user