mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-17 10:13:56 -05:00
Merge branch 'synchrone-no_stripping_newlines'
This commit is contained in:
commit
77fda752ae
@ -283,7 +283,7 @@ class Consumer(object):
|
|||||||
r = " ".join(r)
|
r = " ".join(r)
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return strip_excess_whitespace(r)
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
@ -360,6 +360,14 @@ class Consumer(object):
|
|||||||
return Document.objects.filter(checksum=checksum).exists()
|
return Document.objects.filter(checksum=checksum).exists()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_excess_whitespace(text):
    """
    Collapse redundant horizontal whitespace while keeping line breaks.

    Three passes are applied in order:

    1. Every run of whitespace that is not a carriage return or line feed
       is replaced by a single space.
    2. Horizontal whitespace that directly follows one or more line breaks
       is removed, so continuation lines do not start with a space.
    3. Horizontal whitespace at the very end of the text (or just before a
       single final newline) is removed.

    Whitespace at the start of the text and trailing whitespace on interior
    lines is only collapsed to one space, not removed.

    :param text: the string to normalise
    :return: the normalised string, with all line breaks preserved
    """
    # "[^\S\r\n]" means "any whitespace character except CR and LF".
    # Raw strings keep "\S" from being parsed as an (invalid) string escape.
    collapsed_spaces = re.sub(r"[^\S\r\n]+", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)[^\S\n\r]+", r"\1", collapsed_spaces)
    # No re.MULTILINE here: "$" only anchors at the end of the whole text.
    no_trailing_whitespace = re.sub(
        r"[^\S\n\r]+$", "", no_leading_whitespace)
    return no_trailing_whitespace
|
||||||
|
|
||||||
|
|
||||||
def image_to_string(args):
|
def image_to_string(args):
|
||||||
img, lang = args
|
img, lang = args
|
||||||
ocr = pyocr.get_available_tools()[0]
|
ocr = pyocr.get_available_tools()[0]
|
||||||
|
@ -14,7 +14,7 @@ from dateutil import parser
|
|||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from .consumer import Consumer
|
from .consumer import Consumer
|
||||||
from .models import Correspondent, Log
|
from .models import Correspondent
|
||||||
|
|
||||||
|
|
||||||
class MailFetcherError(Exception):
|
class MailFetcherError(Exception):
|
||||||
|
@ -6,7 +6,6 @@ import time
|
|||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand, CommandError
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
|
|
||||||
from ...models import Log
|
|
||||||
from ...consumer import Consumer, ConsumerError
|
from ...consumer import Consumer, ConsumerError
|
||||||
from ...mail import MailFetcher, MailFetcherError
|
from ...mail import MailFetcher, MailFetcherError
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
|
from ..consumer import strip_excess_whitespace
|
||||||
from ..models import FileInfo
|
from ..models import FileInfo
|
||||||
|
|
||||||
|
|
||||||
@ -133,7 +134,7 @@ class TestAttributes(TestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Permutations(TestCase):
|
class TestFieldPermutations(TestCase):
|
||||||
|
|
||||||
valid_dates = (
|
valid_dates = (
|
||||||
"20150102030405Z",
|
"20150102030405Z",
|
||||||
@ -301,3 +302,31 @@ class Permutations(TestCase):
|
|||||||
}
|
}
|
||||||
self._test_guessed_attributes(
|
self._test_guessed_attributes(
|
||||||
template.format(**spec), **spec)
|
template.format(**spec), **spec)
|
||||||
|
|
||||||
|
|
||||||
|
class TestOCR(TestCase):
    """Tests for the text clean-up applied to OCR output."""

    # Pairs of (raw input, expected output of strip_excess_whitespace).
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    def test_strip_excess_whitespace(self):
        """Each input in ``text_cases`` must normalise to its expected value."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )
|
||||||
|
Loading…
x
Reference in New Issue
Block a user