Fix language guesses in tests

It turns out that the Lorem ipsum text in the sample files was confusing the language guesser, causing it to think the file was in Catalan and not English or German.
This commit is contained in:
Daniel Quinn 2018-12-01 15:55:02 +00:00
parent c5488dcb98
commit c1d18c1e83
6 changed files with 7 additions and 1 deletions

View File

@ -11,6 +11,8 @@ Changelog
pointed this out. `#423`_.
* Updated dependencies to include (among other things) a security patch to
requests.
* Fix text in sample data for tests so that the language guesser stops thinking
that everything is in Catalan because we had *Lorem ipsum* in there.
2.5.0

Binary file not shown.

Before

Width:  |  Height:  |  Size: 138 KiB

After

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 138 KiB

After

Width:  |  Height:  |  Size: 53 KiB

View File

@ -5,7 +5,7 @@ from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.test import TestCase
from django.test import TestCase, override_settings
from ..parsers import RasterisedDocumentParser
@ -211,6 +211,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
document = RasterisedDocumentParser(input_file)
@ -225,6 +226,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
document = RasterisedDocumentParser(input_file)
@ -239,6 +241,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
document = RasterisedDocumentParser(input_file)
@ -253,6 +256,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
document = RasterisedDocumentParser(input_file)