local import of ocrmypdf so that the webserver does not load it

This commit is contained in:
jonaswinkler 2021-02-15 12:18:10 +01:00
parent 416101d557
commit 56bd966c02
2 changed files with 9 additions and 12 deletions

View File

@ -2,12 +2,8 @@ import json
import os import os
import re import re
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image from PIL import Image
from django.conf import settings from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, \ from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf make_thumbnail_from_pdf
@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract" logging_name = "paperless.parsing.tesseract"
def extract_metadata(self, document_path, mime_type): def extract_metadata(self, document_path, mime_type):
import pikepdf
namespace_pattern = re.compile(r"\{(.*)\}(.*)") namespace_pattern = re.compile(r"\{(.*)\}(.*)")
result = [] result = []
@ -91,6 +89,9 @@ class RasterisedDocumentParser(DocumentParser):
return None return None
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
mode = settings.OCR_MODE mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path) text_original = get_text_from_pdf(document_path)
@ -223,6 +224,7 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file): def get_text_from_pdf(pdf_file):
import pdftotext
if not os.path.isfile(pdf_file): if not os.path.isfile(pdf_file):
return None return None

View File

@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f) self.assertRaises(ParseError, f)
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") def test_image_calc_a4_dpi(self):
def test_image_calc_a4_dpi(self, m):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
m.assert_called_once() self.assertEqual(dpi, 62)
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m): def test_image_dpi_fail(self, m):