mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Import ocrmypdf locally so that the webserver does not load it
This commit is contained in:
parent
416101d557
commit
56bd966c02
@ -2,12 +2,8 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import ocrmypdf
|
|
||||||
import pdftotext
|
|
||||||
import pikepdf
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
|
||||||
|
|
||||||
from documents.parsers import DocumentParser, ParseError, \
|
from documents.parsers import DocumentParser, ParseError, \
|
||||||
make_thumbnail_from_pdf
|
make_thumbnail_from_pdf
|
||||||
@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
logging_name = "paperless.parsing.tesseract"
|
logging_name = "paperless.parsing.tesseract"
|
||||||
|
|
||||||
def extract_metadata(self, document_path, mime_type):
|
def extract_metadata(self, document_path, mime_type):
|
||||||
|
import pikepdf
|
||||||
|
|
||||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
@ -91,6 +89,9 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
|
import ocrmypdf
|
||||||
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
|
|
||||||
mode = settings.OCR_MODE
|
mode = settings.OCR_MODE
|
||||||
|
|
||||||
text_original = get_text_from_pdf(document_path)
|
text_original = get_text_from_pdf(document_path)
|
||||||
@ -223,6 +224,7 @@ def strip_excess_whitespace(text):
|
|||||||
|
|
||||||
|
|
||||||
def get_text_from_pdf(pdf_file):
|
def get_text_from_pdf(pdf_file):
|
||||||
|
import pdftotext
|
||||||
|
|
||||||
if not os.path.isfile(pdf_file):
|
if not os.path.isfile(pdf_file):
|
||||||
return None
|
return None
|
||||||
|
@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
self.assertRaises(ParseError, f)
|
self.assertRaises(ParseError, f)
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
|
def test_image_calc_a4_dpi(self):
|
||||||
def test_image_calc_a4_dpi(self, m):
|
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
|
|
||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
|
dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
|
||||||
|
|
||||||
m.assert_called_once()
|
self.assertEqual(dpi, 62)
|
||||||
|
|
||||||
args, kwargs = m.call_args
|
|
||||||
|
|
||||||
self.assertEqual(kwargs['image_dpi'], 62)
|
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
|
||||||
def test_image_dpi_fail(self, m):
|
def test_image_dpi_fail(self, m):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user