mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
fixes #949: change to MIME detection for files
Signed-off-by: Florian Brandes <florian.brandes@posteo.de>
This commit is contained in:
parent
c7412deb77
commit
f8adfa9873
@ -4,6 +4,7 @@ import shutil
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
|
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
|
||||||
|
|
||||||
|
import magic
|
||||||
import tqdm
|
import tqdm
|
||||||
from asgiref.sync import async_to_sync
|
from asgiref.sync import async_to_sync
|
||||||
from channels.layers import get_channel_layer
|
from channels.layers import get_channel_layer
|
||||||
@ -95,19 +96,33 @@ def barcode_reader(image) -> List[str]:
|
|||||||
return barcodes
|
return barcodes
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_type(path: str) -> str:
    """
    Look up the MIME type of the file located at ``path``.

    The lookup is delegated to ``magic.from_file`` with ``mime=True``; the
    result is written to the debug log before being handed back.

    Returns the detected MIME type string.
    """
    detected = magic.from_file(path, mime=True)
    logger.debug(f"Detected mime type: {detected}")
    return detected
|
||||||
|
|
||||||
|
|
||||||
def convert_from_tiff_to_pdf(filepath: str) -> str:
|
def convert_from_tiff_to_pdf(filepath: str) -> str:
|
||||||
"""
|
"""
|
||||||
converts a given TIFF image file to pdf into a temp. directory.
|
converts a given TIFF image file to pdf into a temporary directory.
|
||||||
|
|
||||||
Returns the new pdf file.
|
Returns the new pdf file.
|
||||||
"""
|
"""
|
||||||
file_name = os.path.splitext(os.path.basename(filepath))[0]
|
file_name = os.path.splitext(os.path.basename(filepath))[0]
|
||||||
file_extension = os.path.splitext(os.path.basename(filepath))[1].lower()
|
mime_type = get_file_type(filepath)
|
||||||
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
# use old file name with pdf extension
|
# use old file name with pdf extension
|
||||||
if file_extension == ".tif" or file_extension == ".tiff":
|
if mime_type == "image/tiff":
|
||||||
newpath = os.path.join(tempdir, file_name + ".pdf")
|
newpath = os.path.join(tempdir, file_name + ".pdf")
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Cannot convert from {str(file_extension)} to pdf.")
|
logger.warning(
|
||||||
|
f"Cannot convert mime type {str(mime_type)} from {str(filepath)} to pdf.",
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
with Image.open(filepath) as image:
|
with Image.open(filepath) as image:
|
||||||
images = []
|
images = []
|
||||||
@ -231,17 +246,17 @@ def consume_file(
|
|||||||
document_list = []
|
document_list = []
|
||||||
converted_tiff = None
|
converted_tiff = None
|
||||||
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
|
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
|
||||||
supported_extensions = [".pdf", ".tiff", ".tif"]
|
supported_mime = ["image/tiff", "application/pdf"]
|
||||||
else:
|
else:
|
||||||
supported_extensions = [".pdf"]
|
supported_mime = ["application/pdf"]
|
||||||
file_extension = os.path.splitext(os.path.basename(path))[1].lower()
|
mime_type = get_file_type(path)
|
||||||
if file_extension not in supported_extensions:
|
if mime_type not in supported_mime:
|
||||||
# if not supported, skip this routine
|
# if not supported, skip this routine
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Unsupported file format for barcode reader: {str(file_extension)}",
|
f"Unsupported file format for barcode reader: {str(mime_type)}",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if file_extension in {".tif", ".tiff"}:
|
if mime_type == "image/tiff":
|
||||||
file_to_process = convert_from_tiff_to_pdf(path)
|
file_to_process = convert_from_tiff_to_pdf(path)
|
||||||
else:
|
else:
|
||||||
file_to_process = path
|
file_to_process = path
|
||||||
|
@ -204,6 +204,34 @@ class TestTasks(DirectoriesMixin, TestCase):
|
|||||||
img = Image.open(test_file)
|
img = Image.open(test_file)
|
||||||
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
|
self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"])
|
||||||
|
|
||||||
|
def test_get_mime_type(self):
    """
    get_file_type must report the expected MIME type for the sample
    TIFF, PDF and PNG files, and for copies of the TIFF and PDF that
    have been stored in the scratch directory without any extension.
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    tiff_file = os.path.join(samples_dir, "simple.tiff")
    pdf_file = os.path.join(samples_dir, "simple.pdf")
    png_file = os.path.join(samples_dir, "barcodes", "barcode-128-custom.png")

    # Extension-less copies: detection must not rely on the file name.
    tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
    pdf_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile2")
    shutil.copy(tiff_file, tiff_file_no_extension)
    shutil.copy(pdf_file, pdf_file_no_extension)

    # Checked in order; the first mismatch fails the test.
    expectations = (
        (tiff_file, "image/tiff"),
        (pdf_file, "application/pdf"),
        (tiff_file_no_extension, "image/tiff"),
        (pdf_file_no_extension, "application/pdf"),
        (png_file, "image/png"),
    )
    for candidate, expected_mime in expectations:
        self.assertEqual(tasks.get_file_type(candidate), expected_mime)
|
||||||
|
|
||||||
def test_convert_from_tiff_to_pdf(self):
|
def test_convert_from_tiff_to_pdf(self):
|
||||||
test_file = os.path.join(
|
test_file = os.path.join(
|
||||||
os.path.dirname(__file__),
|
os.path.dirname(__file__),
|
||||||
@ -469,7 +497,7 @@ class TestTasks(DirectoriesMixin, TestCase):
|
|||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
cm.output,
|
cm.output,
|
||||||
[
|
[
|
||||||
"WARNING:paperless.tasks:Unsupported file format for barcode reader: .jpg",
|
"WARNING:paperless.tasks:Unsupported file format for barcode reader: image/jpeg",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
m.assert_called_once()
|
m.assert_called_once()
|
||||||
@ -481,6 +509,26 @@ class TestTasks(DirectoriesMixin, TestCase):
|
|||||||
self.assertIsNone(kwargs["override_document_type_id"])
|
self.assertIsNone(kwargs["override_document_type_id"])
|
||||||
self.assertIsNone(kwargs["override_tag_ids"])
|
self.assertIsNone(kwargs["override_tag_ids"])
|
||||||
|
|
||||||
|
@override_settings(
    CONSUMER_ENABLE_BARCODES=True,
    CONSUMER_BARCODE_TIFF_SUPPORT=True,
)
def test_consume_barcode_supported_no_extension_file(self):
    """
    With barcode handling and TIFF support both switched on, a TIFF
    sample that is copied to a file name without an extension must
    still be consumed and split.
    """
    barcode_samples = os.path.join(
        os.path.dirname(__file__),
        "samples",
        "barcodes",
    )
    tiff_sample = os.path.join(barcode_samples, "patch-code-t-middle.tiff")

    # Drop the extension by copying the sample under a bare file name.
    extensionless_copy = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")
    shutil.copy(tiff_sample, extensionless_copy)

    outcome = tasks.consume_file(extensionless_copy)
    self.assertEqual(outcome, "File successfully split")
|
||||||
|
|
||||||
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
@mock.patch("documents.tasks.sanity_checker.check_sanity")
|
||||||
def test_sanity_check_success(self, m):
|
def test_sanity_check_success(self, m):
|
||||||
m.return_value = SanityCheckMessages()
|
m.return_value = SanityCheckMessages()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user