diff --git a/src/documents/admin.py b/src/documents/admin.py index 209ddff35..5b3975fda 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin): search_fields = ("correspondent__name", "title", "content", "tags__name") - readonly_fields = ("added", "file_type", "storage_type", "filename") + readonly_fields = ("added", "mime_type", "storage_type", "filename") list_display = ( "title", "created", @@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin): "correspondent", "tags_", "archive_serial_number", - "document_type", - "filename" + "document_type" ) list_filter = ( "document_type", diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 3cd57796e..b8eb8cfca 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -2,8 +2,8 @@ import datetime import hashlib import logging import os -import re +import magic from django.conf import settings from django.db import transaction from django.utils import timezone @@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError from .file_handling import generate_filename, create_source_path_directory from .loggers import LoggingMixin from .models import Document, FileInfo, Correspondent, DocumentType, Tag -from .parsers import ParseError, get_parser_class +from .parsers import ParseError, get_parser_class_for_mime_type from .signals import ( document_consumption_finished, document_consumption_started @@ -51,12 +51,6 @@ class Consumer(LoggingMixin): "Consumption directory {} does not exist".format( settings.CONSUMPTION_DIR)) - def pre_check_regex(self): - if not re.match(FileInfo.REGEXES["title"], self.filename): - raise ConsumerError( - "Filename {} does not seem to be safe to " - "consume".format(self.filename)) - def pre_check_duplicate(self): with open(self.path, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() @@ -100,18 +94,19 @@ class Consumer(LoggingMixin): self.pre_check_file_exists() self.pre_check_consumption_dir() self.pre_check_directories() - self.pre_check_regex() self.pre_check_duplicate() self.log("info", "Consuming {}".format(self.filename)) # Determine the parser class. - parser_class = get_parser_class(self.filename) + mime_type = magic.from_file(self.path, mime=True) + + parser_class = get_parser_class_for_mime_type(mime_type) if not parser_class: raise ConsumerError("No parsers abvailable for {}".format(self.filename)) else: - self.log("debug", "Parser: {}".format(parser_class.__name__)) + self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type)) # Notify all listeners that we're going to do some work. @@ -162,7 +157,8 @@ class Consumer(LoggingMixin): # store the document. document = self._store( text=text, - date=date + date=date, + mime_type=mime_type ) # If we get here, it was successful. Proceed with post-consume @@ -197,7 +193,7 @@ class Consumer(LoggingMixin): return document - def _store(self, text, date): + def _store(self, text, date, mime_type): # If someone gave us the original filename, use it instead of doc. @@ -220,7 +216,7 @@ class Consumer(LoggingMixin): correspondent=file_info.correspondent, title=file_info.title, content=text, - file_type=file_info.extension, + mime_type=mime_type, checksum=hashlib.md5(f.read()).hexdigest(), created=created, modified=created, diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index 024003118..06d4d2957 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -91,9 +91,9 @@ def generate_filename(document): # Always append the primary key to guarantee uniqueness of filename if len(path) > 0: - filename = "%s-%07i.%s" % (path, document.pk, document.file_type) + filename = "%s-%07i%s" % (path, document.pk, document.file_type) else: - filename = "%07i.%s" % (document.pk, document.file_type) + filename = "%07i%s" % (document.pk, document.file_type) # Append .gpg for encrypted files if document.storage_type == document.STORAGE_TYPE_GPG: diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 971e6a829..441f1c475 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand): tags = ",".join([t.slug for t in doc.tags.all()]) if tags: - return "{} - {} - {} - {}.{}".format( + return "{} - {} - {} - {}{}".format( created, doc.correspondent, doc.title, tags, doc.file_type) - return "{} - {} - {}.{}".format( + return "{} - {} - {}{}".format( created, doc.correspondent, doc.title, doc.file_type) diff --git a/src/documents/migrations/1003_mime_types.py b/src/documents/migrations/1003_mime_types.py new file mode 100644 index 000000000..4c73a4235 --- /dev/null +++ b/src/documents/migrations/1003_mime_types.py @@ -0,0 +1,50 @@ +# Generated by Django 3.1.3 on 2020-11-20 11:21 +import os + +import magic +from django.conf import settings +from django.db import migrations, models + + +def source_path(self): + if self.filename: + fname = str(self.filename) + else: + fname = "{:07}.{}".format(self.pk, self.file_type) + if self.storage_type == self.STORAGE_TYPE_GPG: + fname += ".gpg" + + return os.path.join( + settings.ORIGINALS_DIR, + fname + ) + + +def add_mime_types(apps, schema_editor): + Document = apps.get_model("documents", "Document") + documents = Document.objects.all() + + for d in documents: + d.mime_type = magic.from_file(source_path(d), mime=True) + d.save() + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1002_auto_20201111_1105'), + ] + + operations = [ + migrations.AddField( + model_name='document', + name='mime_type', + field=models.CharField(default="-", editable=False, max_length=256), + preserve_default=False, + ), + migrations.RunPython(add_mime_types), + migrations.RemoveField( + model_name='document', + name='file_type', + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 4badd2d56..559c395e0 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,6 +1,7 @@ # coding=utf-8 import logging +import mimetypes import os import re from collections import OrderedDict @@ -113,18 +114,6 @@ class DocumentType(MatchingModel): class Document(models.Model): - # TODO: why do we need an explicit list - TYPE_PDF = "pdf" - TYPE_PNG = "png" - TYPE_JPG = "jpg" - TYPE_GIF = "gif" - TYPE_TIF = "tiff" - TYPE_TXT = "txt" - TYPE_CSV = "csv" - TYPE_MD = "md" - TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, - TYPE_TXT, TYPE_CSV, TYPE_MD) - STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" STORAGE_TYPES = ( @@ -156,10 +145,9 @@ class Document(models.Model): "primarily used for searching." ) - file_type = models.CharField( - max_length=4, - editable=False, - choices=tuple([(t, t.upper()) for t in TYPES]) + mime_type = models.CharField( + max_length=256, + editable=False ) tags = models.ManyToManyField( @@ -223,7 +211,7 @@ class Document(models.Model): if self.filename: fname = str(self.filename) else: - fname = "{:07}.{}".format(self.pk, self.file_type) + fname = "{:07}{}".format(self.pk, self.file_type) if self.storage_type == self.STORAGE_TYPE_GPG: fname += ".gpg" @@ -238,7 +226,11 @@ class Document(models.Model): @property def file_name(self): - return slugify(str(self)) + "." + self.file_type + return slugify(str(self)) + self.file_type + + @property + def file_type(self): + return mimetypes.guess_extension(str(self.mime_type)) @property def thumbnail_path(self): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 496efa188..98f4c5b12 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -6,6 +6,7 @@ import subprocess import tempfile import dateparser +import magic from django.conf import settings from django.utils import timezone @@ -37,10 +38,11 @@ DATE_REGEX = re.compile( logger = logging.getLogger(__name__) -def get_parser_class(doc): - """ - Determine the appropriate parser class based on the file - """ +def is_mime_type_supported(mime_type): + return get_parser_class_for_mime_type(mime_type) is not None + + +def get_parser_class_for_mime_type(mime_type): options = [] @@ -48,9 +50,9 @@ def get_parser_class(doc): for response in document_consumer_declaration.send(None): parser_declaration = response[1] - parser_test = parser_declaration["test"] + supported_mime_types = parser_declaration["mime_types"] - if parser_test(doc): + if mime_type in supported_mime_types: options.append(parser_declaration) if not options: @@ -61,6 +63,16 @@ def get_parser_class(doc): options, key=lambda _: _["weight"], reverse=True)[0]["parser"] +def get_parser_class(path): + """ + Determine the appropriate parser class based on the file + """ + + mime_type = magic.from_file(path, mime=True) + + return get_parser_class_for_mime_type(mime_type) + + def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index e42e26881..cf48e8bd7 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer): "document_type_id", "title", "content", - "file_type", + "mime_type", "tags", "tags_id", "checksum", diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index a049fb825..b0318d2b3 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase): dt = DocumentType.objects.create(name="dt", pk=63) tag = Tag.objects.create(name="t", pk=85) - doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123") + doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf") doc.tags.add(tag) @@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase): with open(filename, "wb") as f: f.write(content) - doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf") + doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf") with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: f.write(content_thumbnail) @@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase): def test_document_actions_not_existing_file(self): - doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf") + doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) self.assertEqual(response.status_code, 404) @@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase): def test_document_filters(self): - doc1 = Document.objects.create(title="none1", checksum="A") - doc2 = Document.objects.create(title="none2", checksum="B") - doc3 = Document.objects.create(title="none3", checksum="C") + doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf") + doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf") + doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf") tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True) tag_2 = Tag.objects.create(name="t2") diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index f61fd5718..a89bd75ae 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -437,6 +437,18 @@ class FaultyParser(DocumentParser): raise ParseError("Does not compute.") +def fake_magic_from_file(file, mime=False): + + if mime: + if os.path.splitext(file)[1] == ".pdf": + return "application/pdf" + else: + return "unknown" + else: + return "A verbose string that describes the contents of the file" + + +@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) class TestConsumer(TestCase): def make_dummy_parser(self, path, logging_group): @@ -462,7 +474,7 @@ class TestConsumer(TestCase): m = patcher.start() m.return_value = [(None, { "parser": self.make_dummy_parser, - "test": lambda _: True, + "mime_types": ["application/pdf"], "weight": 0 })] @@ -592,7 +604,7 @@ class TestConsumer(TestCase): def testFaultyParser(self, m): m.return_value = [(None, { "parser": self.make_faulty_parser, - "test": lambda _: True, + "mime_types": ["application/pdf"], "weight": 0 })] diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py index 2da674527..5b27e2643 100644 --- a/src/documents/tests/test_document_model.py +++ b/src/documents/tests/test_document_model.py @@ -13,9 +13,12 @@ class TestDocument(TestCase): title="Title", content="content", checksum="checksum", + mime_type="application/pdf" ) + file_path = document.source_path thumb_path = document.thumbnail_path + with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: document.delete() mock_unlink.assert_any_call(file_path) diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index d44e5056a..5ffd35f61 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -31,7 +31,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="") def test_generate_source_filename(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -44,7 +44,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -81,7 +81,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming_missing_permissions(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -111,10 +111,10 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_file_renaming_database_error(self): - document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") + document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.checksum = "BBBBB" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -149,7 +149,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_document_delete(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -170,7 +170,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_document_delete_nofile(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -179,7 +179,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") def test_directory_not_empty(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -206,7 +206,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_underscore(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -222,7 +222,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_with_dash(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -238,7 +238,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") def test_tags_malformed(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -254,7 +254,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") def test_tags_all(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -269,7 +269,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}") def test_tags_out_of_bounds(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -284,7 +284,7 @@ class TestDate(TestCase): @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}") def test_nested_directory_cleanup(self): document = Document() - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.save() @@ -309,7 +309,7 @@ class TestDate(TestCase): def test_format_none(self): document = Document() document.pk = 1 - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED self.assertEqual(generate_filename(document), "0000001.pdf") @@ -335,7 +335,7 @@ class TestDate(TestCase): def test_invalid_format(self): document = Document() document.pk = 1 - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED self.assertEqual(generate_filename(document), "0000001.pdf") @@ -344,7 +344,7 @@ class TestDate(TestCase): def test_invalid_format_key(self): document = Document() document.pk = 1 - document.file_type = "pdf" + document.mime_type = "application/pdf" document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED self.assertEqual(generate_filename(document), "0000001.pdf") diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 93601b9d2..24e285ae7 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase): TestCase.setUp(self) User.objects.create_user(username='test_consumer', password='12345') self.doc_contains = Document.objects.create( - content="I contain the keyword.", file_type="pdf") + content="I contain the keyword.", mime_type="application/pdf") def test_tag_applied_any(self): t1 = Tag.objects.create( diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 5896f3ba3..e99bb8dc6 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,3 +1,4 @@ +import os from tempfile import TemporaryDirectory from unittest import mock @@ -5,7 +6,18 @@ from django.test import TestCase from documents.parsers import get_parser_class +def fake_magic_from_file(file, mime=False): + if mime: + if os.path.splitext(file)[1] == ".pdf": + return "application/pdf" + else: + return "unknown" + else: + return "A verbose string that describes the contents of the file" + + +@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file) class TestParserDiscovery(TestCase): @mock.patch("documents.parsers.document_consumer_declaration.send") @@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase): pass m.return_value = ( - (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}), + (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}), ) self.assertEqual( @@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase): pass m.return_value = ( - (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}), - (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}), + (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}), + (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}), ) self.assertEqual( diff --git a/src/documents/views.py b/src/documents/views.py index f4c5d0797..89d03a4df 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin, return super(DocumentViewSet, self).destroy(request, *args, **kwargs) def file_response(self, pk, disposition): - # TODO: this should not be necessary here. - content_types = { - Document.TYPE_PDF: "application/pdf", - Document.TYPE_PNG: "image/png", - Document.TYPE_JPG: "image/jpeg", - Document.TYPE_GIF: "image/gif", - Document.TYPE_TIF: "image/tiff", - Document.TYPE_CSV: "text/csv", - Document.TYPE_MD: "text/markdown", - Document.TYPE_TXT: "text/plain" - } - doc = Document.objects.get(id=pk) if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: @@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin, else: file_handle = GnuPG.decrypted(doc.source_file) - response = HttpResponse(file_handle, content_type=content_types[doc.file_type]) + response = HttpResponse(file_handle, content_type=doc.mime_type) response["Content-Disposition"] = '{}; filename="{}"'.format( disposition, doc.file_name) return response diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index b942e420a..1aea65d90 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ from documents.loggers import LoggingMixin from documents.models import Correspondent +from documents.parsers import is_mime_type_supported from paperless_mail.models import MailAccount, MailRule @@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin): title = get_title(message, att, rule) - # TODO: check with parsers what files types are supported - if att.content_type == 'application/pdf': + if is_mime_type_supported(att.content_type): os.makedirs(settings.SCRATCH_DIR, exist_ok=True) _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 3fc6c2a11..712034038 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -1,5 +1,3 @@ -import re - from .parsers import RasterisedDocumentParser @@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs): return { "parser": RasterisedDocumentParser, "weight": 0, - "test": tesseract_consumer_test + "mime_types": [ + "application/pdf", + "image/jpeg", + "image/png" + ] } - - -MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") - - -def tesseract_consumer_test(doc): - return MATCHING_FILES.match(doc.lower()) diff --git a/src/paperless_tesseract/tests/test_signals.py b/src/paperless_tesseract/tests/test_signals.py deleted file mode 100644 index 354557732..000000000 --- a/src/paperless_tesseract/tests/test_signals.py +++ /dev/null @@ -1,36 +0,0 @@ -from django.test import TestCase - -from paperless_tesseract.signals import tesseract_consumer_test - - -class SignalsTestCase(TestCase): - - def test_test_handles_various_file_names_true(self): - - prefixes = ( - "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags", - "A document with a . in it", "Doc with -- in it" - ) - suffixes = ( - "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp", - "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP", - "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp", - ) - - for prefix in prefixes: - for suffix in suffixes: - name = "{}.{}".format(prefix, suffix) - self.assertTrue(tesseract_consumer_test(name)) - - def test_test_handles_various_file_names_false(self): - - prefixes = ("doc",) - suffixes = ("txt", "markdown", "",) - - for prefix in prefixes: - for suffix in suffixes: - name = "{}.{}".format(prefix, suffix) - self.assertFalse(tesseract_consumer_test(name)) - - self.assertFalse(tesseract_consumer_test("")) - self.assertFalse(tesseract_consumer_test("doc")) diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index 784bfd45d..f9ac9ad23 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -1,5 +1,3 @@ -import re - from .parsers import TextDocumentParser @@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs): return { "parser": TextDocumentParser, "weight": 10, - "test": text_consumer_test + "mime_types": [ + "text/plain", + "text/comma-separated-values" + ] } - - -MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") - - -def text_consumer_test(doc): - return MATCHING_FILES.match(doc.lower())