From fa4924d5ba804cb79f4cee30c0460d155fb59fa6 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 28 Mar 2017 21:01:50 +0000 Subject: [PATCH] fix: allow for caps in file name suffixes #206 @schinkelg ran afoul of this one and I took the opportunity to add a test to catch this sort of thing for next time. --- docs/changelog.rst | 5 ++ src/documents/consumer.py | 12 ++++- src/documents/tests/test_consumer.py | 48 +++++++++++++++++++ src/paperless_tesseract/signals.py | 4 +- src/paperless_tesseract/tests/test_signals.py | 36 ++++++++++++++ 5 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 src/paperless_tesseract/tests/test_signals.py diff --git a/docs/changelog.rst b/docs/changelog.rst index e995fd8c6..3a86bc745 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,10 @@ Changelog ######### +* 0.4.1 + * Fix for `#206`_ wherein the pluggable parser didn't recognise files with + all-caps suffixes like ``.PDF`` + * 0.4.0 * Introducing reminders. See `#199`_ for more information, but the short explanation is that you can now attach simple notes & times to documents @@ -211,3 +215,4 @@ Changelog .. _#179: https://github.com/danielquinn/paperless/pull/179 .. _#199: https://github.com/danielquinn/paperless/issues/199 .. _#200: https://github.com/danielquinn/paperless/issues/200 +.. 
_#206: https://github.com/danielquinn/paperless/issues/206 diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 65e74f3a8..b4f300400 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -102,7 +102,7 @@ class Consumer(object): parser_class = self._get_parser_class(doc) if not parser_class: self.log( - "info", "No parsers could be found for {}".format(doc)) + "error", "No parsers could be found for {}".format(doc)) self._ignore.append(doc) continue @@ -160,6 +160,16 @@ class Consumer(object): if result: options.append(result) + self.log( + "info", + "Parsers available: {}".format( + ", ".join([str(o["parser"].__name__) for o in options]) + ) + ) + + if not options: + return None + # Return the parser with the highest weight. return sorted( options, key=lambda _: _["weight"], reverse=True)[0]["parser"] diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 1e9f189f7..101655f6c 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,8 +1,56 @@ from django.test import TestCase +from unittest import mock +from ..consumer import Consumer from ..models import FileInfo +class TestConsumer(TestCase): + + class DummyParser(object): + pass + + def test__get_parser_class_1_parser(self): + self.assertEqual( + self._get_consumer()._get_parser_class("doc.pdf"), + self.DummyParser + ) + + @mock.patch("documents.consumer.os.makedirs") + @mock.patch("documents.consumer.os.path.exists", return_value=True) + @mock.patch("documents.consumer.document_consumer_declaration.send") + def test__get_parser_class_n_parsers(self, m, *args): + + class DummyParser1(object): + pass + + class DummyParser2(object): + pass + + m.return_value = ( + (None, lambda _: {"weight": 0, "parser": DummyParser1}), + (None, lambda _: {"weight": 1, "parser": DummyParser2}), + ) + + self.assertEqual(Consumer()._get_parser_class("doc.pdf"), DummyParser2) + + 
@mock.patch("documents.consumer.os.makedirs") + @mock.patch("documents.consumer.os.path.exists", return_value=True) + @mock.patch("documents.consumer.document_consumer_declaration.send") + def test__get_parser_class_0_parsers(self, m, *args): + m.return_value = ((None, lambda _: None),) + self.assertIsNone(Consumer()._get_parser_class("doc.pdf")) + + @mock.patch("documents.consumer.os.makedirs") + @mock.patch("documents.consumer.os.path.exists", return_value=True) + @mock.patch("documents.consumer.document_consumer_declaration.send") + def _get_consumer(self, m, *args): + m.return_value = ( + (None, lambda _: {"weight": 0, "parser": self.DummyParser}), + ) + return Consumer() + + class TestAttributes(TestCase): TAGS = ("tag1", "tag2", "tag3") diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 3e5555383..e1717f105 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser class ConsumerDeclaration(object): - MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$") + MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff?|pnm|bmp)$") @classmethod def handle(cls, sender, **kwargs): @@ -14,7 +14,7 @@ class ConsumerDeclaration(object): @classmethod def test(cls, doc): - if cls.MATCHING_FILES.match(doc): + if cls.MATCHING_FILES.match(doc.lower()): return { "parser": RasterisedDocumentParser, "weight": 0 diff --git a/src/paperless_tesseract/tests/test_signals.py b/src/paperless_tesseract/tests/test_signals.py new file mode 100644 index 000000000..62d286daf --- /dev/null +++ b/src/paperless_tesseract/tests/test_signals.py @@ -0,0 +1,36 @@ +from django.test import TestCase + +from ..signals import ConsumerDeclaration + + +class SignalsTestCase(TestCase): + + def test_test_handles_various_file_names_true(self): + + prefixes = ( + "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags", + "A document with a . 
in it", "Doc with -- in it" + ) + suffixes = ( + "pdf", "jpg", "gif", "png", "tiff", "tif", "pnm", "bmp", + "PDF", "JPG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP", + "pDf", "jPg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp", + ) + + for prefix in prefixes: + for suffix in suffixes: + name = "{}.{}".format(prefix, suffix) + self.assertTrue(ConsumerDeclaration.test(name)) + + def test_test_handles_various_file_names_false(self): + + prefixes = ("doc",) + suffixes = ("txt", "markdown", "",) + + for prefix in prefixes: + for suffix in suffixes: + name = "{}.{}".format(prefix, suffix) + self.assertFalse(ConsumerDeclaration.test(name)) + + self.assertFalse(ConsumerDeclaration.test("")) + self.assertFalse(ConsumerDeclaration.test("doc"))