diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index eca0c1c68..ccc3ee250 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,4 +1,5 @@
 import datetime
+import hashlib
 import logging
 import tempfile
 import uuid
@@ -101,6 +102,14 @@ class Consumer(object):
             if self._is_ready(doc):
                 continue
 
+            if self._is_duplicate(doc):
+                self.log(
+                    "info",
+                    "Skipping {} as it appears to be a duplicate".format(doc)
+                )
+                self._ignore.append(doc)
+                continue
+
             self.logging_group = uuid.uuid4()
 
             self.log("info", "Consuming {}".format(doc))
@@ -340,6 +349,20 @@ class Consumer(object):
 
         return False
 
+    @staticmethod
+    def _is_duplicate(doc):
+        """
+        Return True if a Document whose checksum equals the MD5 hex digest
+        of the file at ``doc`` already exists.
+        """
+        md5 = hashlib.md5()
+        with open(doc, "rb") as f:
+            # Hash in fixed-size chunks so the whole file is never held in
+            # memory at once; the digest is identical to a one-shot read.
+            for chunk in iter(lambda: f.read(65536), b""):
+                md5.update(chunk)
+        return Document.objects.filter(checksum=md5.hexdigest()).exists()
+
 
 def image_to_string(args):
     img, lang = args