Added test for duplicates

This commit is contained in:
Daniel Quinn 2016-04-03 18:44:00 +01:00
parent 2853545b9d
commit 64b72d4337

View File

@@ -1,4 +1,5 @@
import datetime
import hashlib
import logging
import tempfile
import uuid
@@ -101,6 +102,14 @@ class Consumer(object):
if self._is_ready(doc):
continue
if self._is_duplicate(doc):
self.log(
"info",
"Skipping {} as it appears to be a duplicate".format(doc)
)
self._ignore.append(doc)
continue
self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
@@ -340,6 +349,12 @@ class Consumer(object):
return False
@staticmethod
def _is_duplicate(doc):
    """
    Report whether a document with the same content is already stored.

    The file at ``doc`` is read in binary mode and hashed with MD5; the
    hex digest is then looked up against the ``checksum`` field of
    ``Document``.

    :param doc: path of the file to check.
    :return: the result of ``.exists()`` on the ``Document`` queryset
        filtered by that checksum.
    :raises OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(doc, "rb") as f:
        # Feed the hash in fixed-size chunks so a large document is never
        # held in memory in full; the digest is identical to hashing the
        # whole file in one call.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return Document.objects.filter(checksum=digest.hexdigest()).exists()
def image_to_string(args):
img, lang = args