diff --git a/.travis.yml b/.travis.yml index 41abf71ee..ccb5e2c2a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: python before_install: - sudo apt-get update -qq -- sudo apt-get install -qq libpoppler-cpp-dev +- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng sudo: false diff --git a/requirements.txt b/requirements.txt index 04c8f9a7c..c3875b3b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ python-dateutil>=2.6.0 python-dotenv>=0.6.2 python-gnupg>=0.3.9 pytz>=2016.10 +dateparser>=0.6.0 gunicorn==19.7.1 pdftotext>=2.0.1 diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b4f300400..b7f7f61e7 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -118,12 +118,14 @@ class Consumer(object): parsed_document = parser_class(doc) thumbnail = parsed_document.get_thumbnail() + date = parsed_document.get_date() try: document = self._store( parsed_document.get_text(), doc, - thumbnail + thumbnail, + date ) except ParseError as e: @@ -174,7 +176,7 @@ class Consumer(object): return sorted( options, key=lambda _: _["weight"], reverse=True)[0]["parser"] - def _store(self, text, doc, thumbnail): + def _store(self, text, doc, thumbnail, date): file_info = FileInfo.from_path(doc) @@ -182,7 +184,7 @@ class Consumer(object): self.log("debug", "Saving record to database") - created = file_info.created or timezone.make_aware( + created = file_info.created or date or timezone.make_aware( datetime.datetime.fromtimestamp(stats.st_mtime)) with open(doc, "rb") as f: diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 9f63cbbcd..c11501487 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -35,6 +35,12 @@ class DocumentParser(object): """ raise NotImplementedError() + def get_date(self): + """ + Returns the date of the document. 
def get_text(self):
    """
    Return the text of the document, computing it at most once.

    The result is remembered in ``self.TEXT_CACHE`` so later calls return
    the stored value without extracting the text again.

    Unless ``OCR_ALWAYS`` is set, a document that ``_is_ocred()`` reports
    as already containing text is read with ``get_text_from_pdf``.
    Everything else is converted to greyscale images and run through OCR.

    Raises ``ParseError`` when the OCR step fails with an ``OCRError``.
    """
    cached = self.TEXT_CACHE
    if cached is not None:
        return cached

    has_embedded_text = not self.OCR_ALWAYS and self._is_ocred()

    if has_embedded_text:
        self.log("info", "Skipping OCR, using Text from PDF")
        extracted = get_text_from_pdf(self.document_path)
    else:
        # The greyscale conversion stays outside the try block: only
        # failures of the OCR step itself are turned into a ParseError.
        images = self._get_greyscale()
        try:
            extracted = self._get_ocr(images)
        except OCRError as e:
            raise ParseError(e)

    self.TEXT_CACHE = extracted
    return extracted
def get_date(self):
    """
    Try to detect the date of the document from its text.

    The text returned by ``get_text()`` is searched for date-like strings.
    Every candidate is handed to ``dateparser`` using ``self.DATE_ORDER``,
    and the first candidate that actually parses is returned. Candidates
    that look like a date but cannot be parsed are skipped, so they no
    longer hide a valid date further down in the text.

    Returns the parsed ``datetime`` (requested from ``dateparser`` as
    timezone-aware), or ``None`` if the text is empty or contains no
    parsable date.

    Any ``ParseError`` raised by ``get_text()`` is passed on to the caller.
    """
    text = self.get_text()

    # Nothing to search in; this also keeps ``re`` from raising a
    # TypeError if no text could be extracted.
    if not text:
        return None

    # This regular expression will try to find dates in the document at
    # hand and will match the following formats:
    # - XX.YY.ZZZZ with XX and YY being 1 or 2 digits, ZZZZ 2 or 4 digits
    # - XX/YY/ZZZZ with XX and YY being 1 or 2 digits, ZZZZ 2 or 4 digits
    # - XX-YY-ZZZZ with XX and YY being 1 or 2 digits, ZZZZ 2 or 4 digits
    # - XX. MONTH ZZZZ with XX being 1 or 2 digits, ZZZZ 2 or 4 digits
    # - MONTH ZZZZ with ZZZZ being 4 digits
    # MONTH is any run of 3 to 9 non-space characters, so the last two
    # forms also match text that is not a date at all. That is why every
    # match has to be checked with dateparser instead of trusting the
    # first one.
    pattern = (
        r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|'
        r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|'
        r'\b([^ ]{3,9} [0-9]{4})\b'
    )

    parser_settings = {'DATE_ORDER': self.DATE_ORDER,
                       'PREFER_DAY_OF_MONTH': 'first',
                       'RETURN_AS_TIMEZONE_AWARE': True}

    for m in re.finditer(pattern, text):
        date = dateparser.parse(m.group(0), settings=parser_settings)

        # dateparser returns None for strings it cannot interpret;
        # move on to the next candidate in that case.
        if date is not None:
            return date

    return None
b/src/paperless_tesseract/tests/samples/tests_date_2.png differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_3.pdf b/src/paperless_tesseract/tests/samples/tests_date_3.pdf new file mode 100644 index 000000000..0270ae097 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_3.pdf differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_3.png b/src/paperless_tesseract/tests/samples/tests_date_3.png new file mode 100644 index 000000000..7af752cbc Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_3.png differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_4.pdf b/src/paperless_tesseract/tests/samples/tests_date_4.pdf new file mode 100644 index 000000000..e235ad215 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_4.pdf differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_4.png b/src/paperless_tesseract/tests/samples/tests_date_4.png new file mode 100644 index 000000000..c76a55dfb Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_4.png differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_5.pdf b/src/paperless_tesseract/tests/samples/tests_date_5.pdf new file mode 100644 index 000000000..717ea4ef4 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_5.pdf differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_5.png b/src/paperless_tesseract/tests/samples/tests_date_5.png new file mode 100644 index 000000000..97afeec01 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_5.png differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_6.pdf b/src/paperless_tesseract/tests/samples/tests_date_6.pdf new file mode 100644 index 000000000..bda2dc280 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_6.pdf differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_6.png 
class TestDate(TestCase):
    """
    Date detection tests for ``RasterisedDocumentParser.get_date()``.

    Every test loads one file from the ``samples`` directory, extracts its
    text, checks whether the parser considers the file to already contain
    text (``_is_ocred()``) and compares the detected date with the
    expected value.
    """

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    @staticmethod
    def _utc_midnight(year, month, day):
        # Expected dates are all midnight, timezone-aware in UTC.
        return datetime.datetime(year, month, day, 0, 0, tzinfo=tz.tzutc())

    def _check_sample(self, sample, ocred, expected, *, date_order=None):
        # Shared body of all tests. The order of the steps is significant:
        # the text is extracted first, then the date order is overridden
        # on the instance (if requested), then both assertions run.
        input_file = os.path.join(self.SAMPLE_FILES, sample)
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        if date_order is not None:
            document.DATE_ORDER = date_order
        self.assertEqual(document._is_ocred(), ocred)
        self.assertEqual(document.get_date(), expected)

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_1_pdf(self):
        self._check_sample(
            "tests_date_1.pdf", True, self._utc_midnight(2018, 4, 1))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_1_png(self):
        self._check_sample(
            "tests_date_1.png", False, self._utc_midnight(2018, 4, 1))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_2_pdf(self):
        self._check_sample(
            "tests_date_2.pdf", True, self._utc_midnight(2013, 2, 1))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_2_png(self):
        self._check_sample(
            "tests_date_2.png", False, self._utc_midnight(2013, 2, 1))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_3_pdf(self):
        self._check_sample(
            "tests_date_3.pdf", True, self._utc_midnight(2018, 10, 5))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_3_png(self):
        self._check_sample(
            "tests_date_3.png", False, self._utc_midnight(2018, 10, 5))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_4_pdf(self):
        self._check_sample(
            "tests_date_4.pdf", True, self._utc_midnight(2018, 10, 5))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_4_png(self):
        self._check_sample(
            "tests_date_4.png", False, self._utc_midnight(2018, 10, 5))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_5_pdf(self):
        self._check_sample(
            "tests_date_5.pdf", True, self._utc_midnight(2018, 12, 17))

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_5_png(self):
        self._check_sample(
            "tests_date_5.png", False, self._utc_midnight(2018, 12, 17))

    # Sample 6 is checked twice: once with the date order forced to
    # month-day-year, and once with the configured default order, under
    # which no date is expected to be found.

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_6_pdf_us(self):
        self._check_sample(
            "tests_date_6.pdf", True, self._utc_midnight(2018, 12, 17),
            date_order="MDY")

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_6_png_us(self):
        self._check_sample(
            "tests_date_6.png", False, self._utc_midnight(2018, 12, 17),
            date_order="MDY")

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_6_pdf_eu(self):
        self._check_sample("tests_date_6.pdf", True, None)

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_6_png_eu(self):
        self._check_sample("tests_date_6.png", False, None)

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_7_pdf(self):
        self._check_sample(
            "tests_date_7.pdf", True, self._utc_midnight(2018, 4, 1))