Merge pull request #291 from BastianPoe/feature/heuristically-extract-date-from-document-text

Add support for a heuristic that extracts the document date from its text
This commit is contained in:
Daniel Quinn 2018-02-03 14:13:14 +01:00 committed by GitHub
commit 4d96551619
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 255 additions and 6 deletions

View File

@ -2,7 +2,7 @@ language: python
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng
sudo: false

View File

@ -13,6 +13,7 @@ python-dateutil>=2.6.0
python-dotenv>=0.6.2
python-gnupg>=0.3.9
pytz>=2016.10
dateparser>=0.6.0
gunicorn==19.7.1
pdftotext>=2.0.1

View File

@ -118,12 +118,14 @@ class Consumer(object):
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
try:
document = self._store(
parsed_document.get_text(),
doc,
thumbnail
thumbnail,
date
)
except ParseError as e:
@ -174,7 +176,7 @@ class Consumer(object):
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def _store(self, text, doc, thumbnail):
def _store(self, text, doc, thumbnail, date):
file_info = FileInfo.from_path(doc)
@ -182,7 +184,7 @@ class Consumer(object):
self.log("debug", "Saving record to database")
created = file_info.created or timezone.make_aware(
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:

View File

@ -35,6 +35,12 @@ class DocumentParser(object):
"""
raise NotImplementedError()
def get_date(self):
"""
Returns the date of the document.
"""
raise NotImplementedError()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group

View File

@ -261,3 +261,6 @@ PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100))
FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START")
FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
# Specify the default date order (for autodetected dates)
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")

View File

@ -3,6 +3,7 @@ import os
import re
import subprocess
from multiprocessing.pool import Pool
import dateparser
import pdftotext
import langdetect
@ -31,8 +32,10 @@ class RasterisedDocumentParser(DocumentParser):
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
TEXT_CACHE = None
def get_thumbnail(self):
"""
@ -60,15 +63,20 @@ class RasterisedDocumentParser(DocumentParser):
return False
def get_text(self):
if self.TEXT_CACHE is not None:
return self.TEXT_CACHE
if not self.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
return get_text_from_pdf(self.document_path)
self.TEXT_CACHE = get_text_from_pdf(self.document_path)
return self.TEXT_CACHE
images = self._get_greyscale()
try:
return self._get_ocr(images)
self.TEXT_CACHE = self._get_ocr(images)
return self.TEXT_CACHE
except OCRError as e:
raise ParseError(e)
@ -191,6 +199,29 @@ class RasterisedDocumentParser(DocumentParser):
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text
def get_date(self):
text = self.get_text()
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ
m = re.search(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^ ]{3,9} [0-9]{4})\b', text)
if m is None:
return None
return dateparser.parse(m.group(0),
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
def run_convert(*args):

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 135 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 138 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 138 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

View File

@ -0,0 +1,206 @@
import os
from unittest import skipIf, mock
import pyocr
from django.test import TestCase
from ..parsers import RasterisedDocumentParser
import datetime
from dateutil import tz
class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_1_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_2_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_2_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_3_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_3_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_4_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_4_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_5_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_5_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_6_pdf_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_6_png_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_6_pdf_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_6_png_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_7_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))