Consolidate get_date onto the DocumentParser parent class

This commit is contained in:
Daniel Quinn 2018-10-07 14:48:49 +01:00
parent 14bb52b6a4
commit 2a3f766b93
5 changed files with 83 additions and 90 deletions

View File

@ -9,6 +9,8 @@ Changelog
for reporting this. `#414`_. for reporting this. `#414`_.
* A bug in the Dockerfile meant that Tesseract language files weren't being * A bug in the Dockerfile meant that Tesseract language files weren't being
installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_. installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
* The ``get_date()`` functionality of the parsers has been consolidated onto
the ``DocumentParser`` class since much of that code was redundant anyway.
2.4.0 2.4.0
===== =====

View File

@ -1,9 +1,12 @@
import logging import logging
import os
import re
import shutil import shutil
import tempfile import tempfile
import re
import dateparser
from django.conf import settings from django.conf import settings
from django.utils import timezone
# This regular expression will try to find dates in the document at # This regular expression will try to find dates in the document at
# hand and will match the following formats: # hand and will match the following formats:
@ -32,6 +35,7 @@ class DocumentParser:
""" """
SCRATCH = settings.SCRATCH_DIR SCRATCH = settings.SCRATCH_DIR
DATE_ORDER = settings.DATE_ORDER
def __init__(self, path): def __init__(self, path):
self.document_path = path self.document_path = path
@ -55,7 +59,52 @@ class DocumentParser:
""" """
Returns the date of the document. Returns the date of the document.
""" """
raise NotImplementedError()
date = None
date_string = None
try:
text = self.get_text()
except ParseError:
return None
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = dateparser.parse(
date_string,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
break
else:
date = None
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
date_string
)
)
else:
self.log("info", "Unable to detect date for document")
return date
def log(self, level, message): def log(self, level, message):
getattr(self.logger, level)(message, extra={ getattr(self.logger, level)(message, extra={

View File

@ -4,7 +4,6 @@ import re
import subprocess import subprocess
from multiprocessing.pool import Pool from multiprocessing.pool import Pool
import dateparser
import langdetect import langdetect
import pyocr import pyocr
from django.conf import settings from django.conf import settings
@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError from pyocr.tesseract import TesseractError
import pdftotext import pdftotext
from documents.parsers import DocumentParser, ParseError, DATE_REGEX from documents.parsers import DocumentParser, ParseError
from .languages import ISO639 from .languages import ISO639
@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS OCR_ALWAYS = settings.OCR_ALWAYS
@ -202,51 +200,6 @@ class RasterisedDocumentParser(DocumentParser):
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text return text
def get_date(self):
    """
    Returns the date of the document, or ``None`` when none is found.

    The text from ``get_text()`` is scanned for ``DATE_REGEX`` matches.
    The first match that ``dateparser`` parses into a date whose year is
    later than 1900 is returned.
    """
    try:
        content = self.get_text()
    except ParseError:
        # Without text there is nothing to search
        return None

    found = None
    source_string = None

    # Walk the regex matches in order and stop at the first usable one
    for match in re.finditer(DATE_REGEX, content):
        source_string = match.group(0)

        try:
            candidate = dateparser.parse(
                source_string,
                settings={
                    "DATE_ORDER": self.DATE_ORDER,
                    "PREFER_DAY_OF_MONTH": "first",
                    "RETURN_AS_TIMEZONE_AWARE": True
                }
            )
        except TypeError:
            # Skip all matches that do not parse to a proper date
            continue

        if candidate is not None and candidate.year > 1900:
            found = candidate
            break

    if found is None:
        self.log("info", "Unable to detect date for document")
        return None

    self.log(
        "info",
        "Detected document date {} based on string {}".format(
            found.isoformat(),
            source_string
        )
    )
    return found
def run_convert(*args): def run_convert(*args):

View File

@ -393,7 +393,33 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH SCRATCH
) )
def test_crazy_date(self, *args): def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="01-07-2350 00:00:00"
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SCRATCH
)
def test_crazy_date_future(self, *args):
    # The mocked text only contains a date in the year 2350, so no date
    # should be reported for the document.
    parser = RasterisedDocumentParser("/dev/null")
    parser.get_text()
    detected = parser.get_date()
    self.assertIsNone(detected)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="01-07-0590 00:00:00"
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SCRATCH
)
def test_crazy_date_distant_past(self, *args):
    # Named differently from the other ``test_crazy_date_past`` in this
    # class: two methods with the same name would leave only the later
    # definition bound, and the earlier test would silently never run.
    # NOTE(review): if the earlier test uses the same mocked text, one of
    # the two can simply be removed instead.
    #
    # The mocked text only contains a date in the year 590, so no date
    # should be reported for the document.
    document = RasterisedDocumentParser("/dev/null")
    document.get_text()
    self.assertIsNone(document.get_date())

View File

@ -1,11 +1,9 @@
import os import os
import re
import subprocess import subprocess
import dateparser
from django.conf import settings from django.conf import settings
from documents.parsers import DocumentParser, ParseError, DATE_REGEX from documents.parsers import DocumentParser, ParseError
class TextDocumentParser(DocumentParser): class TextDocumentParser(DocumentParser):
@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
CONVERT = settings.CONVERT_BINARY CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS OCR_ALWAYS = settings.OCR_ALWAYS
@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
def get_thumbnail(self): def get_thumbnail(self):
""" """
The thumbnail of a txt is just a 500px wide image of the text The thumbnail of a text file is just a 500px wide image of the text
rendered onto a letter-sized page. rendered onto a letter-sized page.
""" """
# The below is heavily cribbed from https://askubuntu.com/a/590951 # The below is heavily cribbed from https://askubuntu.com/a/590951
@ -84,40 +81,6 @@ class TextDocumentParser(DocumentParser):
return self._text return self._text
def get_date(self):
    """
    Returns the date of the document, or ``None`` when none is found.

    The text from ``get_text()`` is scanned for ``DATE_REGEX`` matches and
    each match is handed to ``dateparser``. The first result whose year is
    later than 1900 is returned; implausible years (for example a stray
    "0590") are skipped rather than accepted as the document date.
    """
    date = None
    datestring = None

    try:
        text = self.get_text()
    except ParseError:
        return None

    # Iterate through all regex matches and try to parse the date
    for m in re.finditer(DATE_REGEX, text):
        datestring = m.group(0)

        try:
            date = dateparser.parse(
                datestring,
                settings={
                    "DATE_ORDER": self.DATE_ORDER,
                    "PREFER_DAY_OF_MONTH": "first",
                    "RETURN_AS_TIMEZONE_AWARE": True
                }
            )
        except TypeError:
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None and date.year > 1900:
            break
    else:
        # No match passed the check; discard any rejected candidate that
        # is still bound from an earlier iteration.
        date = None

    if date is not None:
        self.log(
            "info",
            "Detected document date {} based on string {}".format(
                date.isoformat(),
                datestring
            )
        )
    else:
        self.log("info", "Unable to detect date for document")

    return date
def run_command(*args): def run_command(*args):
environment = os.environ.copy() environment = os.environ.copy()