move date-matching regex pattern to base parser module for use by all subclasses

This commit is contained in:
Joshua Taillon 2018-09-05 21:13:36 -04:00
parent 23bf79274c
commit 72c828170e
3 changed files with 17 additions and 31 deletions

View File

@ -1,9 +1,24 @@
import logging import logging
import shutil import shutil
import tempfile import tempfile
import re
from django.conf import settings from django.conf import settings
# Pre-compiled matcher for date-like substrings in document text, kept in
# the base parser module so every parser subclass can import it.
#
# It is a single alternation of four sub-patterns, each anchored on word
# boundaries:
#   1. Numeric dates: 1 or 2 digits, a separator, 1 or 2 digits, a
#      separator, then a 4- or 2-digit year. Each separator is one of
#      '.', '/' or '-' (e.g. 05.09.2018, 5/9/18, 05-09-2018).
#   2. "XX. MONTH ZZZZ": 1 or 2 digits, one or more dots/spaces, a month
#      token of 3 to 9 non-space characters, a space, then a 4- or 2-digit
#      year (e.g. "5. September 2018").
#   3. "MONTH XX, ZZZZ": a month word of 3 to 9 word characters that are
#      neither digits nor underscores, a space, 1 or 2 digits, a comma and
#      space, then a 4-digit year (e.g. "September 5, 2018").
#   4. "MONTH ZZZZ": the same kind of month word, a space, then a 4-digit
#      year (e.g. "September 2018").
#
# The regex only locates candidates; it does not check that a match is a
# real calendar date.
pattern = re.compile(
    # 1. numeric: XX.YY.ZZZZ / XX/YY/ZZZZ / XX-YY-ZZZZ
    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|'
    # 2. XX. MONTH ZZZZ
    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|'
    # 3. MONTH XX, ZZZZ
    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|'
    # 4. MONTH ZZZZ
    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception): class ParseError(Exception):
pass pass

View File

@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError from pyocr.tesseract import TesseractError
import pdftotext import pdftotext
from documents.parsers import DocumentParser, ParseError from documents.parsers import DocumentParser, ParseError, pattern
from .languages import ISO639 from .languages import ISO639
@ -210,20 +210,6 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError as e: except ParseError as e:
return None return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date # Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text): for m in re.finditer(pattern, text):
datestring = m.group(0) datestring = m.group(0)

View File

@ -5,7 +5,7 @@ import subprocess
import dateparser import dateparser
from django.conf import settings from django.conf import settings
from documents.parsers import DocumentParser, ParseError from documents.parsers import DocumentParser, ParseError, pattern
class TextDocumentParser(DocumentParser): class TextDocumentParser(DocumentParser):
@ -13,7 +13,6 @@ class TextDocumentParser(DocumentParser):
This parser directly parses a text document (.txt, .md, or .csv) This parser directly parses a text document (.txt, .md, or .csv)
""" """
CONVERT = settings.CONVERT_BINARY CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY UNPAPER = settings.UNPAPER_BINARY
@ -94,20 +93,6 @@ class TextDocumentParser(DocumentParser):
except ParseError as e: except ParseError as e:
return None return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date # Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text): for m in re.finditer(pattern, text):
datestring = m.group(0) datestring = m.group(0)