Rename pattern to DATE_REGEX

When the `pattern` variable was moved to the package level, it lost its
context, so a more descriptive name was needed.
This commit is contained in:
Daniel Quinn 2018-09-09 21:02:30 +01:00
parent ef302abed7
commit c99f5923d5
3 changed files with 7 additions and 6 deletions

View File

@ -13,11 +13,12 @@ from django.conf import settings
# - XX. MONTH ZZZZ with XX being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits
pattern = re.compile(
DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception):

View File

@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError
import pdftotext
from documents.parsers import DocumentParser, ParseError, pattern
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from .languages import ISO639
@ -211,7 +211,7 @@ class RasterisedDocumentParser(DocumentParser):
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try:

View File

@ -5,7 +5,7 @@ import subprocess
import dateparser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, pattern
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
class TextDocumentParser(DocumentParser):
@ -94,7 +94,7 @@ class TextDocumentParser(DocumentParser):
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try: