Rename pattern to DATE_REGEX

When the `pattern` variable was moved to the package level, it lost its
context, so a more descriptive name was needed.
This commit is contained in:
Daniel Quinn 2018-09-09 21:02:30 +01:00
parent ef302abed7
commit c99f5923d5
3 changed files with 7 additions and 6 deletions

View File

@ -13,11 +13,12 @@ from django.conf import settings
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile( DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b') r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception): class ParseError(Exception):

View File

@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError from pyocr.tesseract import TesseractError
import pdftotext import pdftotext
from documents.parsers import DocumentParser, ParseError, pattern from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from .languages import ISO639 from .languages import ISO639
@ -211,7 +211,7 @@ class RasterisedDocumentParser(DocumentParser):
return None return None
# Iterate through all regex matches and try to parse the date # Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text): for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0) datestring = m.group(0)
try: try:

View File

@ -5,7 +5,7 @@ import subprocess
import dateparser import dateparser
from django.conf import settings from django.conf import settings
from documents.parsers import DocumentParser, ParseError, pattern from documents.parsers import DocumentParser, ParseError, DATE_REGEX
class TextDocumentParser(DocumentParser): class TextDocumentParser(DocumentParser):
@ -94,7 +94,7 @@ class TextDocumentParser(DocumentParser):
return None return None
# Iterate through all regex matches and try to parse the date # Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text): for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0) datestring = m.group(0)
try: try: