diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c28b31a6b..884f91ae4 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -13,11 +13,12 @@ from django.conf import settings # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits -pattern = re.compile( +DATE_REGEX = re.compile( r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') + r'\b([^\W\d_]{3,9} [0-9]{4})\b' +) class ParseError(Exception): diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 1ecf36906..1aa4513cb 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError, pattern +from documents.parsers import DocumentParser, ParseError, DATE_REGEX from .languages import ISO639 @@ -211,7 +211,7 @@ class RasterisedDocumentParser(DocumentParser): return None # Iterate through all regex matches and try to parse the date - for m in re.finditer(pattern, text): + for m in re.finditer(DATE_REGEX, text): datestring = m.group(0) try: diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 1b97d0ea1..f02ba3ef8 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -5,7 +5,7 @@ import subprocess import dateparser from django.conf import settings -from documents.parsers import DocumentParser, ParseError, pattern +from documents.parsers import DocumentParser, ParseError, DATE_REGEX class TextDocumentParser(DocumentParser): @@ -94,7 +94,7 @@ class TextDocumentParser(DocumentParser): return None # Iterate through all regex matches and try to parse the date - for m in re.finditer(pattern, text): + for m in re.finditer(DATE_REGEX, text): datestring = m.group(0) try: