mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Consolidate get_date onto the DocumentParser parent class
This commit is contained in:
parent
14bb52b6a4
commit
2a3f766b93
@ -9,6 +9,8 @@ Changelog
|
|||||||
for reporting this. `#414`_.
|
for reporting this. `#414`_.
|
||||||
* A bug in the Dockerfile meant that Tesseract language files weren't being
|
* A bug in the Dockerfile meant that Tesseract language files weren't being
|
||||||
installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
|
installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
|
||||||
|
* The ``get_date()`` functionality of the parsers has been consolidated onto
|
||||||
|
the ``DocumentParser`` class since much of that code was redundant anyway.
|
||||||
|
|
||||||
2.4.0
|
2.4.0
|
||||||
=====
|
=====
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
import re
|
|
||||||
|
|
||||||
|
import dateparser
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
# This regular expression will try to find dates in the document at
|
# This regular expression will try to find dates in the document at
|
||||||
# hand and will match the following formats:
|
# hand and will match the following formats:
|
||||||
@ -32,6 +35,7 @@ class DocumentParser:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
SCRATCH = settings.SCRATCH_DIR
|
SCRATCH = settings.SCRATCH_DIR
|
||||||
|
DATE_ORDER = settings.DATE_ORDER
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
self.document_path = path
|
self.document_path = path
|
||||||
@ -55,7 +59,52 @@ class DocumentParser:
|
|||||||
"""
|
"""
|
||||||
Returns the date of the document.
|
Returns the date of the document.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
|
||||||
|
date = None
|
||||||
|
date_string = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
text = self.get_text()
|
||||||
|
except ParseError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||||
|
|
||||||
|
# Iterate through all regex matches and try to parse the date
|
||||||
|
for m in re.finditer(DATE_REGEX, text):
|
||||||
|
|
||||||
|
date_string = m.group(0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
date = dateparser.parse(
|
||||||
|
date_string,
|
||||||
|
settings={
|
||||||
|
"DATE_ORDER": self.DATE_ORDER,
|
||||||
|
"PREFER_DAY_OF_MONTH": "first",
|
||||||
|
"RETURN_AS_TIMEZONE_AWARE": True
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except TypeError:
|
||||||
|
# Skip all matches that do not parse to a proper date
|
||||||
|
continue
|
||||||
|
|
||||||
|
if date is not None and next_year > date.year > 1900:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
date = None
|
||||||
|
|
||||||
|
if date is not None:
|
||||||
|
self.log(
|
||||||
|
"info",
|
||||||
|
"Detected document date {} based on string {}".format(
|
||||||
|
date.isoformat(),
|
||||||
|
date_string
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.log("info", "Unable to detect date for document")
|
||||||
|
|
||||||
|
return date
|
||||||
|
|
||||||
def log(self, level, message):
|
def log(self, level, message):
|
||||||
getattr(self.logger, level)(message, extra={
|
getattr(self.logger, level)(message, extra={
|
||||||
|
@ -4,7 +4,6 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
|
|
||||||
import dateparser
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import pyocr
|
import pyocr
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
|
|||||||
from pyocr.tesseract import TesseractError
|
from pyocr.tesseract import TesseractError
|
||||||
|
|
||||||
import pdftotext
|
import pdftotext
|
||||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
from documents.parsers import DocumentParser, ParseError
|
||||||
|
|
||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
DATE_ORDER = settings.DATE_ORDER
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||||
|
|
||||||
@ -202,51 +200,6 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def get_date(self):
|
|
||||||
|
|
||||||
date = None
|
|
||||||
datestring = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
text = self.get_text()
|
|
||||||
except ParseError as e:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Iterate through all regex matches and try to parse the date
|
|
||||||
for m in re.finditer(DATE_REGEX, text):
|
|
||||||
datestring = m.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = dateparser.parse(
|
|
||||||
datestring,
|
|
||||||
settings={
|
|
||||||
"DATE_ORDER": self.DATE_ORDER,
|
|
||||||
"PREFER_DAY_OF_MONTH": "first",
|
|
||||||
"RETURN_AS_TIMEZONE_AWARE": True
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except TypeError:
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
continue
|
|
||||||
|
|
||||||
if date is not None and date.year > 1900:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
date = None
|
|
||||||
|
|
||||||
if date is not None:
|
|
||||||
self.log(
|
|
||||||
"info",
|
|
||||||
"Detected document date {} based on string {}".format(
|
|
||||||
date.isoformat(),
|
|
||||||
datestring
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.log("info", "Unable to detect date for document")
|
|
||||||
|
|
||||||
return date
|
|
||||||
|
|
||||||
|
|
||||||
def run_convert(*args):
|
def run_convert(*args):
|
||||||
|
|
||||||
|
@ -393,7 +393,33 @@ class TestDate(TestCase):
|
|||||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||||
SCRATCH
|
SCRATCH
|
||||||
)
|
)
|
||||||
def test_crazy_date(self, *args):
|
def test_crazy_date_past(self, *args):
|
||||||
|
document = RasterisedDocumentParser("/dev/null")
|
||||||
|
document.get_text()
|
||||||
|
self.assertIsNone(document.get_date())
|
||||||
|
|
||||||
|
@mock.patch(
|
||||||
|
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||||
|
return_value="01-07-2350 00:00:00"
|
||||||
|
)
|
||||||
|
@mock.patch(
|
||||||
|
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||||
|
SCRATCH
|
||||||
|
)
|
||||||
|
def test_crazy_date_future(self, *args):
|
||||||
|
document = RasterisedDocumentParser("/dev/null")
|
||||||
|
document.get_text()
|
||||||
|
self.assertIsNone(document.get_date())
|
||||||
|
|
||||||
|
@mock.patch(
|
||||||
|
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
|
||||||
|
return_value="01-07-0590 00:00:00"
|
||||||
|
)
|
||||||
|
@mock.patch(
|
||||||
|
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||||
|
SCRATCH
|
||||||
|
)
|
||||||
|
def test_crazy_date_past(self, *args):
|
||||||
document = RasterisedDocumentParser("/dev/null")
|
document = RasterisedDocumentParser("/dev/null")
|
||||||
document.get_text()
|
document.get_text()
|
||||||
self.assertIsNone(document.get_date())
|
self.assertIsNone(document.get_date())
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import dateparser
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
|
from documents.parsers import DocumentParser, ParseError
|
||||||
|
|
||||||
|
|
||||||
class TextDocumentParser(DocumentParser):
|
class TextDocumentParser(DocumentParser):
|
||||||
@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
|
|||||||
CONVERT = settings.CONVERT_BINARY
|
CONVERT = settings.CONVERT_BINARY
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
UNPAPER = settings.UNPAPER_BINARY
|
UNPAPER = settings.UNPAPER_BINARY
|
||||||
DATE_ORDER = settings.DATE_ORDER
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||||
|
|
||||||
@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
"""
|
"""
|
||||||
The thumbnail of a txt is just a 500px wide image of the text
|
The thumbnail of a text file is just a 500px wide image of the text
|
||||||
rendered onto a letter-sized page.
|
rendered onto a letter-sized page.
|
||||||
"""
|
"""
|
||||||
# The below is heavily cribbed from https://askubuntu.com/a/590951
|
# The below is heavily cribbed from https://askubuntu.com/a/590951
|
||||||
@ -84,40 +81,6 @@ class TextDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
def get_date(self):
|
|
||||||
date = None
|
|
||||||
datestring = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
text = self.get_text()
|
|
||||||
except ParseError as e:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Iterate through all regex matches and try to parse the date
|
|
||||||
for m in re.finditer(DATE_REGEX, text):
|
|
||||||
datestring = m.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = dateparser.parse(
|
|
||||||
datestring,
|
|
||||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
|
||||||
'PREFER_DAY_OF_MONTH': 'first',
|
|
||||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
|
||||||
except TypeError:
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
continue
|
|
||||||
|
|
||||||
if date is not None:
|
|
||||||
break
|
|
||||||
|
|
||||||
if date is not None:
|
|
||||||
self.log("info", "Detected document date " + date.isoformat() +
|
|
||||||
" based on string " + datestring)
|
|
||||||
else:
|
|
||||||
self.log("info", "Unable to detect date for document")
|
|
||||||
|
|
||||||
return date
|
|
||||||
|
|
||||||
|
|
||||||
def run_command(*args):
|
def run_command(*args):
|
||||||
environment = os.environ.copy()
|
environment = os.environ.copy()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user