mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Consolidate get_date onto the DocumentParser parent class
This commit is contained in:
		@@ -9,6 +9,8 @@ Changelog
 | 
			
		||||
  for reporting this. `#414`_.
 | 
			
		||||
* A bug in the Dockerfile meant that Tesseract language files weren't being
 | 
			
		||||
  installed correctly.  `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
 | 
			
		||||
* The ``get_date()`` functionality of the parsers has been consolidated onto
 | 
			
		||||
  the ``DocumentParser`` class since much of that code was redundant anyway.
 | 
			
		||||
 | 
			
		||||
2.4.0
 | 
			
		||||
=====
 | 
			
		||||
 
 | 
			
		||||
@@ -1,9 +1,12 @@
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
import dateparser
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.utils import timezone
 | 
			
		||||
 | 
			
		||||
# This regular expression will try to find dates in the document at
 | 
			
		||||
# hand and will match the following formats:
 | 
			
		||||
@@ -32,6 +35,7 @@ class DocumentParser:
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    SCRATCH = settings.SCRATCH_DIR
 | 
			
		||||
    DATE_ORDER = settings.DATE_ORDER
 | 
			
		||||
 | 
			
		||||
    def __init__(self, path):
 | 
			
		||||
        self.document_path = path
 | 
			
		||||
@@ -55,7 +59,52 @@ class DocumentParser:
 | 
			
		||||
        """
 | 
			
		||||
        Returns the date of the document.
 | 
			
		||||
        """
 | 
			
		||||
        raise NotImplementedError()
 | 
			
		||||
 | 
			
		||||
        date = None
 | 
			
		||||
        date_string = None
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            text = self.get_text()
 | 
			
		||||
        except ParseError:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
 | 
			
		||||
 | 
			
		||||
        # Iterate through all regex matches and try to parse the date
 | 
			
		||||
        for m in re.finditer(DATE_REGEX, text):
 | 
			
		||||
 | 
			
		||||
            date_string = m.group(0)
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
                date = dateparser.parse(
 | 
			
		||||
                    date_string,
 | 
			
		||||
                    settings={
 | 
			
		||||
                        "DATE_ORDER": self.DATE_ORDER,
 | 
			
		||||
                        "PREFER_DAY_OF_MONTH": "first",
 | 
			
		||||
                        "RETURN_AS_TIMEZONE_AWARE": True
 | 
			
		||||
                    }
 | 
			
		||||
                )
 | 
			
		||||
            except TypeError:
 | 
			
		||||
                # Skip all matches that do not parse to a proper date
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if date is not None and next_year > date.year > 1900:
 | 
			
		||||
                break
 | 
			
		||||
            else:
 | 
			
		||||
                date = None
 | 
			
		||||
 | 
			
		||||
        if date is not None:
 | 
			
		||||
            self.log(
 | 
			
		||||
                "info",
 | 
			
		||||
                "Detected document date {} based on string {}".format(
 | 
			
		||||
                    date.isoformat(),
 | 
			
		||||
                    date_string
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            self.log("info", "Unable to detect date for document")
 | 
			
		||||
 | 
			
		||||
        return date
 | 
			
		||||
 | 
			
		||||
    def log(self, level, message):
 | 
			
		||||
        getattr(self.logger, level)(message, extra={
 | 
			
		||||
 
 | 
			
		||||
@@ -4,7 +4,6 @@ import re
 | 
			
		||||
import subprocess
 | 
			
		||||
from multiprocessing.pool import Pool
 | 
			
		||||
 | 
			
		||||
import dateparser
 | 
			
		||||
import langdetect
 | 
			
		||||
import pyocr
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
@@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
 | 
			
		||||
from pyocr.tesseract import TesseractError
 | 
			
		||||
 | 
			
		||||
import pdftotext
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError
 | 
			
		||||
 | 
			
		||||
from .languages import ISO639
 | 
			
		||||
 | 
			
		||||
@@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
 | 
			
		||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
			
		||||
    UNPAPER = settings.UNPAPER_BINARY
 | 
			
		||||
    DATE_ORDER = settings.DATE_ORDER
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
    OCR_ALWAYS = settings.OCR_ALWAYS
 | 
			
		||||
 | 
			
		||||
@@ -202,51 +200,6 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
        return text
 | 
			
		||||
 | 
			
		||||
    def get_date(self):
        """
        Returns the date of the document, or ``None`` when no plausible date
        can be found.

        The text returned by ``get_text()`` is scanned with ``DATE_REGEX`` and
        every match is handed to ``dateparser``.  The first match that parses
        to a year later than 1900 and less than five years in the future is
        returned.  ``None`` is also returned if ``get_text()`` raises
        ``ParseError``.
        """
        # Imported here so the method does not depend on a module-level
        # import of django.utils.timezone being present in this file.
        from django.utils import timezone

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Arbitrary 5 year future limit, so that misread years such as 2350
        # are rejected in the same way as implausibly old ones.
        next_year = timezone.now().year + 5

        date = None
        datestring = None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
                candidate = dateparser.parse(
                    datestring,
                    settings={
                        "DATE_ORDER": self.DATE_ORDER,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True
                    }
                )
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if candidate is not None and next_year > candidate.year > 1900:
                date = candidate
                break

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    datestring
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -393,7 +393,33 @@ class TestDate(TestCase):
 | 
			
		||||
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
 | 
			
		||||
        SCRATCH
 | 
			
		||||
    )
 | 
			
		||||
    def test_crazy_date(self, *args):
 | 
			
		||||
    def test_crazy_date_past(self, *args):
 | 
			
		||||
        document = RasterisedDocumentParser("/dev/null")
 | 
			
		||||
        document.get_text()
 | 
			
		||||
        self.assertIsNone(document.get_date())
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-2350 00:00:00",
    )
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH,
    )
    def test_crazy_date_future(self, *args):
        # get_text is patched above to return a date string whose year is
        # 2350; no document date should be derived from it.
        parser = RasterisedDocumentParser("/dev/null")
        parser.get_text()
        detected = parser.get_date()
        self.assertIsNone(detected)
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-0590 00:00:00",
    )
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH,
    )
    def test_crazy_date_past(self, *args):
        # get_text is patched above to return a date string whose year is
        # 0590; no document date should be derived from it.
        # NOTE(review): a method named test_crazy_date_past also appears
        # earlier in this hunk.  If both live in the same class, this later
        # definition replaces the earlier one -- confirm whether one of them
        # should be renamed or removed.
        parser = RasterisedDocumentParser("/dev/null")
        parser.get_text()
        detected = parser.get_date()
        self.assertIsNone(detected)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,9 @@
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import subprocess
 | 
			
		||||
 | 
			
		||||
import dateparser
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TextDocumentParser(DocumentParser):
 | 
			
		||||
@@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
 | 
			
		||||
    CONVERT = settings.CONVERT_BINARY
 | 
			
		||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
			
		||||
    UNPAPER = settings.UNPAPER_BINARY
 | 
			
		||||
    DATE_ORDER = settings.DATE_ORDER
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
    OCR_ALWAYS = settings.OCR_ALWAYS
 | 
			
		||||
 | 
			
		||||
@@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):
 | 
			
		||||
 | 
			
		||||
    def get_thumbnail(self):
 | 
			
		||||
        """
 | 
			
		||||
        The thumbnail of a txt is just a 500px wide image of the text
 | 
			
		||||
        The thumbnail of a text file is just a 500px wide image of the text
 | 
			
		||||
        rendered onto a letter-sized page.
 | 
			
		||||
        """
 | 
			
		||||
        # The below is heavily cribbed from https://askubuntu.com/a/590951
 | 
			
		||||
@@ -84,40 +81,6 @@ class TextDocumentParser(DocumentParser):
 | 
			
		||||
 | 
			
		||||
        return self._text
 | 
			
		||||
 | 
			
		||||
    def get_date(self):
        """
        Returns the date of the document, or ``None`` when no plausible date
        can be found.

        Each ``DATE_REGEX`` match in the text from ``get_text()`` is passed to
        ``dateparser``.  The first match whose year is later than 1900 and
        less than five years in the future wins.  A ``ParseError`` raised by
        ``get_text()`` also results in ``None``.
        """
        # Imported here so the method does not depend on a module-level
        # import of django.utils.timezone being present in this file.
        from django.utils import timezone

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Arbitrary 5 year future limit.  Together with the 1900 lower bound
        # this rejects years that are almost certainly misreads.
        next_year = timezone.now().year + 5

        date = None
        datestring = None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
                candidate = dateparser.parse(
                    datestring,
                    settings={
                        "DATE_ORDER": self.DATE_ORDER,
                        "PREFER_DAY_OF_MONTH": "first",
                        "RETURN_AS_TIMEZONE_AWARE": True
                    }
                )
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if candidate is not None and next_year > candidate.year > 1900:
                date = candidate
                break

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    datestring
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_command(*args):
 | 
			
		||||
    environment = os.environ.copy()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user