mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Updates the ignore-date parsing to use the date order defined in the settings, instead of guessing it
This commit is contained in:
		
				
					committed by
					
						
						Trenton Holmes
					
				
			
			
				
	
			
			
			
						parent
						
							a944ef1ca6
						
					
				
				
					commit
					304d5b0d5a
				
			@@ -247,10 +247,8 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
 | 
			
		||||
 | 
			
		||||
    # if filename date parsing is enabled, search there first:
 | 
			
		||||
    if settings.FILENAME_DATE_ORDER:
 | 
			
		||||
        logger.info("Attempting parsing from filename")
 | 
			
		||||
        for m in re.finditer(DATE_REGEX, filename):
 | 
			
		||||
            date_string = m.group(0)
 | 
			
		||||
            logger.info(f"Found potential date: {date_string}")
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
 | 
			
		||||
@@ -260,16 +258,11 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
 | 
			
		||||
 | 
			
		||||
            date = __filter(date)
 | 
			
		||||
            if date is not None:
 | 
			
		||||
                logger.info(f"Found date: {date}")
 | 
			
		||||
                return date
 | 
			
		||||
            else:
 | 
			
		||||
                logger.info("Filtered date out")
 | 
			
		||||
 | 
			
		||||
    logger.info("Attempting parsing from content")
 | 
			
		||||
    # Iterate through all regex matches in text and try to parse the date
 | 
			
		||||
    for m in re.finditer(DATE_REGEX, text):
 | 
			
		||||
        date_string = m.group(0)
 | 
			
		||||
        logger.info(f"Found potential date: {date_string}")
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            date = __parser(date_string, settings.DATE_ORDER)
 | 
			
		||||
@@ -279,10 +272,7 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
 | 
			
		||||
 | 
			
		||||
        date = __filter(date)
 | 
			
		||||
        if date is not None:
 | 
			
		||||
            logger.info(f"Found date: {date}")
 | 
			
		||||
            return date
 | 
			
		||||
        else:
 | 
			
		||||
            logger.info("Filtered date out")
 | 
			
		||||
 | 
			
		||||
    return date
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -605,21 +605,40 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
 | 
			
		||||
if PAPERLESS_TIKA_ENABLED:
 | 
			
		||||
    INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
 | 
			
		||||
 | 
			
		||||
# List dates that should be ignored when trying to parse date from document text
 | 
			
		||||
IGNORE_DATES: Set[datetime.date] = set()
 | 
			
		||||
 | 
			
		||||
def _parse_ignore_dates(
 | 
			
		||||
    env_ignore: str,
 | 
			
		||||
    date_order: str = DATE_ORDER,
 | 
			
		||||
) -> Set[datetime.datetime]:
 | 
			
		||||
    """
 | 
			
		||||
    If the PAPERLESS_IGNORE_DATES environment variable is set, parse the
 | 
			
		||||
    user provided string(s) into dates
 | 
			
		||||
 | 
			
		||||
def _parse_ignore_dates(env_ignore: str) -> Set[datetime.datetime]:
 | 
			
		||||
    Args:
 | 
			
		||||
        env_ignore (str): The value of the environment variable, comma-separated dates
 | 
			
		||||
        date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER.
 | 
			
		||||
 | 
			
		||||
    Returns:
 | 
			
		||||
        Set[datetime.datetime]: The set of parsed date objects
 | 
			
		||||
    """
 | 
			
		||||
    import dateparser
 | 
			
		||||
 | 
			
		||||
    ignored_dates = set()
 | 
			
		||||
    for s in env_ignore.split(","):
 | 
			
		||||
        d = dateparser.parse(s)
 | 
			
		||||
        d = dateparser.parse(
 | 
			
		||||
            s,
 | 
			
		||||
            settings={
 | 
			
		||||
                "DATE_ORDER": date_order,
 | 
			
		||||
            },
 | 
			
		||||
        )
 | 
			
		||||
        if d:
 | 
			
		||||
            ignored_dates.add(d.date())
 | 
			
		||||
    return ignored_dates
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# List dates that should be ignored when trying to parse date from document text
 | 
			
		||||
IGNORE_DATES: Set[datetime.date] = set()
 | 
			
		||||
 | 
			
		||||
if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
 | 
			
		||||
    IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES"))
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -9,6 +9,20 @@ class TestIgnoreDateParsing(TestCase):
 | 
			
		||||
    Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def _parse_checker(self, test_cases):
 | 
			
		||||
        """
 | 
			
		||||
        Helper function to check ignore date parsing
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            test_cases: Iterable of (env_str, date_format, expected_date_set) tuples to check
 | 
			
		||||
        """
 | 
			
		||||
        for env_str, date_format, expected_date_set in test_cases:
 | 
			
		||||
 | 
			
		||||
            self.assertSetEqual(
 | 
			
		||||
                _parse_ignore_dates(env_str, date_format),
 | 
			
		||||
                expected_date_set,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def test_no_ignore_dates_set(self):
 | 
			
		||||
        """
 | 
			
		||||
        GIVEN:
 | 
			
		||||
@@ -26,20 +40,19 @@ class TestIgnoreDateParsing(TestCase):
 | 
			
		||||
            - All ignore dates are parsed
 | 
			
		||||
        """
 | 
			
		||||
        test_cases = [
 | 
			
		||||
            ("1985-05-01", [datetime.date(1985, 5, 1)]),
 | 
			
		||||
            ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}),
 | 
			
		||||
            (
 | 
			
		||||
                "1985-05-01,1991-12-05",
 | 
			
		||||
                [datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)],
 | 
			
		||||
                "YMD",
 | 
			
		||||
                {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)},
 | 
			
		||||
            ),
 | 
			
		||||
            ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}),
 | 
			
		||||
            ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}),
 | 
			
		||||
            (
 | 
			
		||||
                "11.01.2001,15-06-1996",
 | 
			
		||||
                "DMY",
 | 
			
		||||
                {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)},
 | 
			
		||||
            ),
 | 
			
		||||
            ("2010-12-13", [datetime.date(2010, 12, 13)]),
 | 
			
		||||
        ]
 | 
			
		||||
        for env_str, expected_dates in test_cases:
 | 
			
		||||
            expected_date_set = set()
 | 
			
		||||
 | 
			
		||||
            for expected_date in expected_dates:
 | 
			
		||||
                expected_date_set.add(expected_date)
 | 
			
		||||
 | 
			
		||||
            self.assertSetEqual(
 | 
			
		||||
                _parse_ignore_dates(env_str),
 | 
			
		||||
                expected_date_set,
 | 
			
		||||
            )
 | 
			
		||||
        self._parse_checker(test_cases)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user