mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Add support for a heuristic that extracts the document date from its text
This commit is contained in:
		@@ -3,6 +3,7 @@ import os
 | 
			
		||||
import re
 | 
			
		||||
import subprocess
 | 
			
		||||
from multiprocessing.pool import Pool
 | 
			
		||||
import dateparser
 | 
			
		||||
 | 
			
		||||
import langdetect
 | 
			
		||||
import pyocr
 | 
			
		||||
@@ -30,6 +31,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
 | 
			
		||||
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 | 
			
		||||
    UNPAPER = settings.UNPAPER_BINARY
 | 
			
		||||
    DATE_ORDER = settings.DATE_ORDER
 | 
			
		||||
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 | 
			
		||||
 | 
			
		||||
    def get_thumbnail(self):
 | 
			
		||||
@@ -175,6 +177,29 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
 | 
			
		||||
        return text
 | 
			
		||||
 | 
			
		||||
    def get_date(self):
        """
        Guess the document's date from its extracted text.

        The text returned by ``self.get_text()`` is scanned for date-like
        substrings, and each candidate is handed to ``dateparser`` in the
        order it appears.  The first candidate that parses wins.

        Returns the parsed datetime, or None when the text is empty or no
        candidate can be parsed as a date.
        """
        text = self.get_text()

        # Nothing to search in.  NOTE(review): whether get_text() can
        # actually return None is not visible from here; the guard merely
        # makes that case return None instead of failing inside ``re``.
        if not text:
            return None

        # This regular expression will try to find dates in the document at
        # hand and will match the following formats:
        # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
        # - MONTH ZZZZ
        #
        # MONTH is matched as any run of 3 to 9 non-space characters, so
        # strings such as "Invoice 1234" are also picked up as candidates.
        pattern = (
            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|'
            r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|'
            r'\b([^ ]{3,9} [0-9]{4})\b'
        )

        parser_settings = {'DATE_ORDER': self.DATE_ORDER,
                           'PREFER_DAY_OF_MONTH': 'first',
                           'RETURN_AS_TIMEZONE_AWARE': True}

        # Because the pattern is deliberately loose, the first match is not
        # necessarily a date.  Keep going through the remaining matches
        # rather than giving up after the first one fails to parse.
        for m in re.finditer(pattern, text):
            date = dateparser.parse(m.group(0), settings=parser_settings)
            if date is not None:
                return date

        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user