mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Restructured the consumer into a loop and added a check for scans in progress
This commit is contained in:
		@@ -4,6 +4,7 @@ import random
 | 
			
		||||
import re
 | 
			
		||||
import shutil
 | 
			
		||||
import subprocess
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
import pyocr
 | 
			
		||||
 | 
			
		||||
@@ -35,8 +36,11 @@ class Command(BaseCommand):
 | 
			
		||||
    MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
 | 
			
		||||
    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
 | 
			
		||||
 | 
			
		||||
    PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
 | 
			
		||||
 | 
			
		||||
    def __init__(self, *args, **kwargs):
        """
        Prepare the command's own bookkeeping, then hand all arguments on to
        the base command initialiser unchanged.
        """

        # Output level; progress messages are only printed when this is > 1.
        self.verbosity = 0

        # Last modification time observed for each file path, used to tell
        # whether a file has stopped changing between two checks.
        self.stats = {}

        super().__init__(*args, **kwargs)
 | 
			
		||||
 | 
			
		||||
    def handle(self, *args, **options):
 | 
			
		||||
@@ -45,18 +49,32 @@ class Command(BaseCommand):
 | 
			
		||||
 | 
			
		||||
        self._setup()
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            while True:
 | 
			
		||||
                self.loop()
 | 
			
		||||
                time.sleep(10)
 | 
			
		||||
                print(".")
 | 
			
		||||
        except KeyboardInterrupt:
 | 
			
		||||
            print("Exiting")
 | 
			
		||||
 | 
			
		||||
    def loop(self):
 | 
			
		||||
 | 
			
		||||
        for pdf in os.listdir(self.CONSUME):
 | 
			
		||||
 | 
			
		||||
            if not os.path.isfile(os.path.join(self.CONSUME, pdf)):
 | 
			
		||||
            pdf = os.path.join(self.CONSUME, pdf)
 | 
			
		||||
 | 
			
		||||
            if not os.path.isfile(pdf):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if not pdf.endswith(".pdf"):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if self._is_ready(pdf):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if self.verbosity > 1:
 | 
			
		||||
                print("Consuming {}".format(pdf))
 | 
			
		||||
 | 
			
		||||
            pdf = os.path.join(self.CONSUME, pdf)
 | 
			
		||||
            pngs = self._get_greyscale(pdf)
 | 
			
		||||
            jpgs = self._get_colour(pdf)
 | 
			
		||||
            text = self._get_ocr(pngs)
 | 
			
		||||
@@ -71,6 +89,22 @@ class Command(BaseCommand):
 | 
			
		||||
            except FileExistsError:
 | 
			
		||||
                pass
 | 
			
		||||
 | 
			
		||||
    def _is_ready(self, pdf):
 | 
			
		||||
        """
 | 
			
		||||
        Detect whether `pdf` is ready to consume or if it's still being written
 | 
			
		||||
        to by the scanner.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        t = os.stat(pdf).st_mtime
 | 
			
		||||
 | 
			
		||||
        if self.stats.get(pdf) == t:
 | 
			
		||||
            del(self.stats[pdf])
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
        self.stats[pdf] = t
 | 
			
		||||
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    def _get_greyscale(self, pdf):
 | 
			
		||||
 | 
			
		||||
        i = random.randint(1000000, 4999999)
 | 
			
		||||
@@ -104,13 +138,28 @@ class Command(BaseCommand):
 | 
			
		||||
 | 
			
		||||
    def _store(self, text, jpgs, pdf):
        """
        Create the Document record for a consumed PDF and move its files into
        the media directories.

        `text` is stored as the document content, the sender and title are
        taken from the file name of `pdf`, the first entry of `jpgs` is moved
        to MEDIA_IMG and `pdf` itself is moved to MEDIA_PDF.
        """

        sender, title = self._parse_file_name(pdf)

        # Create exactly one record per consumed file.  A second, content-only
        # create() here would leave an orphan Document with no files attached.
        doc = Document.objects.create(sender=sender, title=title, content=text)

        # Both files are renamed after the record's zero-padded primary key.
        shutil.move(jpgs[0], os.path.join(
            self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
        shutil.move(pdf, os.path.join(
            self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
 | 
			
		||||
 | 
			
		||||
    def _parse_file_name(self, pdf):
 | 
			
		||||
        """
 | 
			
		||||
        We use a crude naming convention to make handling the sender and title
 | 
			
		||||
        easier:
 | 
			
		||||
          "sender - title.pdf"
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        m = re.match(self.PARSER_REGEX, pdf)
 | 
			
		||||
        if m:
 | 
			
		||||
            return m.group(1), m.group(2)
 | 
			
		||||
 | 
			
		||||
        return "", ""
 | 
			
		||||
 | 
			
		||||
    def _cleanup(self, pngs, jpgs):
 | 
			
		||||
 | 
			
		||||
        jpg_glob = os.path.join(
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user