mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Restructured the consumer into a loop and added a check for scans in-progress
This commit is contained in:
parent
855ee64097
commit
802e9d7045
@ -4,6 +4,7 @@ import random
|
|||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import time
|
||||||
|
|
||||||
import pyocr
|
import pyocr
|
||||||
|
|
||||||
@ -35,8 +36,11 @@ class Command(BaseCommand):
|
|||||||
MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
|
MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
|
||||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
||||||
|
|
||||||
|
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
    """
    Set up the per-instance consumer state, then defer to ``BaseCommand``
    for the regular command initialisation.
    """

    # Output level; other methods only print progress when this is > 1.
    self.verbosity = 0

    # Maps a file path to the modification time recorded for it by
    # ``_is_ready``.
    self.stats = {}

    BaseCommand.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
@ -45,18 +49,32 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
self._setup()
|
self._setup()
|
||||||
|
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
self.loop()
|
||||||
|
time.sleep(10)
|
||||||
|
print(".")
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("Exiting")
|
||||||
|
|
||||||
|
def loop(self):
|
||||||
|
|
||||||
for pdf in os.listdir(self.CONSUME):
|
for pdf in os.listdir(self.CONSUME):
|
||||||
|
|
||||||
if not os.path.isfile(os.path.join(self.CONSUME, pdf)):
|
pdf = os.path.join(self.CONSUME, pdf)
|
||||||
|
|
||||||
|
if not os.path.isfile(pdf):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not pdf.endswith(".pdf"):
|
if not pdf.endswith(".pdf"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if self._is_ready(pdf):
|
||||||
|
continue
|
||||||
|
|
||||||
if self.verbosity > 1:
|
if self.verbosity > 1:
|
||||||
print("Consuming {}".format(pdf))
|
print("Consuming {}".format(pdf))
|
||||||
|
|
||||||
pdf = os.path.join(self.CONSUME, pdf)
|
|
||||||
pngs = self._get_greyscale(pdf)
|
pngs = self._get_greyscale(pdf)
|
||||||
jpgs = self._get_colour(pdf)
|
jpgs = self._get_colour(pdf)
|
||||||
text = self._get_ocr(pngs)
|
text = self._get_ocr(pngs)
|
||||||
@ -71,6 +89,22 @@ class Command(BaseCommand):
|
|||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def _is_ready(self, pdf):
    """
    Detect whether `pdf` is ready to consume or if it's still being written
    to by the scanner.

    The file's modification time is compared with the one recorded in
    ``self.stats`` by a previous call.  If both are equal the file is
    considered ready: its entry is removed and ``True`` is returned.
    Otherwise the current modification time is recorded and ``False`` is
    returned, so a file is never reported ready on the first call that
    sees it.

    A file that no longer exists is reported as not ready and any entry
    recorded for it is dropped.
    """

    try:
        t = os.stat(pdf).st_mtime
    except FileNotFoundError:
        # The file was removed or renamed after it was listed.  Forget it
        # rather than raising, and don't leave a stale entry behind.
        self.stats.pop(pdf, None)
        return False

    if self.stats.get(pdf) == t:
        # Unchanged since the previous call: assume writing has finished.
        del self.stats[pdf]
        return True

    # First sighting, or the file changed since the previous call.
    self.stats[pdf] = t

    return False
|
||||||
|
|
||||||
def _get_greyscale(self, pdf):
|
def _get_greyscale(self, pdf):
|
||||||
|
|
||||||
i = random.randint(1000000, 4999999)
|
i = random.randint(1000000, 4999999)
|
||||||
@ -104,13 +138,28 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def _store(self, text, jpgs, pdf):
    """
    Create the ``Document`` record for `pdf` and move its files into the
    media directories.

    The sender and title come from ``_parse_file_name``; `text` is stored
    as the document content.  The first entry of `jpgs` and the `pdf`
    itself are moved to ``MEDIA_IMG`` and ``MEDIA_PDF`` respectively, both
    named after the new document's primary key, zero-padded to 7 digits.
    """

    sender, title = self._parse_file_name(pdf)
    doc = Document.objects.create(sender=sender, title=title, content=text)

    image_target = os.path.join(self.MEDIA_IMG, "{:07}.jpg".format(doc.pk))
    shutil.move(jpgs[0], image_target)

    pdf_target = os.path.join(self.MEDIA_PDF, "{:07}.pdf".format(doc.pk))
    shutil.move(pdf, pdf_target)
|
||||||
|
|
||||||
|
def _parse_file_name(self, pdf):
    """
    We use a crude naming convention to make handling the sender and title
    easier:
      "sender - title.pdf"

    Returns a ``(sender, title)`` tuple, or ``("", "")`` when the file name
    does not follow the convention.  Only the file name itself is
    inspected; the directories leading up to it are ignored.
    """

    # PARSER_REGEX expects a path with at least one "/" before the file
    # name.  Matching against the raw path lets a " - " in a directory name
    # be picked up as the sender/title split, so reduce the path to its
    # base name first and re-add a single separator for the regex.
    name = os.path.basename(pdf)

    m = re.match(self.PARSER_REGEX, "/" + name)
    if m is None:
        return "", ""

    return m.group(1), m.group(2)
|
||||||
|
|
||||||
def _cleanup(self, pngs, jpgs):
|
def _cleanup(self, pngs, jpgs):
|
||||||
|
|
||||||
jpg_glob = os.path.join(
|
jpg_glob = os.path.join(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user