mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
81 lines
1.8 KiB
Python
81 lines
1.8 KiB
Python
import os
|
|
import re
|
|
|
|
from django.conf import settings
|
|
|
|
from .base import Consumer, OCRError
|
|
|
|
|
|
class FileConsumerError(Exception):
    """Raised when the file consumer cannot be set up from its settings."""
|
|
|
|
|
|
class FileConsumer(Consumer):
    """
    Consumer that picks documents up from a directory on the filesystem.

    Each call to `consume()` makes a single pass over the `CONSUME`
    directory.  A file is only processed once its modification time has been
    seen unchanged on two consecutive passes (see `_is_ready()`), so
    `consume()` is meant to be called repeatedly.
    """

    # Directory scanned for incoming documents, read from the Django settings.
    CONSUME = settings.CONSUMPTION_DIR

    def __init__(self, *args, **kwargs):
        """
        Initialise the consumer and validate the consumption directory.

        All positional and keyword arguments are forwarded unchanged to
        `Consumer.__init__`.

        Raises:
            FileConsumerError: if `CONSUMPTION_DIR` is unset/empty, or if the
                path it names does not exist.
        """

        Consumer.__init__(self, *args, **kwargs)

        # Maps a document path to the mtime recorded for it on the previous
        # pass; consulted and updated by _is_ready().
        self.stats = {}

        # Paths whose OCR failed; they are skipped on every later pass.
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory and process ready files.

        Entries are skipped when they are not regular files, do not match
        `self.REGEX_TITLE`, previously failed OCR, or are not yet ready
        according to `_is_ready()`.  Every remaining file is converted,
        OCR'd, stored and cleaned up.  A file whose OCR raises `OCRError` is
        added to the ignore list and reported instead of being stored.
        """

        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            # NOTE(review): the pattern is matched against the joined path,
            # not the bare file name -- REGEX_TITLE is defined outside this
            # class, so confirm it expects a full path.
            if not re.match(self.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that are NOT ready yet: _is_ready() returns True
            # only once the mtime has stopped changing between two passes.
            # (This test used to be inverted, which consumed files the first
            # time they were seen and skipped the ones that had settled.)
            if not self._is_ready(doc):
                continue

            # The numeric second argument is passed straight to _render();
            # presumably a verbosity level -- TODO confirm against Consumer.
            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                # Remember the failure so the same file is not retried on
                # every subsequent pass.
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        The file's current mtime is compared with the one recorded on the
        previous call for the same path.  If they are equal the record is
        dropped and True is returned; otherwise the current mtime is recorded
        and False is returned.  A path seen for the first time is therefore
        never ready.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False
|