mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'pitkley-fix/secure-temporary-files'
This commit is contained in:
commit
52f242574f
@ -1,15 +1,16 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import glob
|
import tempfile
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import pyocr
|
import pyocr
|
||||||
|
import shutil
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
@ -111,34 +112,35 @@ class Consumer(object):
|
|||||||
|
|
||||||
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
|
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||||
|
|
||||||
pngs = self._get_greyscale(doc)
|
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||||
|
pngs = self._get_greyscale(tempdir, doc)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = self._get_ocr(pngs)
|
text = self._get_ocr(pngs)
|
||||||
|
self._store(text, doc)
|
||||||
except OCRError:
|
except OCRError:
|
||||||
self._ignore.append(doc)
|
self._ignore.append(doc)
|
||||||
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
|
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||||
continue
|
continue
|
||||||
|
finally:
|
||||||
|
self._cleanup(tempdir, doc)
|
||||||
|
|
||||||
self._store(text, doc)
|
def _get_greyscale(self, tempdir, doc):
|
||||||
self._cleanup(pngs, doc)
|
|
||||||
|
|
||||||
def _get_greyscale(self, doc):
|
|
||||||
|
|
||||||
Log.debug(
|
Log.debug(
|
||||||
"Generating greyscale image from {}".format(doc),
|
"Generating greyscale image from {}".format(doc),
|
||||||
Log.COMPONENT_CONSUMER
|
Log.COMPONENT_CONSUMER
|
||||||
)
|
)
|
||||||
|
|
||||||
i = random.randint(1000000, 9999999)
|
png = os.path.join(tempdir, "convert-%04d.jpg")
|
||||||
png = os.path.join(self.SCRATCH, "{}.png".format(i))
|
|
||||||
|
|
||||||
subprocess.Popen((
|
subprocess.Popen((
|
||||||
self.CONVERT, "-density", "300", "-depth", "8",
|
self.CONVERT, "-density", "300", "-depth", "8",
|
||||||
"-type", "grayscale", doc, png
|
"-type", "grayscale", doc, png
|
||||||
)).wait()
|
)).wait()
|
||||||
|
|
||||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
|
||||||
|
return sorted(filter(lambda f: os.path.isfile(f), pngs))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _guess_language(text):
|
def _guess_language(text):
|
||||||
@ -303,14 +305,14 @@ class Consumer(object):
|
|||||||
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
|
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
|
||||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||||
|
|
||||||
def _cleanup(self, pngs, doc):
|
def _cleanup(self, tempdir, doc):
|
||||||
|
# Remove temporary directory recursively
|
||||||
|
Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
|
||||||
|
shutil.rmtree(tempdir)
|
||||||
|
|
||||||
png_glob = os.path.join(
|
# Remove doc
|
||||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||||
|
os.unlink(doc)
|
||||||
for f in list(glob.glob(png_glob)) + [doc]:
|
|
||||||
Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
|
|
||||||
os.unlink(f)
|
|
||||||
|
|
||||||
def _is_ready(self, doc):
|
def _is_ready(self, doc):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user