mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge branch 'master' into feature/api
This commit is contained in:
@@ -1,21 +1,23 @@
|
||||
import datetime
|
||||
import glob
|
||||
import tempfile
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import itertools
|
||||
|
||||
import langdetect
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import pyocr
|
||||
import shutil
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.template.defaultfilters import slugify
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from logger.models import Log
|
||||
from paperless.db import GnuPG
|
||||
@@ -27,6 +29,12 @@ from .languages import ISO639
|
||||
def image_to_string(args):
    """
    Run OCR on a single page image and return the recognised text.

    ``args`` is one 3-item sequence ``(consumer, png, lang)`` rather than
    three separate parameters (presumably so the function can be mapped over
    a worker pool -- confirm against the caller):

    * ``consumer`` -- object exposing ``SCRATCH`` (base directory) and
      ``OCR`` (the OCR tool),
    * ``png`` -- image path, joined onto ``consumer.SCRATCH``,
    * ``lang`` -- language code handed to the OCR tool.

    When the OCR tool reports that it can detect orientation, the image is
    rotated by the detected angle first.  A ``TesseractError`` raised while
    doing so is ignored and the unrotated image is used instead.
    """
    consumer, png, lang = args
    image_path = os.path.join(consumer.SCRATCH, png)

    with Image.open(image_path) as image:
        if consumer.OCR.can_detect_orientation():
            try:
                detected = consumer.OCR.detect_orientation(image, lang=lang)
                image = image.rotate(detected["angle"], expand=1)
            except TesseractError:
                # Best effort only: fall back to the image as opened.
                pass
        return consumer.OCR.image_to_string(image, lang=lang)
|
||||
|
||||
|
||||
@@ -111,34 +119,41 @@ class Consumer(object):
|
||||
|
||||
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
|
||||
pngs = self._get_greyscale(doc)
|
||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
pngs = self._get_greyscale(tempdir, doc)
|
||||
|
||||
try:
|
||||
text = self._get_ocr(pngs)
|
||||
self._store(text, doc)
|
||||
except OCRError:
|
||||
self._ignore.append(doc)
|
||||
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
self._cleanup_tempdir(tempdir)
|
||||
continue
|
||||
else:
|
||||
self._cleanup_tempdir(tempdir)
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self._store(text, doc)
|
||||
self._cleanup(pngs, doc)
|
||||
|
||||
def _get_greyscale(self, tempdir, doc):
    """
    Render ``doc`` as greyscale page images inside ``tempdir``.

    ``self.CONVERT`` is invoked with a ``convert-%04d.jpg`` output pattern
    located in ``tempdir``.  The exit status of the converter is not
    inspected; if it produced nothing, the returned list is simply empty.

    :param tempdir: directory that receives the generated images
    :param doc: path of the document to convert
    :return: sorted list of paths of the regular files in ``tempdir`` whose
             names start with ``convert``
    """
    Log.debug(
        "Generating greyscale image from {}".format(doc),
        Log.COMPONENT_CONSUMER
    )

    # The ``%04d`` placeholder is passed to the converter verbatim.
    output_pattern = os.path.join(tempdir, "convert-%04d.jpg")

    subprocess.Popen((
        self.CONVERT, "-density", "300", "-depth", "8",
        "-type", "grayscale", doc, output_pattern
    )).wait()

    candidates = (
        os.path.join(tempdir, name)
        for name in os.listdir(tempdir)
        if name.startswith("convert")
    )
    return sorted(path for path in candidates if os.path.isfile(path))
|
||||
|
||||
@staticmethod
|
||||
def _guess_language(text):
|
||||
@@ -271,11 +286,7 @@ class Consumer(object):
|
||||
def _store(self, text, doc):
|
||||
|
||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
||||
tags = list(tags)
|
||||
|
||||
lower_text = text.lower()
|
||||
relevant_tags = set(
|
||||
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
|
||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
||||
|
||||
stats = os.stat(doc)
|
||||
|
||||
@@ -303,14 +314,15 @@ class Consumer(object):
|
||||
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
@staticmethod
def _cleanup_tempdir(d):
    """Log the removal of directory ``d`` and delete it recursively."""
    Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
    shutil.rmtree(d)

@staticmethod
def _cleanup_doc(doc):
    """Log the removal of the file ``doc`` and unlink it."""
    Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
    os.unlink(doc)
|
||||
|
||||
def _is_ready(self, doc):
|
||||
"""
|
||||
|
@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.all():
|
||||
|
||||
tags = Tag.objects.exclude(
|
||||
pk__in=document.tags.values_list("pk", flat=True))
|
||||
for tag in tags:
|
||||
if tag.matches(document.content):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
||||
for tag in Tag.match_all(document.content, tags):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
23
src/documents/management/commands/loaddata_stdin.py
Normal file
23
src/documents/management/commands/loaddata_stdin.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
Source:
|
||||
https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
|
||||
|
||||
License:
|
||||
MIT
|
||||
Copyright (c) 2016 Baptiste Mispelon
|
||||
"""
|
||||
import sys
|
||||
|
||||
from django.core.management.commands.loaddata import Command as LoadDataCommand
|
||||
|
||||
|
||||
class Command(LoadDataCommand):
    """
    ``loaddata`` variant that accepts ``-`` as a fixture label and then reads
    a JSON fixture from standard input.  Any other label is handled by the
    stock ``loaddata`` implementation.
    """

    def parse_name(self, fixture_name):
        """
        Split a fixture name into ``(name, serialization format,
        compression format)``.

        ``-`` maps to the JSON serializer and the ``stdin`` pseudo
        compression format registered below.  Every other name is delegated
        to the parent class; previously this method returned ``None`` for
        them, which the caller cannot unpack.
        """
        # Pseudo "compression format" whose opener ignores both of its
        # arguments and hands back the process' standard input.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """
        Return the fixture entries for ``fixture_label``.

        ``-`` short-circuits the lookup with a single placeholder entry;
        every other label goes through the parent's search.
        """
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
|
@@ -86,28 +86,40 @@ class Tag(SluggedModel):
|
||||
return "{}: \"{}\" ({})".format(
|
||||
self.name, self.match, self.get_matching_algorithm_display())
|
||||
|
||||
@classmethod
def match_all(cls, text, tags=None):
    """
    Lazily yield every tag whose ``matches()`` accepts ``text``.

    :param text: text to test; it is lowercased once before being handed to
                 each tag's ``matches()``
    :param tags: iterable of tags to consider; ``None`` (the default) means
                 ``cls.objects.all()``
    """
    candidates = cls.objects.all() if tags is None else tags
    lowered = text.lower()

    for candidate in candidates:
        if not candidate.matches(lowered):
            continue
        yield candidate
|
||||
|
||||
def matches(self, text):
    """
    Return ``True`` when ``text`` satisfies this tag's matching rule.

    The rule is chosen by ``self.matching_algorithm``:

    * ``MATCH_ALL`` -- every whitespace-separated term of ``self.match``
      occurs in ``text`` as a whole word,
    * ``MATCH_ANY`` -- at least one such term occurs as a whole word,
    * ``MATCH_LITERAL`` -- ``self.match`` occurs verbatim, delimited by
      word boundaries,
    * ``MATCH_REGEX`` -- ``self.match``, used as a regular expression,
      is found anywhere in ``text``.

    A blank ``self.match`` never matches.

    :raises NotImplementedError: for any other matching algorithm
    """
    # Check that match is not empty
    if self.match.strip() == "":
        return False

    def contains(term):
        # Terms are plain text, not patterns: escape them so characters
        # such as "." or "[" are taken literally instead of being
        # interpreted (or rejected) by the regex engine.
        return bool(re.search(r"\b{}\b".format(re.escape(term)), text))

    if self.matching_algorithm == self.MATCH_ALL:
        # split() without an argument drops the empty terms that repeated
        # spaces would otherwise produce.
        return all(contains(term) for term in self.match.split())

    if self.matching_algorithm == self.MATCH_ANY:
        return any(contains(term) for term in self.match.split())

    if self.matching_algorithm == self.MATCH_LITERAL:
        return contains(self.match)

    if self.matching_algorithm == self.MATCH_REGEX:
        return bool(re.search(self.match, text))

    raise NotImplementedError("Unsupported matching algorithm")
|
||||
|
||||
|
120
src/documents/tests/test_tags.py
Normal file
120
src/documents/tests/test_tags.py
Normal file
@@ -0,0 +1,120 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..models import Tag
|
||||
|
||||
|
||||
class TestTagMatching(TestCase):
    """Exercise ``Tag.matches`` for each of the four matching algorithms."""

    def test_match_all(self):
        """MATCH_ALL needs every term to appear as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_any(self):
        """MATCH_ANY needs at least one term to appear as a whole word."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ANY
        )

        self.assertTrue(t.matches("I have alpha in me"))
        self.assertTrue(t.matches("I have charlie in me"))
        self.assertTrue(t.matches("I have gamma in me"))
        self.assertTrue(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have 12 in me"))
        self.assertTrue(t.matches("I have 34 in me"))
        self.assertTrue(t.matches("I have 56 in me"))
        self.assertTrue(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_literal(self):
        """MATCH_LITERAL needs the whole match string to appear verbatim."""

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_LITERAL
        )

        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))
        self.assertTrue(t.matches("I have 12 34 56 in me"))

    def test_match_regex(self):
        """MATCH_REGEX treats the match string as a regular expression."""

        t = Tag.objects.create(
            name="Test 0",
            # Raw string: "\w" is not a valid escape in a normal literal.
            match=r"alpha\w+gamma",
            matching_algorithm=Tag.MATCH_REGEX
        )

        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
        self.assertFalse(t.matches("I have alpha and gamma in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
|
||||
|
Reference in New Issue
Block a user