Merge branch 'dev' into feature-autocolor

This commit is contained in:
jonaswinkler
2021-02-24 23:10:59 +01:00
378 changed files with 37340 additions and 5324 deletions

View File

@@ -6,13 +6,18 @@ import magic
from django.conf import settings
from django.db import migrations, models
from paperless.db import GnuPG
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
def source_path(self):
if self.filename:
fname = str(self.filename)
else:
fname = "{:07}.{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"
return os.path.join(
@@ -26,9 +31,18 @@ def add_mime_types(apps, schema_editor):
documents = Document.objects.all()
for d in documents:
d.mime_type = magic.from_file(source_path(d), mime=True)
f = open(source_path(d), "rb")
if d.storage_type == STORAGE_TYPE_GPG:
data = GnuPG.decrypted(f)
else:
data = f.read(1024)
d.mime_type = magic.from_buffer(data, mime=True)
d.save()
f.close()
def add_file_extensions(apps, schema_editor):
Document = apps.get_model("documents", "Document")

View File

@@ -0,0 +1,25 @@
# Generated by Django 3.1.4 on 2020-12-08 22:09
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('documents', '1005_checksums'),
]
operations = [
migrations.RemoveField(
model_name='correspondent',
name='slug',
),
migrations.RemoveField(
model_name='documenttype',
name='slug',
),
migrations.RemoveField(
model_name='tag',
name='slug',
),
]

View File

@@ -0,0 +1,37 @@
# Generated by Django 3.1.4 on 2020-12-12 14:41
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('documents', '1006_auto_20201208_2209'),
]
operations = [
migrations.CreateModel(
name='SavedView',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128)),
('show_on_dashboard', models.BooleanField()),
('show_in_sidebar', models.BooleanField()),
('sort_field', models.CharField(max_length=128)),
('sort_reverse', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
),
migrations.CreateModel(
name='SavedViewFilterRule',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('rule_type', models.PositiveIntegerField(choices=[(0, 'Title contains'), (1, 'Content contains'), (2, 'ASN is'), (3, 'Correspondent is'), (4, 'Document type is'), (5, 'Is in inbox'), (6, 'Has tag'), (7, 'Has any tag'), (8, 'Created before'), (9, 'Created after'), (10, 'Created year is'), (11, 'Created month is'), (12, 'Created day is'), (13, 'Added before'), (14, 'Added after'), (15, 'Modified before'), (16, 'Modified after'), (17, 'Does not have tag')])),
('value', models.CharField(max_length=128)),
('saved_view', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview')),
],
),
]

View File

@@ -0,0 +1,34 @@
# Generated by Django 3.1.4 on 2020-12-16 17:36
from django.db import migrations
import django.db.models.functions.text
class Migration(migrations.Migration):
dependencies = [
('documents', '1007_savedview_savedviewfilterrule'),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
),
migrations.AlterModelOptions(
name='document',
options={'ordering': ('-created',)},
),
migrations.AlterModelOptions(
name='documenttype',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
),
migrations.AlterModelOptions(
name='tag',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
),
]

View File

@@ -0,0 +1,29 @@
# Generated by Django 3.1.4 on 2020-12-16 20:05
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('documents', '1008_auto_20201216_1736'),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': ('name',)},
),
migrations.AlterModelOptions(
name='documenttype',
options={'ordering': ('name',)},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': ('name',)},
),
migrations.AlterModelOptions(
name='tag',
options={'ordering': ('name',)},
),
]

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.4 on 2021-01-01 21:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1009_auto_20201216_2005'),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
field=models.CharField(blank=True, max_length=128, null=True),
),
]

View File

@@ -0,0 +1,250 @@
# Generated by Django 3.1.4 on 2021-01-01 23:40
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('documents', '1010_auto_20210101_2159'),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': ('name',), 'verbose_name': 'correspondent', 'verbose_name_plural': 'correspondents'},
),
migrations.AlterModelOptions(
name='document',
options={'ordering': ('-created',), 'verbose_name': 'document', 'verbose_name_plural': 'documents'},
),
migrations.AlterModelOptions(
name='documenttype',
options={'verbose_name': 'document type', 'verbose_name_plural': 'document types'},
),
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-created',), 'verbose_name': 'log', 'verbose_name_plural': 'logs'},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': ('name',), 'verbose_name': 'saved view', 'verbose_name_plural': 'saved views'},
),
migrations.AlterModelOptions(
name='savedviewfilterrule',
options={'verbose_name': 'filter rule', 'verbose_name_plural': 'filter rules'},
),
migrations.AlterModelOptions(
name='tag',
options={'verbose_name': 'tag', 'verbose_name_plural': 'tags'},
),
migrations.AlterField(
model_name='correspondent',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
),
migrations.AlterField(
model_name='correspondent',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
),
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
),
migrations.AlterField(
model_name='correspondent',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
),
migrations.AlterField(
model_name='document',
name='added',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False, verbose_name='added'),
),
migrations.AlterField(
model_name='document',
name='archive_checksum',
field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True, verbose_name='archive checksum'),
),
migrations.AlterField(
model_name='document',
name='archive_serial_number',
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True, verbose_name='archive serial number'),
),
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True, verbose_name='checksum'),
),
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.', verbose_name='content'),
),
migrations.AlterField(
model_name='document',
name='correspondent',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.correspondent', verbose_name='correspondent'),
),
migrations.AlterField(
model_name='document',
name='created',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, verbose_name='created'),
),
migrations.AlterField(
model_name='document',
name='document_type',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype', verbose_name='document type'),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, verbose_name='filename'),
),
migrations.AlterField(
model_name='document',
name='mime_type',
field=models.CharField(editable=False, max_length=256, verbose_name='mime type'),
),
migrations.AlterField(
model_name='document',
name='modified',
field=models.DateTimeField(auto_now=True, db_index=True, verbose_name='modified'),
),
migrations.AlterField(
model_name='document',
name='storage_type',
field=models.CharField(choices=[('unencrypted', 'Unencrypted'), ('gpg', 'Encrypted with GNU Privacy Guard')], default='unencrypted', editable=False, max_length=11, verbose_name='storage type'),
),
migrations.AlterField(
model_name='document',
name='tags',
field=models.ManyToManyField(blank=True, related_name='documents', to='documents.Tag', verbose_name='tags'),
),
migrations.AlterField(
model_name='document',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=128, verbose_name='title'),
),
migrations.AlterField(
model_name='documenttype',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
),
migrations.AlterField(
model_name='documenttype',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
),
migrations.AlterField(
model_name='documenttype',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
),
migrations.AlterField(
model_name='documenttype',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
),
migrations.AlterField(
model_name='log',
name='created',
field=models.DateTimeField(auto_now_add=True, verbose_name='created'),
),
migrations.AlterField(
model_name='log',
name='group',
field=models.UUIDField(blank=True, null=True, verbose_name='group'),
),
migrations.AlterField(
model_name='log',
name='level',
field=models.PositiveIntegerField(choices=[(10, 'debug'), (20, 'information'), (30, 'warning'), (40, 'error'), (50, 'critical')], default=20, verbose_name='level'),
),
migrations.AlterField(
model_name='log',
name='message',
field=models.TextField(verbose_name='message'),
),
migrations.AlterField(
model_name='savedview',
name='name',
field=models.CharField(max_length=128, verbose_name='name'),
),
migrations.AlterField(
model_name='savedview',
name='show_in_sidebar',
field=models.BooleanField(verbose_name='show in sidebar'),
),
migrations.AlterField(
model_name='savedview',
name='show_on_dashboard',
field=models.BooleanField(verbose_name='show on dashboard'),
),
migrations.AlterField(
model_name='savedview',
name='sort_field',
field=models.CharField(max_length=128, verbose_name='sort field'),
),
migrations.AlterField(
model_name='savedview',
name='sort_reverse',
field=models.BooleanField(default=False, verbose_name='sort reverse'),
),
migrations.AlterField(
model_name='savedview',
name='user',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL, verbose_name='user'),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag')], verbose_name='rule type'),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='saved_view',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview', verbose_name='saved view'),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
field=models.CharField(blank=True, max_length=128, null=True, verbose_name='value'),
),
migrations.AlterField(
model_name='tag',
name='colour',
field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1, verbose_name='color'),
),
migrations.AlterField(
model_name='tag',
name='is_inbox_tag',
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.', verbose_name='is inbox tag'),
),
migrations.AlterField(
model_name='tag',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
),
migrations.AlterField(
model_name='tag',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
),
migrations.AlterField(
model_name='tag',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
),
]

View File

@@ -0,0 +1,330 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep
import pathvalidate
from django.conf import settings
from django.db import migrations, models
from django.template.defaultfilters import slugify
from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
###############################################################################
# This is code copied straight paperless before the change.
###############################################################################
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(doc):
if doc.filename:
fname = archive_name_from_filename(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
def generate_unique_filename(doc, archive_filename=False):
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name, replacement_text="-"
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name, replacement_text="-"
)
else:
document_type = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
###############################################################################
# This code performs bidirection archive file transformation.
###############################################################################
def parse_wrapper(parser, path, mime_type, file_name):
# this is here so that I can mock this out for testing.
parser.parse(path, mime_type, file_name)
def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \
ParseError
logger.info(
f"Regenerating archive document for document ID:{doc.id}"
)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
parse_wrapper(parser, source_path(doc), doc.mime_type,
os.path.basename(doc.filename))
doc.content = parser.get_text()
if parser.get_archive_path() and os.path.isfile(
parser.get_archive_path()):
doc.archive_filename = generate_unique_filename(
doc, archive_filename=True)
with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
os.makedirs(os.path.dirname(archive_path_new(doc)),
exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
logger.error(
f"Parser did not return an archive document for document "
f"ID:{doc.id}. Removing archive document."
)
doc.save()
return
except ParseError:
if try_num + 1 == retry_count:
logger.exception(
f"Unable to regenerate archive document for ID:{doc.id}. You "
f"need to invoke the document_archiver management command "
f"manually for that document."
)
doc.archive_checksum = None
doc.save()
return
else:
# This is mostly here for the tika parser in docker
# environemnts. The servers for parsing need to come up first,
# and the docker setup doesn't ensure that tika is running
# before attempting migrations.
logger.error("Parse error, will try again in 5 seconds...")
sleep(5)
finally:
parser.cleanup()
def move_old_to_new_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
affected_document_ids = set()
old_archive_path_to_id = {}
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
else:
old_archive_path_to_id[old_path] = doc.id
# check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class:
raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
for doc in Document.objects.filter(archive_checksum__isnull=False):
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
logger.debug(
f"Removing {old_path}"
)
os.unlink(old_path)
else:
# Set archive path for unaffected files
doc.archive_filename = archive_name_from_filename(doc.filename)
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename
)
# regenerate archive documents
for doc_id in affected_document_ids:
doc = Document.objects.get(id=doc_id)
create_archive_version(doc)
def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
old_archive_paths = set()
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if old_archive_path in old_archive_paths:
raise ValueError(
f"Cannot migrate: Archive file name {old_archive_path} of "
f"document {doc.filename} would clash with another archive "
f"filename.")
old_archive_paths.add(old_archive_path)
if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
raise ValueError(
f"Cannot migrate: Cannot move {new_archive_path} to "
f"{old_archive_path}: file already exists."
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if new_archive_path != old_archive_path:
logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
shutil.move(new_archive_path, old_archive_path)
class Migration(migrations.Migration):
dependencies = [
('documents', '1011_auto_20210101_2340'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_filename',
field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
),
]