Merge pull request #168 from kpj/feature-black

GitHub Actions workflow for black code formatting
Quinn Casey 2022-02-28 10:32:10 -08:00 committed by GitHub
commit 5c9c10a6db
144 changed files with 6258 additions and 3987 deletions


@ -1,37 +0,0 @@
#!/bin/bash
# Verify that all text files end in a trailing newline.
# Exit on first failing command.
set -e
# Exit on unset variable.
set -u
success=0
function is_plaintext_file() {
local file="$1"
if [[ $file == *.svg ]]; then
echo ""
return
fi
file --brief "${file}" | grep text
}
# Split strings on newlines.
IFS='
'
for file in $(git ls-files)
do
if [[ -z $(is_plaintext_file "${file}") ]]; then
continue
fi
if ! [[ -z "$(tail -c 1 "${file}")" ]]; then
printf "File must end in a trailing newline: %s\n" "${file}" >&2
success=255
fi
done
exit "${success}"


@ -1,26 +0,0 @@
#!/bin/bash
# Check for trailing whitespace at end of lines.
# Exit on first failing command.
set -e
# Exit on unset variable.
set -u
FOUND_TRAILING_WHITESPACE=0
while read -r line; do
if grep \
"\s$" \
--line-number \
--with-filename \
--binary-files=without-match \
--exclude="*.svg" \
--exclude="*.eps" \
"${line}"; then
echo "ERROR: Found trailing whitespace" >&2;
FOUND_TRAILING_WHITESPACE=1
fi
done < <(git ls-files)
exit "${FOUND_TRAILING_WHITESPACE}"


@ -80,21 +80,19 @@ jobs:
name: Codestyle
run: |
cd src/
pycodestyle
whitespace:
pycodestyle --max-line-length=88 --ignore=E121,E123,E126,E226,E24,E704,W503,W504,E203
codeformatting:
runs-on: ubuntu-20.04
steps:
-
name: Checkout
uses: actions/checkout@v2
-
name: Ensure there are no trailing spaces
run: |
.github/workflow-scripts/check-trailing-whitespace
-
name: Ensure all text files end with a trailing newline
run: |
.github/workflow-scripts/check-trailing-whitespace
name: Run black
uses: psf/black@stable
with:
options: "--check --diff"
version: "22.1.0"
tests:
runs-on: ubuntu-20.04
@ -145,7 +143,7 @@ jobs:
coveralls --service=github
build-release:
needs: [build-docker-image, documentation, tests, whitespace, codestyle]
needs: [build-docker-image, documentation, tests, codeformatting, codestyle]
runs-on: ubuntu-20.04
steps:
-
@ -256,7 +254,7 @@ jobs:
build-docker-image:
if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || startsWith(github.ref, 'refs/tags/ng-'))
runs-on: ubuntu-latest
needs: [tests, whitespace, codestyle]
needs: [tests, codeformatting, codestyle]
steps:
-
name: Prepare

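The codeformatting job added above runs Black 22.1.0 with "--check --diff", so CI only reports the files that would be reformatted and fails without modifying anything. A minimal sketch of reproducing that check locally from Python, assuming black is installed and the command is run from the repository root:

    import subprocess
    import sys

    # Ask black to report (but not write) any files it would reformat; --diff prints the changes.
    result = subprocess.run(
        ["black", "--check", "--diff", "."],
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    # black exits non-zero when at least one file would be reformatted, which is what fails the CI job.
    sys.exit(result.returncode)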

@ -11,7 +11,7 @@ If you want to implement something big:
## Python
Paperless supports python 3.8 and 3.9.
Paperless supports python 3.8 and 3.9. We format Python code with [Black](https://github.com/psf/black).
## Branches
@ -23,7 +23,7 @@ Paperless supports python 3.8 and 3.9.
## Testing:
Please test your code! I know it's a hassle, but it makes sure that your code works now and will allow us to detect regressions easily.
Please format and test your code! I know it's a hassle, but it makes sure that your code works now and will allow us to detect regressions easily.
To test your code, execute `pytest` in the src/ directory. This also generates an HTML coverage report, which you can use to see if you missed anything important during testing.
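For contributors who have not used Black before, a rough sketch of the kind of rewrite it performs; the function and arguments below are made up for illustration, and the output shown assumes Black's default 88-character line length:

    # Before: hand-formatted, single quotes, over the 88-character limit
    totals = compute_quarterly_totals('first quarter report', 'second quarter report', include_archived=True, round_to_cents=False)

    # After black (illustrative): double quotes, one argument per line, trailing comma
    totals = compute_quarterly_totals(
        "first quarter report",
        "second quarter report",
        include_archived=True,
        round_to_cents=False,
    )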


@ -65,3 +65,4 @@ pytest-xdist = "*"
sphinx = "~=3.4.2"
sphinx_rtd_theme = "*"
tox = "*"
black = "*"


@ -6,29 +6,29 @@ exec(open("../src/paperless/version.py").read())
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.imgmath',
'sphinx.ext.viewcode',
'sphinx_rtd_theme',
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"sphinx.ext.imgmath",
"sphinx.ext.viewcode",
"sphinx_rtd_theme",
]
# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
source_suffix = ".rst"
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
master_doc = "index"
# General information about the project.
project = u'Paperless-ngx'
copyright = u'2015-2022, Daniel Quinn, Jonas Winkler, and the paperless-ngx team'
project = "Paperless-ngx"
copyright = "2015-2022, Daniel Quinn, Jonas Winkler, and the paperless-ngx team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@ -47,180 +47,174 @@ release = ".".join([str(_) for _ in __version__[:3]])
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'paperless'
htmlhelp_basename = "paperless"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'paperless.tex', u'Paperless Documentation',
u'Daniel Quinn', 'manual'),
("index", "paperless.tex", "Paperless Documentation", "Daniel Quinn", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'paperless', u'Paperless Documentation',
[u'Daniel Quinn'], 1)
]
man_pages = [("index", "paperless", "Paperless Documentation", ["Daniel Quinn"], 1)]
# If true, show URL addresses after external links.
#man_show_urls = False
# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@ -229,93 +223,99 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'Paperless', u'Paperless Documentation',
u'Daniel Quinn', 'paperless', 'Scan, index, and archive all of your paper documents.',
'Miscellaneous'),
(
"index",
"Paperless",
"Paperless Documentation",
"Daniel Quinn",
"paperless",
"Scan, index, and archive all of your paper documents.",
"Miscellaneous",
),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
epub_title = u'Paperless'
epub_author = u'Daniel Quinn'
epub_publisher = u'Daniel Quinn'
epub_copyright = u'2015, Daniel Quinn'
epub_title = "Paperless"
epub_author = "Daniel Quinn"
epub_publisher = "Daniel Quinn"
epub_copyright = "2015, Daniel Quinn"
# The basename for the epub file. It defaults to the project name.
#epub_basename = u'Paperless'
# epub_basename = u'Paperless'
# The HTML theme for the epub output. Since the default themes are not optimized
# for small screen space, using the same theme for HTML and epub output is
# usually not wise. This defaults to 'epub', a theme designed to save visual
# space.
#epub_theme = 'epub'
# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''
# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''
# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''
# epub_identifier = ''
# A unique identification for the text.
#epub_uid = ''
# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
#epub_cover = ()
# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
#epub_guide = ()
# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []
# epub_pre_files = []
# HTML files shat should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []
# epub_post_files = []
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
epub_exclude_files = ["search.html"]
# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3
# epub_tocdepth = 3
# Allow duplicate toc entries.
#epub_tocdup = True
# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
#epub_tocscope = 'default'
# epub_tocscope = 'default'
# Fix unsupported image types using the PIL.
#epub_fix_images = False
# epub_fix_images = False
# Scale large images.
#epub_max_image_width = 0
# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#epub_show_urls = 'inline'
# epub_show_urls = 'inline'
# If false, no index is generated.
#epub_use_index = True
# epub_use_index = True
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}
intersphinx_mapping = {"http://docs.python.org/": None}


@ -108,6 +108,7 @@ Testing and code style:
* Run ``pytest`` in the src/ directory to execute all tests. This also generates an HTML coverage
report. When running tests, paperless.conf is loaded as well. However: the tests rely on the default
configuration. This is not ideal. But for now, make sure no settings except for DEBUG are overridden when testing.
* Run ``black`` to format your code.
* Run ``pycodestyle`` to test your code for issues with the configured code style settings.
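The ``black`` step above is normally just the command line tool, but for quick experiments black also exposes a small Python API. A minimal sketch, assuming black is installed; black.format_str and black.Mode are black's documented programmatic entry points, and the snippet being formatted is made up:

    import black

    # A deliberately messy snippet with single quotes and a trailing comma.
    messy = "x = {  'a':37,'b':42,\n}\n"

    # Preview how black would rewrite it, using the default 88-character mode.
    print(black.format_str(messy, mode=black.Mode()))
    # Illustrative output (the trailing comma keeps the dict expanded, quotes become double):
    # x = {
    #     "a": 37,
    #     "b": 42,
    # }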
.. note::


@ -2,35 +2,38 @@ import os
bind = f'0.0.0.0:{os.getenv("PAPERLESS_PORT", 8000)}'
workers = int(os.getenv("PAPERLESS_WEBSERVER_WORKERS", 2))
worker_class = 'paperless.workers.ConfigurableWorker'
worker_class = "paperless.workers.ConfigurableWorker"
timeout = 120
def pre_fork(server, worker):
pass
def pre_exec(server):
server.log.info("Forked child, re-executing.")
def when_ready(server):
server.log.info("Server is ready. Spawning workers")
def worker_int(worker):
worker.log.info("worker received INT or QUIT signal")
## get traceback info
import threading, sys, traceback
id2name = dict([(th.ident, th.name) for th in threading.enumerate()])
code = []
for threadId, stack in sys._current_frames().items():
code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""),
threadId))
code.append("\n# Thread: %s(%d)" % (id2name.get(threadId, ""), threadId))
for filename, lineno, name, line in traceback.extract_stack(stack):
code.append('File: "%s", line %d, in %s' % (filename,
lineno, name))
code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
if line:
code.append(" %s" % (line.strip()))
worker.log.debug("\n".join(code))
def worker_abort(worker):
worker.log.info("worker received SIGABRT signal")


@ -1,39 +1,32 @@
from django.contrib import admin
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule
from .models import (
Correspondent,
Document,
DocumentType,
Tag,
SavedView,
SavedViewFilterRule,
)
class CorrespondentAdmin(admin.ModelAdmin):
list_display = (
"name",
"match",
"matching_algorithm"
)
list_display = ("name", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
class TagAdmin(admin.ModelAdmin):
list_display = (
"name",
"color",
"match",
"matching_algorithm"
)
list_display = ("name", "color", "match", "matching_algorithm")
list_filter = ("color", "matching_algorithm")
list_editable = ("color", "match", "matching_algorithm")
class DocumentTypeAdmin(admin.ModelAdmin):
list_display = (
"name",
"match",
"matching_algorithm"
)
list_display = ("name", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
@ -49,18 +42,12 @@ class DocumentAdmin(admin.ModelAdmin):
"filename",
"checksum",
"archive_filename",
"archive_checksum"
"archive_checksum",
)
list_display_links = ("title",)
list_display = (
"id",
"title",
"mime_type",
"filename",
"archive_filename"
)
list_display = ("id", "title", "mime_type", "filename", "archive_filename")
list_filter = (
("mime_type"),
@ -79,6 +66,7 @@ class DocumentAdmin(admin.ModelAdmin):
def created_(self, obj):
return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
@ -92,11 +80,13 @@ class DocumentAdmin(admin.ModelAdmin):
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)
@ -109,9 +99,7 @@ class SavedViewAdmin(admin.ModelAdmin):
list_display = ("name", "user")
inlines = [
RuleInline
]
inlines = [RuleInline]
admin.site.register(Correspondent, CorrespondentAdmin)


@ -17,7 +17,7 @@ class DocumentsConfig(AppConfig):
set_correspondent,
set_document_type,
set_tags,
add_to_index
add_to_index,
)
document_consumption_finished.connect(add_inbox_tags)


@ -4,14 +4,12 @@ from documents.models import Document
class BulkArchiveStrategy:
def __init__(self, zipf: ZipFile):
self.zipf = zipf
def make_unique_filename(self,
doc: Document,
archive: bool = False,
folder: str = ""):
def make_unique_filename(
self, doc: Document, archive: bool = False, folder: str = ""
):
counter = 0
while True:
filename = folder + doc.get_public_filename(archive, counter)
@ -25,36 +23,31 @@ class BulkArchiveStrategy:
class OriginalsOnlyStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class ArchiveOnlyStrategy(BulkArchiveStrategy):
def __init__(self, zipf):
super(ArchiveOnlyStrategy, self).__init__(zipf)
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(doc.archive_path,
self.make_unique_filename(doc, archive=True))
self.zipf.write(
doc.archive_path, self.make_unique_filename(doc, archive=True)
)
else:
self.zipf.write(doc.source_path,
self.make_unique_filename(doc))
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class OriginalAndArchiveStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(
doc.archive_path, self.make_unique_filename(
doc, archive=True, folder="archive/"
)
doc.archive_path,
self.make_unique_filename(doc, archive=True, folder="archive/"),
)
self.zipf.write(
doc.source_path,
self.make_unique_filename(doc, folder="originals/")
doc.source_path, self.make_unique_filename(doc, folder="originals/")
)


@ -10,13 +10,11 @@ def set_correspondent(doc_ids, correspondent):
if correspondent:
correspondent = Correspondent.objects.get(id=correspondent)
qs = Document.objects.filter(
Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
affected_docs = [doc.id for doc in qs]
qs.update(correspondent=correspondent)
async_task(
"documents.tasks.bulk_update_documents", document_ids=affected_docs)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
return "OK"
@ -25,13 +23,11 @@ def set_document_type(doc_ids, document_type):
if document_type:
document_type = DocumentType.objects.get(id=document_type)
qs = Document.objects.filter(
Q(id__in=doc_ids) & ~Q(document_type=document_type))
qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(document_type=document_type))
affected_docs = [doc.id for doc in qs]
qs.update(document_type=document_type)
async_task(
"documents.tasks.bulk_update_documents", document_ids=affected_docs)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
return "OK"
@ -43,13 +39,11 @@ def add_tag(doc_ids, tag):
DocumentTagRelationship = Document.tags.through
DocumentTagRelationship.objects.bulk_create([
DocumentTagRelationship(
document_id=doc, tag_id=tag) for doc in affected_docs
])
DocumentTagRelationship.objects.bulk_create(
[DocumentTagRelationship(document_id=doc, tag_id=tag) for doc in affected_docs]
)
async_task(
"documents.tasks.bulk_update_documents", document_ids=affected_docs)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
return "OK"
@ -62,12 +56,10 @@ def remove_tag(doc_ids, tag):
DocumentTagRelationship = Document.tags.through
DocumentTagRelationship.objects.filter(
Q(document_id__in=affected_docs) &
Q(tag_id=tag)
Q(document_id__in=affected_docs) & Q(tag_id=tag)
).delete()
async_task(
"documents.tasks.bulk_update_documents", document_ids=affected_docs)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
return "OK"
@ -83,13 +75,15 @@ def modify_tags(doc_ids, add_tags, remove_tags):
tag_id__in=remove_tags,
).delete()
DocumentTagRelationship.objects.bulk_create([DocumentTagRelationship(
document_id=doc, tag_id=tag) for (doc, tag) in itertools.product(
affected_docs, add_tags)
], ignore_conflicts=True)
DocumentTagRelationship.objects.bulk_create(
[
DocumentTagRelationship(document_id=doc, tag_id=tag)
for (doc, tag) in itertools.product(affected_docs, add_tags)
],
ignore_conflicts=True,
)
async_task(
"documents.tasks.bulk_update_documents", document_ids=affected_docs)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
return "OK"


@ -16,28 +16,36 @@ def changed_password_check(app_configs, **kwargs):
try:
encrypted_doc = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG).first()
storage_type=Document.STORAGE_TYPE_GPG
).first()
except (OperationalError, ProgrammingError, FieldError):
return [] # No documents table yet
if encrypted_doc:
if not settings.PASSPHRASE:
return [Error(
"The database contains encrypted documents but no password "
"is set."
)]
return [
Error(
"The database contains encrypted documents but no password "
"is set."
)
]
if not GnuPG.decrypted(encrypted_doc.source_file):
return [Error(textwrap.dedent(
"""
return [
Error(
textwrap.dedent(
"""
The current password doesn't match the password of the
existing documents.
If you intend to change your password, you must first export
all of the old documents, start fresh with the new password
and then re-import them."
"""))]
"""
)
)
]
return []
@ -50,7 +58,11 @@ def parser_check(app_configs, **kwargs):
parsers.append(response[1])
if len(parsers) == 0:
return [Error("No parsers found. This is a bug. The consumer won't be "
"able to consume any documents without parsers.")]
return [
Error(
"No parsers found. This is a bug. The consumer won't be "
"able to consume any documents without parsers."
)
]
else:
return []


@ -39,8 +39,7 @@ def load_classifier():
try:
classifier.load()
except (ClassifierModelCorruptError,
IncompatibleClassifierVersionError):
except (ClassifierModelCorruptError, IncompatibleClassifierVersionError):
# there's something wrong with the model file.
logger.exception(
f"Unrecoverable error while loading document "
@ -49,14 +48,10 @@ def load_classifier():
os.unlink(settings.MODEL_FILE)
classifier = None
except OSError:
logger.exception(
f"IO error while loading document classification model"
)
logger.exception(f"IO error while loading document classification model")
classifier = None
except Exception:
logger.exception(
f"Unknown error while loading document classification model"
)
logger.exception(f"Unknown error while loading document classification model")
classifier = None
return classifier
@ -83,7 +78,8 @@ class DocumentClassifier(object):
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannor load classifier, incompatible versions.")
"Cannor load classifier, incompatible versions."
)
else:
try:
self.data_hash = pickle.load(f)
@ -125,30 +121,37 @@ class DocumentClassifier(object):
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
m = hashlib.sha1()
for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): # NOQA: E501
for doc in Document.objects.order_by("pk").exclude(
tags__is_inbox_tag=True
): # NOQA: E501
preprocessed_content = preprocess_content(doc.content)
m.update(preprocessed_content.encode('utf-8'))
m.update(preprocessed_content.encode("utf-8"))
data.append(preprocessed_content)
y = -1
dt = doc.document_type
if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
y = dt.pk
m.update(y.to_bytes(4, 'little', signed=True))
m.update(y.to_bytes(4, "little", signed=True))
labels_document_type.append(y)
y = -1
cor = doc.correspondent
if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
y = cor.pk
m.update(y.to_bytes(4, 'little', signed=True))
m.update(y.to_bytes(4, "little", signed=True))
labels_correspondent.append(y)
tags = sorted([tag.pk for tag in doc.tags.filter(
matching_algorithm=MatchingModel.MATCH_AUTO
)])
tags = sorted(
[
tag.pk
for tag in doc.tags.filter(
matching_algorithm=MatchingModel.MATCH_AUTO
)
]
)
for tag in tags:
m.update(tag.to_bytes(4, 'little', signed=True))
m.update(tag.to_bytes(4, "little", signed=True))
labels_tags.append(tags)
if not data:
@ -174,10 +177,7 @@ class DocumentClassifier(object):
logger.debug(
"{} documents, {} tag(s), {} correspondent(s), "
"{} document type(s).".format(
len(data),
num_tags,
num_correspondents,
num_document_types
len(data), num_tags, num_correspondents, num_document_types
)
)
@ -188,9 +188,7 @@ class DocumentClassifier(object):
# Step 2: vectorize data
logger.debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
min_df=0.01
analyzer="word", ngram_range=(1, 2), min_df=0.01
)
data_vectorized = self.data_vectorizer.fit_transform(data)
@ -201,54 +199,41 @@ class DocumentClassifier(object):
if num_tags == 1:
# Special case where only one tag has auto:
# Fallback to binary classification.
labels_tags = [label[0] if len(label) == 1 else -1
for label in labels_tags]
labels_tags = [
label[0] if len(label) == 1 else -1 for label in labels_tags
]
self.tags_binarizer = LabelBinarizer()
labels_tags_vectorized = self.tags_binarizer.fit_transform(
labels_tags).ravel()
labels_tags
).ravel()
else:
self.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = self.tags_binarizer.fit_transform(
labels_tags)
labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
self.tags_classifier = MLPClassifier(tol=0.01)
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
logger.debug(
"There are no tags. Not training tags classifier."
)
logger.debug("There are no tags. Not training tags classifier.")
if num_correspondents > 0:
logger.debug(
"Training correspondent classifier..."
)
logger.debug("Training correspondent classifier...")
self.correspondent_classifier = MLPClassifier(tol=0.01)
self.correspondent_classifier.fit(
data_vectorized,
labels_correspondent
)
self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
else:
self.correspondent_classifier = None
logger.debug(
"There are no correspondents. Not training correspondent "
"classifier."
"There are no correspondents. Not training correspondent " "classifier."
)
if num_document_types > 0:
logger.debug(
"Training document type classifier..."
)
logger.debug("Training document type classifier...")
self.document_type_classifier = MLPClassifier(tol=0.01)
self.document_type_classifier.fit(
data_vectorized,
labels_document_type
)
self.document_type_classifier.fit(data_vectorized, labels_document_type)
else:
self.document_type_classifier = None
logger.debug(
"There are no document types. Not training document type "
"classifier."
"There are no document types. Not training document type " "classifier."
)
self.data_hash = new_data_hash
@ -284,10 +269,10 @@ class DocumentClassifier(object):
X = self.data_vectorizer.transform([preprocess_content(content)])
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith('multilabel'):
if type_of_target(y).startswith("multilabel"):
# the usual case when there are multiple tags.
return list(tags_ids)
elif type_of_target(y) == 'binary' and tags_ids != -1:
elif type_of_target(y) == "binary" and tags_ids != -1:
# This is for when we have binary classification with only one
# tag and the result is to assign this tag.
return [tags_ids]


@ -15,15 +15,11 @@ from filelock import FileLock
from rest_framework.reverse import reverse
from .classifier import load_classifier
from .file_handling import create_source_path_directory, \
generate_unique_filename
from .file_handling import create_source_path_directory, generate_unique_filename
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
from .signals import (
document_consumption_finished,
document_consumption_started
)
from .signals import document_consumption_finished, document_consumption_started
class ConsumerError(Exception):
@ -49,23 +45,26 @@ class Consumer(LoggingMixin):
logging_name = "paperless.consumer"
def _send_progress(self, current_progress, max_progress, status,
message=None, document_id=None):
def _send_progress(
self, current_progress, max_progress, status, message=None, document_id=None
):
payload = {
'filename': os.path.basename(self.filename) if self.filename else None, # NOQA: E501
'task_id': self.task_id,
'current_progress': current_progress,
'max_progress': max_progress,
'status': status,
'message': message,
'document_id': document_id
"filename": os.path.basename(self.filename)
if self.filename
else None, # NOQA: E501
"task_id": self.task_id,
"current_progress": current_progress,
"max_progress": max_progress,
"status": status,
"message": message,
"document_id": document_id,
}
async_to_sync(self.channel_layer.group_send)("status_updates",
{'type': 'status_update',
'data': payload})
async_to_sync(self.channel_layer.group_send)(
"status_updates", {"type": "status_update", "data": payload}
)
def _fail(self, message, log_message=None, exc_info=None):
self._send_progress(100, 100, 'FAILED', message)
self._send_progress(100, 100, "FAILED", message)
self.log("error", log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}")
@ -84,19 +83,20 @@ class Consumer(LoggingMixin):
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
self._fail(
MESSAGE_FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found."
MESSAGE_FILE_NOT_FOUND, f"Cannot consume {self.path}: File not found."
)
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
if Document.objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum)
).exists(): # NOQA: E501
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate."
f"Not consuming {self.filename}: It is a duplicate.",
)
def pre_check_directories(self):
@ -113,10 +113,10 @@ class Consumer(LoggingMixin):
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.")
f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
)
self.log("info",
f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
@ -124,7 +124,7 @@ class Consumer(LoggingMixin):
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True
exc_info=True,
)
def run_post_consume_script(self, document):
@ -135,42 +135,44 @@ class Consumer(LoggingMixin):
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist."
f"{settings.POST_CONSUME_SCRIPT} does not exist.",
)
self.log(
"info",
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}"
"info", f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}"
)
try:
Popen((
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list(
"name", flat=True)))
)).wait()
Popen(
(
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True))),
)
).wait()
except Exception as e:
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True
exc_info=True,
)
def try_consume_file(self,
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None,
task_id=None):
def try_consume_file(
self,
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None,
task_id=None,
):
"""
Return the document object if it was successfully created.
"""
@ -183,7 +185,7 @@ class Consumer(LoggingMixin):
self.override_tag_ids = override_tag_ids
self.task_id = task_id or str(uuid.uuid4())
self._send_progress(0, 100, 'STARTING', MESSAGE_NEW_FILE)
self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
# this is for grouping logging entries for this particular file
# together.
@ -206,17 +208,12 @@ class Consumer(LoggingMixin):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
self._fail(
MESSAGE_UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}"
)
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
# Notify all listeners that we're going to do some work.
document_consumption_started.send(
sender=self.__class__,
filename=self.path,
logging_group=self.logging_group
sender=self.__class__, filename=self.path, logging_group=self.logging_group
)
self.run_pre_consume_script()
@ -243,21 +240,20 @@ class Consumer(LoggingMixin):
archive_path = None
try:
self._send_progress(20, 100, 'WORKING', MESSAGE_PARSING_DOCUMENT)
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, 'WORKING',
MESSAGE_GENERATING_THUMBNAIL)
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_optimised_thumbnail(
self.path, mime_type, self.filename)
self.path, mime_type, self.filename
)
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
self._send_progress(90, 100, 'WORKING',
MESSAGE_PARSE_DATE)
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
@ -266,7 +262,7 @@ class Consumer(LoggingMixin):
self._fail(
str(e),
f"Error while consuming document {self.filename}: {e}",
exc_info=True
exc_info=True,
)
# Prepare the document classifier.
@ -277,18 +273,14 @@ class Consumer(LoggingMixin):
classifier = load_classifier()
self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
# store the document.
document = self._store(
text=text,
date=date,
mime_type=mime_type
)
document = self._store(text=text, date=date, mime_type=mime_type)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
@ -297,7 +289,7 @@ class Consumer(LoggingMixin):
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
classifier=classifier,
)
# After everything is in the database, copy the files into
@ -306,24 +298,25 @@ class Consumer(LoggingMixin):
document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(document.storage_type,
self.path, document.source_path)
self._write(document.storage_type, self.path, document.source_path)
self._write(document.storage_type,
thumbnail, document.thumbnail_path)
self._write(
document.storage_type, thumbnail, document.thumbnail_path
)
if archive_path and os.path.isfile(archive_path):
document.archive_filename = generate_unique_filename(
document,
archive_filename=True
document, archive_filename=True
)
create_source_path_directory(document.archive_path)
self._write(document.storage_type,
archive_path, document.archive_path)
self._write(
document.storage_type, archive_path, document.archive_path
)
with open(archive_path, 'rb') as f:
with open(archive_path, "rb") as f:
document.archive_checksum = hashlib.md5(
f.read()).hexdigest()
f.read()
).hexdigest()
# Don't save with the lock active. Saving will cause the file
# renaming logic to aquire the lock as well.
@ -335,8 +328,8 @@ class Consumer(LoggingMixin):
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
os.path.dirname(self.path),
"._" + os.path.basename(self.path))
os.path.dirname(self.path), "._" + os.path.basename(self.path)
)
if os.path.isfile(shadow_file):
self.log("debug", "Deleting file {}".format(shadow_file))
@ -345,21 +338,17 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
str(e),
f"The following error occured while consuming "
f"{self.filename}: {e}",
exc_info=True
f"The following error occured while consuming " f"{self.filename}: {e}",
exc_info=True,
)
finally:
document_parser.cleanup()
self.run_post_consume_script(document)
self.log(
"info",
"Document {} consumption finished".format(document)
)
self.log("info", "Document {} consumption finished".format(document))
self._send_progress(100, 100, 'SUCCESS', MESSAGE_FINISHED, document.id)
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
return document
@ -373,8 +362,11 @@ class Consumer(LoggingMixin):
self.log("debug", "Saving record to database")
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
created = (
file_info.created
or date
or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime))
)
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@ -386,7 +378,7 @@ class Consumer(LoggingMixin):
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
storage_type=storage_type
storage_type=storage_type,
)
self.apply_overrides(document)
@ -398,11 +390,13 @@ class Consumer(LoggingMixin):
def apply_overrides(self, document):
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(
pk=self.override_correspondent_id)
pk=self.override_correspondent_id
)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(
pk=self.override_document_type_id)
pk=self.override_document_type_id
)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:


@ -12,7 +12,6 @@ logger = logging.getLogger("paperless.filehandling")
class defaultdictNoStr(defaultdict):
def __str__(self):
raise ValueError("Don't use {tags} directly.")
@ -63,24 +62,23 @@ def many_to_dictionary(field):
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find('_')
delimiter = t.name.find("_")
if delimiter == -1:
delimiter = t.name.find('-')
delimiter = t.name.find("-")
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter + 1:]
value = t.name[delimiter + 1 :]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_unique_filename(doc,
archive_filename=False):
def generate_unique_filename(doc, archive_filename=False):
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
@ -104,14 +102,17 @@ def generate_unique_filename(doc,
if archive_filename and doc.filename:
new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
if new_filename == old_filename or not os.path.exists(os.path.join(root, new_filename)): # NOQA: E501
if new_filename == old_filename or not os.path.exists(
os.path.join(root, new_filename)
): # NOQA: E501
return new_filename
counter = 0
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
doc, counter, archive_filename=archive_filename
)
if new_filename == old_filename:
# still the same as before.
return new_filename
@ -127,14 +128,11 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
",".join(sorted([tag.name for tag in doc.tags.all()])),
replacement_text="-",
)
if doc.correspondent:
@ -157,13 +155,14 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
asn = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_month=f"{doc.created.month:02}"
if doc.created
else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
@ -171,7 +170,7 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
added_day=f"{doc.added.day:02}" if doc.added else "none",
asn=asn,
tags=tags,
tag_list=tag_list
tag_list=tag_list,
).strip()
path = path.strip(os.sep)
@ -179,7 +178,8 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default"
)
counter_str = f"_{counter:02}" if counter else ""


@ -10,34 +10,24 @@ DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"]
class CorrespondentFilterSet(FilterSet):
class Meta:
model = Correspondent
fields = {
"name": CHAR_KWARGS
}
fields = {"name": CHAR_KWARGS}
class TagFilterSet(FilterSet):
class Meta:
model = Tag
fields = {
"name": CHAR_KWARGS
}
fields = {"name": CHAR_KWARGS}
class DocumentTypeFilterSet(FilterSet):
class Meta:
model = DocumentType
fields = {
"name": CHAR_KWARGS
}
fields = {"name": CHAR_KWARGS}
class TagsFilter(Filter):
def __init__(self, exclude=False, in_list=False):
super(TagsFilter, self).__init__()
self.exclude = exclude
@ -48,7 +38,7 @@ class TagsFilter(Filter):
return qs
try:
tag_ids = [int(x) for x in value.split(',')]
tag_ids = [int(x) for x in value.split(",")]
except ValueError:
return qs
@ -65,22 +55,19 @@ class TagsFilter(Filter):
class InboxFilter(Filter):
def filter(self, qs, value):
if value == 'true':
if value == "true":
return qs.filter(tags__is_inbox_tag=True)
elif value == 'false':
elif value == "false":
return qs.exclude(tags__is_inbox_tag=True)
else:
return qs
class TitleContentFilter(Filter):
def filter(self, qs, value):
if value:
return qs.filter(Q(title__icontains=value) |
Q(content__icontains=value))
return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
else:
return qs
@ -88,10 +75,7 @@ class TitleContentFilter(Filter):
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
label="Is tagged",
field_name="tags",
lookup_expr="isnull",
exclude=True
label="Is tagged", field_name="tags", lookup_expr="isnull", exclude=True
)
tags__id__all = TagsFilter()
@ -107,38 +91,24 @@ class DocumentFilterSet(FilterSet):
class Meta:
model = Document
fields = {
"title": CHAR_KWARGS,
"content": CHAR_KWARGS,
"archive_serial_number": INT_KWARGS,
"created": DATE_KWARGS,
"added": DATE_KWARGS,
"modified": DATE_KWARGS,
"correspondent": ["isnull"],
"correspondent__id": ID_KWARGS,
"correspondent__name": CHAR_KWARGS,
"tags__id": ID_KWARGS,
"tags__name": CHAR_KWARGS,
"document_type": ["isnull"],
"document_type__id": ID_KWARGS,
"document_type__name": CHAR_KWARGS,
}
class LogFilterSet(FilterSet):
class Meta:
model = Log
fields = {
"level": INT_KWARGS,
"created": DATE_KWARGS,
"group": ID_KWARGS
}
fields = {"level": INT_KWARGS, "created": DATE_KWARGS, "group": ID_KWARGS}


@ -21,51 +21,22 @@ logger = logging.getLogger("paperless.index")
def get_schema():
return Schema(
id=NUMERIC(
stored=True,
unique=True
),
title=TEXT(
sortable=True
),
id=NUMERIC(stored=True, unique=True),
title=TEXT(sortable=True),
content=TEXT(),
asn=NUMERIC(
sortable=True
),
correspondent=TEXT(
sortable=True
),
asn=NUMERIC(sortable=True),
correspondent=TEXT(sortable=True),
correspondent_id=NUMERIC(),
has_correspondent=BOOLEAN(),
tag=KEYWORD(
commas=True,
scorable=True,
lowercase=True
),
tag_id=KEYWORD(
commas=True,
scorable=True
),
tag=KEYWORD(commas=True, scorable=True, lowercase=True),
tag_id=KEYWORD(commas=True, scorable=True),
has_tag=BOOLEAN(),
type=TEXT(
sortable=True
),
type=TEXT(sortable=True),
type_id=NUMERIC(),
has_type=BOOLEAN(),
created=DATETIME(
sortable=True
),
modified=DATETIME(
sortable=True
),
added=DATETIME(
sortable=True
),
created=DATETIME(sortable=True),
modified=DATETIME(sortable=True),
added=DATETIME(sortable=True),
)
@ -132,7 +103,7 @@ def remove_document(writer, doc):
def remove_document_by_id(writer, doc_id):
writer.delete_by_term('id', doc_id)
writer.delete_by_term("id", doc_id)
def add_or_update_document(document):
@ -146,48 +117,47 @@ def remove_document_from_index(document):
class DelayedQuery:
def _get_query(self):
raise NotImplementedError()
def _get_query_filter(self):
criterias = []
for k, v in self.query_params.items():
if k == 'correspondent__id':
criterias.append(query.Term('correspondent_id', v))
elif k == 'tags__id__all':
if k == "correspondent__id":
criterias.append(query.Term("correspondent_id", v))
elif k == "tags__id__all":
for tag_id in v.split(","):
criterias.append(query.Term('tag_id', tag_id))
elif k == 'document_type__id':
criterias.append(query.Term('type_id', v))
elif k == 'correspondent__isnull':
criterias.append(query.Term("tag_id", tag_id))
elif k == "document_type__id":
criterias.append(query.Term("type_id", v))
elif k == "correspondent__isnull":
criterias.append(query.Term("has_correspondent", v == "false"))
elif k == 'is_tagged':
elif k == "is_tagged":
criterias.append(query.Term("has_tag", v == "true"))
elif k == 'document_type__isnull':
elif k == "document_type__isnull":
criterias.append(query.Term("has_type", v == "false"))
elif k == 'created__date__lt':
elif k == "created__date__lt":
criterias.append(
query.DateRange("created", start=None, end=isoparse(v)))
elif k == 'created__date__gt':
query.DateRange("created", start=None, end=isoparse(v))
)
elif k == "created__date__gt":
criterias.append(
query.DateRange("created", start=isoparse(v), end=None))
elif k == 'added__date__gt':
criterias.append(
query.DateRange("added", start=isoparse(v), end=None))
elif k == 'added__date__lt':
criterias.append(
query.DateRange("added", start=None, end=isoparse(v)))
query.DateRange("created", start=isoparse(v), end=None)
)
elif k == "added__date__gt":
criterias.append(query.DateRange("added", start=isoparse(v), end=None))
elif k == "added__date__lt":
criterias.append(query.DateRange("added", start=None, end=isoparse(v)))
if len(criterias) > 0:
return query.And(criterias)
else:
return None
def _get_query_sortedby(self):
if 'ordering' not in self.query_params:
if "ordering" not in self.query_params:
return None, False
field: str = self.query_params['ordering']
field: str = self.query_params["ordering"]
sort_fields_map = {
"created": "created",
@ -196,10 +166,10 @@ class DelayedQuery:
"title": "title",
"correspondent__name": "correspondent",
"document_type__name": "type",
"archive_serial_number": "asn"
"archive_serial_number": "asn",
}
if field.startswith('-'):
if field.startswith("-"):
field = field[1:]
reverse = True
else:
@ -235,24 +205,23 @@ class DelayedQuery:
pagenum=math.floor(item.start / self.page_size) + 1,
pagelen=self.page_size,
sortedby=sortedby,
reverse=reverse
reverse=reverse,
)
page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
page.results.fragmenter = highlight.ContextFragmenter(surround=50)
page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
if (not self.first_score and
len(page.results) > 0 and
sortedby is None):
if not self.first_score and len(page.results) > 0 and sortedby is None:
self.first_score = page.results[0].score
page.results.top_n = list(map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1]
),
page.results.top_n
))
page.results.top_n = list(
map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1],
),
page.results.top_n,
)
)
self.saved_results[item.start] = page
@ -260,12 +229,12 @@ class DelayedQuery:
class DelayedFullTextQuery(DelayedQuery):
def _get_query(self):
q_str = self.query_params['query']
q_str = self.query_params["query"]
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
self.searcher.ixreader.schema)
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())
q = qp.parse(q_str)
@ -277,18 +246,17 @@ class DelayedFullTextQuery(DelayedQuery):
class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self):
more_like_doc_id = int(self.query_params['more_like_id'])
more_like_doc_id = int(self.query_params["more_like_id"])
content = Document.objects.get(id=more_like_doc_id).content
docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text(
'content', content, numterms=20,
model=classify.Bo1Model, normalize=False)
"content", content, numterms=20, model=classify.Bo1Model, normalize=False
)
q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
[query.Term("content", word, boost=weight) for word, weight in kts]
)
mask = {docnum}
return q, mask
@ -298,6 +266,7 @@ def autocomplete(ix, term, limit=10):
with ix.reader() as reader:
terms = []
for (score, t) in reader.most_distinctive_terms(
"content", number=limit, prefix=term.lower()):
"content", number=limit, prefix=term.lower()
):
terms.append(t)
return terms


@ -17,12 +17,7 @@ class LoggingMixin:
if self.logging_name:
logger = logging.getLogger(self.logging_name)
else:
name = ".".join([
self.__class__.__module__,
self.__class__.__name__
])
name = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(name)
getattr(logger, level)(message, extra={
"group": self.logging_group
}, **kwargs)
getattr(logger, level)(message, extra={"group": self.logging_group}, **kwargs)


@ -19,7 +19,7 @@ class Command(BaseCommand):
parser.add_argument(
"--passphrase",
help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
"specify it here"
"specify it here",
)
def handle(self, *args, **options):
@ -50,12 +50,12 @@ class Command(BaseCommand):
def __gpg_to_unencrypted(passphrase):
encrypted_files = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG)
storage_type=Document.STORAGE_TYPE_GPG
)
for document in encrypted_files:
print("Decrypting {}".format(
document).encode('utf-8'))
print("Decrypting {}".format(document).encode("utf-8"))
old_paths = [document.source_path, document.thumbnail_path]
@ -66,10 +66,11 @@ class Command(BaseCommand):
ext = os.path.splitext(document.filename)[1]
if not ext == '.gpg':
if not ext == ".gpg":
raise CommandError(
f"Abort: encrypted file {document.source_path} does not "
f"end with .gpg")
f"end with .gpg"
)
document.filename = os.path.splitext(document.filename)[0]
@ -80,7 +81,8 @@ class Command(BaseCommand):
f.write(raw_thumb)
Document.objects.filter(id=document.id).update(
storage_type=document.storage_type, filename=document.filename)
storage_type=document.storage_type, filename=document.filename
)
for path in old_paths:
os.unlink(path)


@ -16,8 +16,7 @@ from whoosh.writing import AsyncWriter
from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory, \
generate_unique_filename
from ...file_handling import create_source_path_directory, generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
@ -32,51 +31,49 @@ def handle_document(document_id):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})")
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})"
)
return
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(
document.source_path,
mime_type,
document.get_public_filename())
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_optimised_thumbnail(
document.source_path,
mime_type,
document.get_public_filename()
document.source_path, mime_type, document.get_public_filename()
)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document, archive_filename=True)
document, archive_filename=True
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(),
document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
with index.open_index_writer() as writer:
index.update_document(writer, document)
except Exception as e:
logger.exception(f"Error while parsing document {document} "
f"(ID: {document_id})")
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})"
)
finally:
parser.cleanup()
@ -88,29 +85,33 @@ class Command(BaseCommand):
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument(
"-f", "--overwrite",
"-f",
"--overwrite",
default=False,
action="store_true",
help="Recreates the archived document for documents that already "
"have an archived version."
"have an archived version.",
)
parser.add_argument(
"-d", "--document",
"-d",
"--document",
default=None,
type=int,
required=False,
help="Specify the ID of a document, and this command will only "
"run on this specific document."
"run on this specific document.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
@ -119,18 +120,17 @@ class Command(BaseCommand):
overwrite = options["overwrite"]
if options['document']:
documents = Document.objects.filter(pk=options['document'])
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
documents = Document.objects.all()
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.has_archive_version,
documents
document_ids = list(
map(
lambda doc: doc.id,
filter(lambda d: overwrite or not d.has_archive_version, documents),
)
))
)
# Note to future self: this prevents django from reusing database
# connections between processes, which is bad and does not work
@ -141,13 +141,12 @@ class Command(BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(tqdm.tqdm(
pool.imap_unordered(
handle_document,
document_ids
),
total=len(document_ids),
disable=options['no_progress_bar']
))
list(
tqdm.tqdm(
pool.imap_unordered(handle_document, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
)
)
except KeyboardInterrupt:
print("Aborting...")

View File

@ -23,24 +23,21 @@ logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
and get or create Tag IDs for every directory.
"""
tag_ids = set()
path_parts = Path(filepath).relative_to(
settings.CONSUMPTION_DIR).parent.parts
path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(Tag.objects.get_or_create(name__iexact=part, defaults={
"name": part
})[0].pk)
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk
)
return tag_ids
def _is_ignored(filepath: str) -> bool:
filepath_relative = PurePath(filepath).relative_to(
settings.CONSUMPTION_DIR)
return any(
filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
def _consume(filepath):
@ -48,13 +45,11 @@ def _consume(filepath):
return
if not os.path.isfile(filepath):
logger.debug(
f"Not consuming file {filepath}: File has moved.")
logger.debug(f"Not consuming file {filepath}: File has moved.")
return
if not is_file_ext_supported(os.path.splitext(filepath)[1]):
logger.warning(
f"Not consuming file {filepath}: Unknown file extension.")
logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
return
tag_ids = None
@ -66,10 +61,12 @@ def _consume(filepath):
try:
logger.info(f"Adding {filepath} to the task queue.")
async_task("documents.tasks.consume_file",
filepath,
override_tag_ids=tag_ids if tag_ids else None,
task_name=os.path.basename(filepath)[:100])
async_task(
"documents.tasks.consume_file",
filepath,
override_tag_ids=tag_ids if tag_ids else None,
task_name=os.path.basename(filepath)[:100],
)
except Exception as e:
# Catch all so that the consumer won't crash.
# This is also what the test case is listening for to check for
@ -88,8 +85,9 @@ def _consume_wait_unmodified(file):
try:
new_mtime = os.stat(file).st_mtime
except FileNotFoundError:
logger.debug(f"File {file} moved while waiting for it to remain "
f"unmodified.")
logger.debug(
f"File {file} moved while waiting for it to remain " f"unmodified."
)
return
if new_mtime == mtime:
_consume(file)
@ -102,16 +100,11 @@ def _consume_wait_unmodified(file):
class Handler(FileSystemEventHandler):
def on_created(self, event):
Thread(
target=_consume_wait_unmodified, args=(event.src_path,)
).start()
Thread(target=_consume_wait_unmodified, args=(event.src_path,)).start()
def on_moved(self, event):
Thread(
target=_consume_wait_unmodified, args=(event.dest_path,)
).start()
Thread(target=_consume_wait_unmodified, args=(event.dest_path,)).start()
class Command(BaseCommand):
@ -130,26 +123,19 @@ class Command(BaseCommand):
"directory",
default=settings.CONSUMPTION_DIR,
nargs="?",
help="The consumption directory."
)
parser.add_argument(
"--oneshot",
action="store_true",
help="Run only once."
help="The consumption directory.",
)
parser.add_argument("--oneshot", action="store_true", help="Run only once.")
def handle(self, *args, **options):
directory = options["directory"]
recursive = settings.CONSUMER_RECURSIVE
if not directory:
raise CommandError(
"CONSUMPTION_DIR does not appear to be set."
)
raise CommandError("CONSUMPTION_DIR does not appear to be set.")
if not os.path.isdir(directory):
raise CommandError(
f"Consumption directory {directory} does not exist")
raise CommandError(f"Consumption directory {directory} does not exist")
if recursive:
for dirpath, _, filenames in os.walk(directory):
@ -171,8 +157,7 @@ class Command(BaseCommand):
logger.debug("Consumer exiting.")
def handle_polling(self, directory, recursive):
logger.info(
f"Polling directory for changes: {directory}")
logger.info(f"Polling directory for changes: {directory}")
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
self.observer.schedule(Handler(), directory, recursive=recursive)
self.observer.start()
@ -186,8 +171,7 @@ class Command(BaseCommand):
self.observer.join()
def handle_inotify(self, directory, recursive):
logger.info(
f"Using inotify to watch directory for changes: {directory}")
logger.info(f"Using inotify to watch directory for changes: {directory}")
inotify = INotify()
inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
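
As the docstring above says, every directory between CONSUMPTION_DIR and the consumed file becomes a tag. A Django-free sketch of just the path handling (the real helper additionally calls Tag.objects.get_or_create for each part); the example paths are made up:

from pathlib import Path


def tag_names_from_path(filepath, consumption_dir):
    # Every intermediate directory name becomes a candidate tag name.
    return set(Path(filepath).relative_to(consumption_dir).parent.parts)


print(tag_names_from_path("/consume/taxes/2021/scan.pdf", "/consume"))
# -> {'taxes', '2021'}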

View File

@ -8,7 +8,9 @@ class Command(BaseCommand):
help = """
Trains the classifier on your data and saves the resulting models to a
file. The document consumer will then automatically use this new model.
""".replace(" ", "")
""".replace(
" ", ""
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)

View File

@ -12,10 +12,19 @@ from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from filelock import FileLock
from documents.models import Document, Correspondent, Tag, DocumentType, \
SavedView, SavedViewFilterRule
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from documents.models import (
Document,
Correspondent,
Tag,
DocumentType,
SavedView,
SavedViewFilterRule,
)
from documents.settings import (
EXPORTER_FILE_NAME,
EXPORTER_THUMBNAIL_NAME,
EXPORTER_ARCHIVE_NAME,
)
from paperless.db import GnuPG
from paperless_mail.models import MailAccount, MailRule
from ...file_handling import generate_filename, delete_empty_directories
@ -27,41 +36,46 @@ class Command(BaseCommand):
Decrypt and rename all files in our collection into a given target
directory. And include a manifest file containing document data for
easy import.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument("target")
parser.add_argument(
"-c", "--compare-checksums",
"-c",
"--compare-checksums",
default=False,
action="store_true",
help="Compare file checksums when determining whether to export "
"a file or not. If not specified, file size and time "
"modified is used instead."
"a file or not. If not specified, file size and time "
"modified is used instead.",
)
parser.add_argument(
"-f", "--use-filename-format",
"-f",
"--use-filename-format",
default=False,
action="store_true",
help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
"export directory, if configured."
"export directory, if configured.",
)
parser.add_argument(
"-d", "--delete",
"-d",
"--delete",
default=False,
action="store_true",
help="After exporting, delete files in the export directory that "
"do not belong to the current export, such as files from "
"deleted documents."
"do not belong to the current export, such as files from "
"deleted documents.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def __init__(self, *args, **kwargs):
@ -76,9 +90,9 @@ class Command(BaseCommand):
def handle(self, *args, **options):
self.target = options["target"]
self.compare_checksums = options['compare_checksums']
self.use_filename_format = options['use_filename_format']
self.delete = options['delete']
self.compare_checksums = options["compare_checksums"]
self.use_filename_format = options["use_filename_format"]
self.delete = options["delete"]
if not os.path.exists(self.target):
raise CommandError("That path doesn't exist")
@ -87,7 +101,7 @@ class Command(BaseCommand):
raise CommandError("That path doesn't appear to be writable")
with FileLock(settings.MEDIA_LOCK):
self.dump(options['no_progress_bar'])
self.dump(options["no_progress_bar"])
def dump(self, progress_bar_disable=False):
# 1. Take a snapshot of what files exist in the current export folder
@ -100,43 +114,48 @@ class Command(BaseCommand):
# documents
with transaction.atomic():
manifest = json.loads(
serializers.serialize("json", Correspondent.objects.all()))
serializers.serialize("json", Correspondent.objects.all())
)
manifest += json.loads(serializers.serialize(
"json", Tag.objects.all()))
manifest += json.loads(serializers.serialize("json", Tag.objects.all()))
manifest += json.loads(serializers.serialize(
"json", DocumentType.objects.all()))
manifest += json.loads(
serializers.serialize("json", DocumentType.objects.all())
)
documents = Document.objects.order_by("id")
document_map = {d.pk: d for d in documents}
document_manifest = json.loads(
serializers.serialize("json", documents))
document_manifest = json.loads(serializers.serialize("json", documents))
manifest += document_manifest
manifest += json.loads(serializers.serialize(
"json", MailAccount.objects.all()))
manifest += json.loads(
serializers.serialize("json", MailAccount.objects.all())
)
manifest += json.loads(serializers.serialize(
"json", MailRule.objects.all()))
manifest += json.loads(
serializers.serialize("json", MailRule.objects.all())
)
manifest += json.loads(serializers.serialize(
"json", SavedView.objects.all()))
manifest += json.loads(
serializers.serialize("json", SavedView.objects.all())
)
manifest += json.loads(serializers.serialize(
"json", SavedViewFilterRule.objects.all()))
manifest += json.loads(
serializers.serialize("json", SavedViewFilterRule.objects.all())
)
manifest += json.loads(serializers.serialize(
"json", User.objects.all()))
manifest += json.loads(serializers.serialize("json", User.objects.all()))
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=progress_bar_disable
disable=progress_bar_disable,
):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
document_dict["fields"][
"storage_type"
] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
document = document_map[document_dict["pk"]]
@ -145,11 +164,10 @@ class Command(BaseCommand):
while True:
if self.use_filename_format:
base_name = generate_filename(
document, counter=filename_counter,
append_gpg=False)
document, counter=filename_counter, append_gpg=False
)
else:
base_name = document.get_public_filename(
counter=filename_counter)
base_name = document.get_public_filename(counter=filename_counter)
if base_name not in self.exported_files:
self.exported_files.append(base_name)
@ -193,22 +211,19 @@ class Command(BaseCommand):
f.write(GnuPG.decrypted(document.archive_path))
os.utime(archive_target, times=(t, t))
else:
self.check_and_copy(document.source_path,
document.checksum,
original_target)
self.check_and_copy(
document.source_path, document.checksum, original_target
)
self.check_and_copy(document.thumbnail_path,
None,
thumbnail_target)
self.check_and_copy(document.thumbnail_path, None, thumbnail_target)
if archive_target:
self.check_and_copy(document.archive_path,
document.archive_checksum,
archive_target)
self.check_and_copy(
document.archive_path, document.archive_checksum, archive_target
)
# 4. write manifest to target folder
manifest_path = os.path.abspath(
os.path.join(self.target, "manifest.json"))
manifest_path = os.path.abspath(os.path.join(self.target, "manifest.json"))
with open(manifest_path, "w") as f:
json.dump(manifest, f, indent=2)
@ -222,8 +237,9 @@ class Command(BaseCommand):
for f in self.files_in_export_dir:
os.remove(f)
delete_empty_directories(os.path.abspath(os.path.dirname(f)),
os.path.abspath(self.target))
delete_empty_directories(
os.path.abspath(os.path.dirname(f)), os.path.abspath(self.target)
)
def check_and_copy(self, source, source_checksum, target):
if os.path.abspath(target) in self.files_in_export_dir:
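
check_and_copy above skips files already present in the export directory; with --compare-checksums the decision is based on a checksum of the file contents rather than size and modification time. A self-contained sketch of a contents checksum in the style this codebase uses elsewhere (MD5 of the raw bytes, as in the archiver above); the function name is a placeholder, not the exporter's actual helper:

import hashlib


def file_checksum(path):
    # MD5 of the raw file bytes, hex-encoded, for change detection.
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()


if __name__ == "__main__":
    print(file_checksum(__file__))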

View File

@ -12,8 +12,11 @@ from django.db.models.signals import post_save, m2m_changed
from filelock import FileLock
from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from documents.settings import (
EXPORTER_FILE_NAME,
EXPORTER_THUMBNAIL_NAME,
EXPORTER_ARCHIVE_NAME,
)
from ...file_handling import create_source_path_directory
from ...signals.handlers import update_filename_and_move_files
@ -32,7 +35,9 @@ class Command(BaseCommand):
help = """
Using a manifest.json file, load the data from there, and import the
documents it refers to.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument("source")
@ -40,7 +45,7 @@ class Command(BaseCommand):
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def __init__(self, *args, **kwargs):
@ -67,26 +72,27 @@ class Command(BaseCommand):
self.manifest = json.load(f)
self._check_manifest()
with disable_signal(post_save,
receiver=update_filename_and_move_files,
sender=Document):
with disable_signal(m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through):
with disable_signal(
post_save, receiver=update_filename_and_move_files, sender=Document
):
with disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
):
# Fill up the database with whatever is in the manifest
call_command("loaddata", manifest_path)
self._import_files_from_manifest(options['no_progress_bar'])
self._import_files_from_manifest(options["no_progress_bar"])
print("Updating search index...")
call_command('document_index', 'reindex')
call_command("document_index", "reindex")
@staticmethod
def _check_manifest_exists(path):
if not os.path.exists(path):
raise CommandError(
"That directory doesn't appear to contain a manifest.json "
"file."
"That directory doesn't appear to contain a manifest.json " "file."
)
def _check_manifest(self):
@ -98,15 +104,15 @@ class Command(BaseCommand):
if EXPORTER_FILE_NAME not in record:
raise CommandError(
'The manifest file contains a record which does not '
'refer to an actual document file.'
"The manifest file contains a record which does not "
"refer to an actual document file."
)
doc_file = record[EXPORTER_FILE_NAME]
if not os.path.exists(os.path.join(self.source, doc_file)):
raise CommandError(
'The manifest file refers to "{}" which does not '
'appear to be in the source directory.'.format(doc_file)
"appear to be in the source directory.".format(doc_file)
)
if EXPORTER_ARCHIVE_NAME in record:
@ -125,14 +131,11 @@ class Command(BaseCommand):
print("Copy files into paperless...")
manifest_documents = list(filter(
lambda r: r["model"] == "documents.document",
self.manifest))
manifest_documents = list(
filter(lambda r: r["model"] == "documents.document", self.manifest)
)
for record in tqdm.tqdm(
manifest_documents,
disable=progress_bar_disable
):
for record in tqdm.tqdm(manifest_documents, disable=progress_bar_disable):
document = Document.objects.get(pk=record["pk"])
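
The nested with disable_signal(...) blocks above keep update_filename_and_move_files from firing for every row that loaddata inserts during an import. A minimal sketch of what such a context manager can look like, assuming Django is importable; this stand-in is written for illustration and is not the project's actual implementation:

from contextlib import contextmanager

from django.dispatch import Signal


@contextmanager
def disable_signal(signal, receiver, sender=None):
    # Disconnect the receiver for the duration of the block, then restore it.
    signal.disconnect(receiver, sender=sender)
    try:
        yield
    finally:
        signal.connect(receiver, sender=sender)


document_saved = Signal()


def receiver(sender, **kwargs):
    print("receiver fired")


document_saved.connect(receiver)

with disable_signal(document_saved, receiver):
    document_saved.send(sender=None)  # suppressed

document_saved.send(sender=None)  # fires again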

View File

@ -9,17 +9,17 @@ class Command(BaseCommand):
help = "Manages the document index."
def add_arguments(self, parser):
parser.add_argument("command", choices=['reindex', 'optimize'])
parser.add_argument("command", choices=["reindex", "optimize"])
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
with transaction.atomic():
if options['command'] == 'reindex':
index_reindex(progress_bar_disable=options['no_progress_bar'])
elif options['command'] == 'optimize':
if options["command"] == "reindex":
index_reindex(progress_bar_disable=options["no_progress_bar"])
elif options["command"] == "optimize":
index_optimize()

View File

@ -11,14 +11,16 @@ class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
@ -26,7 +28,6 @@ class Command(BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
disable=options['no_progress_bar']
Document.objects.all(), disable=options["no_progress_bar"]
):
post_save.send(Document, instance=document)

View File

@ -18,60 +18,46 @@ class Command(BaseCommand):
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument(
"-c", "--correspondent",
default=False,
action="store_true"
)
parser.add_argument(
"-T", "--tags",
default=False,
action="store_true"
)
parser.add_argument(
"-t", "--document_type",
default=False,
action="store_true"
)
parser.add_argument(
"-i", "--inbox-only",
default=False,
action="store_true"
)
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
parser.add_argument("-T", "--tags", default=False, action="store_true")
parser.add_argument("-t", "--document_type", default=False, action="store_true")
parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
parser.add_argument(
"--use-first",
default=False,
action="store_true",
help="By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds.",
)
parser.add_argument(
"-f", "--overwrite",
"-f",
"--overwrite",
default=False,
action="store_true",
help="If set, the document retagger will overwrite any previously"
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules."
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"--suggest",
default=False,
action="store_true",
help="Return the suggestion, don't change anything."
help="Return the suggestion, don't change anything.",
)
parser.add_argument(
"--base-url",
help="The base URL to use to build the link to the documents."
"--base-url", help="The base URL to use to build the link to the documents."
)
def handle(self, *args, **options):
@ -86,38 +72,39 @@ class Command(BaseCommand):
classifier = load_classifier()
for document in tqdm.tqdm(
documents,
disable=options['no_progress_bar']
):
for document in tqdm.tqdm(documents, disable=options["no_progress_bar"]):
if options['correspondent']:
if options["correspondent"]:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
color=color,
)
if options['document_type']:
set_document_type(sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
if options["document_type"]:
set_document_type(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
color=color,
)
if options['tags']:
if options["tags"]:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
replace=options["overwrite"],
suggest=options["suggest"],
base_url=options["base_url"],
color=color,
)

View File

@ -6,18 +6,20 @@ class Command(BaseCommand):
help = """
This command checks your document archive for issues.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
messages = check_sanity(progress=not options['no_progress_bar'])
messages = check_sanity(progress=not options["no_progress_bar"])
messages.log_messages()

View File

@ -22,9 +22,7 @@ def _process_document(doc_in):
try:
thumb = parser.get_optimised_thumbnail(
document.source_path,
document.mime_type,
document.get_public_filename()
document.source_path, document.mime_type, document.get_public_filename()
)
shutil.move(thumb, document.thumbnail_path)
@ -36,29 +34,32 @@ class Command(BaseCommand):
help = """
This will regenerate the thumbnails for all documents.
""".replace(" ", "")
""".replace(
" ", ""
)
def add_arguments(self, parser):
parser.add_argument(
"-d", "--document",
"-d",
"--document",
default=None,
type=int,
required=False,
help="Specify the ID of a document, and this command will only "
"run on this specific document."
"run on this specific document.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR
if options['document']:
documents = Document.objects.filter(pk=options['document'])
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
documents = Document.objects.all()
@ -70,8 +71,10 @@ class Command(BaseCommand):
db.connections.close_all()
with multiprocessing.Pool() as pool:
list(tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=options['no_progress_bar']
))
list(
tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=options["no_progress_bar"],
)
)

View File

@ -10,11 +10,11 @@ class Command(LoadDataCommand):
"""
def parse_name(self, fixture_name):
self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
if fixture_name == '-':
return '-', 'json', 'stdin'
self.compression_formats["stdin"] = (lambda x, y: sys.stdin, None)
if fixture_name == "-":
return "-", "json", "stdin"
def find_fixtures(self, fixture_label):
if fixture_label == '-':
return [('-', None, '-')]
if fixture_label == "-":
return [("-", None, "-")]
return super(Command, self).find_fixtures(fixture_label)

View File

@ -12,16 +12,18 @@ class Command(BaseCommand):
help = """
Creates a Django superuser based on env variables.
""".replace(" ", "")
""".replace(
" ", ""
)
def handle(self, *args, **options):
username = os.getenv('PAPERLESS_ADMIN_USER')
username = os.getenv("PAPERLESS_ADMIN_USER")
if not username:
return
mail = os.getenv('PAPERLESS_ADMIN_MAIL', 'root@localhost')
password = os.getenv('PAPERLESS_ADMIN_PASSWORD')
mail = os.getenv("PAPERLESS_ADMIN_MAIL", "root@localhost")
password = os.getenv("PAPERLESS_ADMIN_PASSWORD")
# Check if user exists already, leave as is if it does
if User.objects.filter(username=username).exists():
@ -32,11 +34,10 @@ class Command(BaseCommand):
elif password:
# Create superuser based on env variables
User.objects.create_superuser(username, mail, password)
self.stdout.write(
f'Created superuser "{username}" with provided password.')
self.stdout.write(f'Created superuser "{username}" with provided password.')
else:
self.stdout.write(
f'Did not create superuser "{username}".')
self.stdout.write(f'Did not create superuser "{username}".')
self.stdout.write(
'Make sure you specified "PAPERLESS_ADMIN_PASSWORD" in your '
'"docker-compose.env" file.')
'"docker-compose.env" file.'
)

View File

@ -12,7 +12,8 @@ def log_reason(matching_model, document, reason):
class_name = type(matching_model).__name__
logger.debug(
f"{class_name} {matching_model.name} matched on document "
f"{document} because {reason}")
f"{document} because {reason}"
)
def match_correspondents(document, classifier):
@ -23,9 +24,9 @@ def match_correspondents(document, classifier):
correspondents = Correspondent.objects.all()
return list(filter(
lambda o: matches(o, document) or o.pk == pred_id,
correspondents))
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, correspondents)
)
def match_document_types(document, classifier):
@ -36,9 +37,9 @@ def match_document_types(document, classifier):
document_types = DocumentType.objects.all()
return list(filter(
lambda o: matches(o, document) or o.pk == pred_id,
document_types))
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, document_types)
)
def match_tags(document, classifier):
@ -49,9 +50,9 @@ def match_tags(document, classifier):
tags = Tag.objects.all()
return list(filter(
lambda o: matches(o, document) or o.pk in predicted_tag_ids,
tags))
return list(
filter(lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags)
)
def matches(matching_model, document):
@ -68,73 +69,73 @@ def matches(matching_model, document):
if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
for word in _split_match(matching_model):
search_result = re.search(
rf"\b{word}\b", document_content, **search_kwargs)
search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
if not search_result:
return False
log_reason(
matching_model, document,
f"it contains all of these words: {matching_model.match}"
matching_model,
document,
f"it contains all of these words: {matching_model.match}",
)
return True
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
for word in _split_match(matching_model):
if re.search(rf"\b{word}\b", document_content, **search_kwargs):
log_reason(
matching_model, document,
f"it contains this word: {word}"
)
log_reason(matching_model, document, f"it contains this word: {word}")
return True
return False
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
result = bool(re.search(
rf"\b{re.escape(matching_model.match)}\b",
document_content,
**search_kwargs
))
result = bool(
re.search(
rf"\b{re.escape(matching_model.match)}\b",
document_content,
**search_kwargs,
)
)
if result:
log_reason(
matching_model, document,
f"it contains this string: \"{matching_model.match}\""
matching_model,
document,
f'it contains this string: "{matching_model.match}"',
)
return result
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
try:
match = re.search(
re.compile(matching_model.match, **search_kwargs),
document_content
re.compile(matching_model.match, **search_kwargs), document_content
)
except re.error:
logger.error(
f"Error while processing regular expression "
f"{matching_model.match}"
f"Error while processing regular expression " f"{matching_model.match}"
)
return False
if match:
log_reason(
matching_model, document,
matching_model,
document,
f"the string {match.group()} matches the regular expression "
f"{matching_model.match}"
f"{matching_model.match}",
)
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from fuzzywuzzy import fuzz
match = re.sub(r'[^\w\s]', '', matching_model.match)
text = re.sub(r'[^\w\s]', '', document_content)
match = re.sub(r"[^\w\s]", "", matching_model.match)
text = re.sub(r"[^\w\s]", "", document_content)
if matching_model.is_insensitive:
match = match.lower()
text = text.lower()
if fuzz.partial_ratio(match, text) >= 90:
# TODO: make this better
log_reason(
matching_model, document,
matching_model,
document,
f"parts of the document content somehow match the string "
f"{matching_model.match}"
f"{matching_model.match}",
)
return True
else:
@ -162,8 +163,6 @@ def _split_match(matching_model):
normspace = re.compile(r"\s+").sub
return [
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
re.escape(
normspace(" ", (t[0] or t[1]).strip())
).replace(r"\ ", r"\s+")
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]
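
The matching changes above are again purely cosmetic; MATCH_ANY, MATCH_ALL, MATCH_LITERAL, MATCH_REGEX and MATCH_FUZZY behave as before. As a standalone illustration of the word-boundary test those branches are built on (not project code; the sample text is made up):

import re


def matches_all(words, content, insensitive=True):
    # MATCH_ALL-style check: every word must occur as a whole word.
    flags = re.IGNORECASE if insensitive else 0
    return all(re.search(rf"\b{re.escape(w)}\b", content, flags) for w in words)


def matches_any(words, content, insensitive=True):
    # MATCH_ANY-style check: at least one word must occur as a whole word.
    flags = re.IGNORECASE if insensitive else 0
    return any(re.search(rf"\b{re.escape(w)}\b", content, flags) for w in words)


text = "Invoice 2022-03 from ACME Corp"
print(matches_all(["invoice", "acme"], text))  # True
print(matches_any(["telekom", "acme"], text))  # True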

View File

@ -10,19 +10,33 @@ class Migration(migrations.Migration):
initial = True
dependencies = [
]
dependencies = []
operations = [
migrations.CreateModel(
name='Document',
name="Document",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('sender', models.CharField(blank=True, db_index=True, max_length=128)),
('title', models.CharField(blank=True, db_index=True, max_length=128)),
('content', models.TextField(db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]))),
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("sender", models.CharField(blank=True, db_index=True, max_length=128)),
("title", models.CharField(blank=True, db_index=True, max_length=128)),
(
"content",
models.TextField(
db_index=(
"mysql" not in settings.DATABASES["default"]["ENGINE"]
)
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
],
),
]

View File

@ -9,17 +9,19 @@ import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('documents', '0001_initial'),
("documents", "0001_initial"),
]
operations = [
migrations.AlterModelOptions(
name='document',
options={'ordering': ('sender', 'title')},
name="document",
options={"ordering": ("sender", "title")},
),
migrations.AlterField(
model_name='document',
name='created',
field=models.DateTimeField(default=django.utils.timezone.now, editable=False),
model_name="document",
name="created",
field=models.DateTimeField(
default=django.utils.timezone.now, editable=False
),
),
]

View File

@ -19,9 +19,11 @@ def move_sender_strings_to_sender_model(apps, schema_editor):
# Create the sender and log the relationship with the document
for document in document_model.objects.all():
if document.sender:
DOCUMENT_SENDER_MAP[document.pk], created = sender_model.objects.get_or_create(
name=document.sender,
defaults={"slug": slugify(document.sender)}
(
DOCUMENT_SENDER_MAP[document.pk],
created,
) = sender_model.objects.get_or_create(
name=document.sender, defaults={"slug": slugify(document.sender)}
)
@ -33,27 +35,39 @@ def realign_senders(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0002_auto_20151226_1316'),
("documents", "0002_auto_20151226_1316"),
]
operations = [
migrations.CreateModel(
name='Sender',
name="Sender",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField()),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField()),
],
),
migrations.RunPython(move_sender_strings_to_sender_model),
migrations.RemoveField(
model_name='document',
name='sender',
model_name="document",
name="sender",
),
migrations.AddField(
model_name='document',
name='sender',
field=models.ForeignKey(blank=True, on_delete=django.db.models.deletion.CASCADE, to='documents.Sender'),
model_name="document",
name="sender",
field=models.ForeignKey(
blank=True,
on_delete=django.db.models.deletion.CASCADE,
to="documents.Sender",
),
),
migrations.RunPython(realign_senders),
]
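
Migration 0003 above is a data migration: the two RunPython steps move the old free-text sender strings into the new Sender model and then re-point the documents at it. A generic sketch of the RunPython pattern, reusing the dependency shown above; the loop body is illustrative only:

from django.db import migrations


def forwards(apps, schema_editor):
    # Historical models are fetched through the app registry so the migration
    # keeps working even after the model classes change later on.
    Document = apps.get_model("documents", "Document")
    for document in Document.objects.all():
        pass  # per-row data shuffling would go here


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "0002_auto_20151226_1316"),
    ]

    operations = [
        migrations.RunPython(forwards, migrations.RunPython.noop),
    ]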

View File

@ -9,13 +9,19 @@ import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('documents', '0003_sender'),
("documents", "0003_sender"),
]
operations = [
migrations.AlterField(
model_name='document',
name='sender',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='documents', to='documents.Sender'),
model_name="document",
name="sender",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.Sender",
),
),
]

View File

@ -8,12 +8,12 @@ from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('documents', '0004_auto_20160114_1844'),
("documents", "0004_auto_20160114_1844"),
]
operations = [
migrations.AlterModelOptions(
name='sender',
options={'ordering': ('name',)},
name="sender",
options={"ordering": ("name",)},
),
]

View File

@ -8,30 +8,59 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0005_auto_20160123_0313'),
("documents", "0005_auto_20160123_0313"),
]
operations = [
migrations.CreateModel(
name='Tag',
name="Tag",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField(blank=True)),
('colour', models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#ffff99'), (12, '#b15928'), (13, '#000000'), (14, '#cccccc')], default=1)),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField(blank=True)),
(
"colour",
models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#ffff99"),
(12, "#b15928"),
(13, "#000000"),
(14, "#cccccc"),
],
default=1,
),
),
],
options={
'abstract': False,
"abstract": False,
},
),
migrations.AlterField(
model_name='sender',
name='slug',
model_name="sender",
name="slug",
field=models.SlugField(blank=True),
),
migrations.AddField(
model_name='document',
name='tags',
field=models.ManyToManyField(related_name='documents', to='documents.Tag'),
model_name="document",
name="tags",
field=models.ManyToManyField(related_name="documents", to="documents.Tag"),
),
]

View File

@ -8,23 +8,50 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0006_auto_20160123_0430'),
("documents", "0006_auto_20160123_0430"),
]
operations = [
migrations.AddField(
model_name='tag',
name='match',
model_name="tag",
name="match",
field=models.CharField(blank=True, max_length=256),
),
migrations.AddField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(blank=True, choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.', null=True),
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
blank=True,
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
null=True,
),
),
migrations.AlterField(
model_name='tag',
name='colour',
field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1),
model_name="tag",
name="colour",
field=models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
),
),
]

View File

@ -8,20 +8,32 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0007_auto_20160126_2114'),
("documents", "0007_auto_20160126_2114"),
]
operations = [
migrations.AddField(
model_name='document',
name='file_type',
field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF')], default='pdf', editable=False, max_length=4),
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
],
default="pdf",
editable=False,
max_length=4,
),
preserve_default=False,
),
migrations.AlterField(
model_name='document',
name='tags',
field=models.ManyToManyField(blank=True, related_name='documents', to='documents.Tag'),
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True, related_name="documents", to="documents.Tag"
),
),
]

View File

@ -8,13 +8,22 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0008_document_file_type'),
("documents", "0008_document_file_type"),
]
operations = [
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.'),
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
),
),
]

View File

@ -8,23 +8,48 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0009_auto_20160214_0040'),
("documents", "0009_auto_20160214_0040"),
]
operations = [
migrations.CreateModel(
name='Log',
name="Log",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('group', models.UUIDField(blank=True)),
('message', models.TextField()),
('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("group", models.UUIDField(blank=True)),
("message", models.TextField()),
(
"level",
models.PositiveIntegerField(
choices=[
(10, "Debugging"),
(20, "Informational"),
(30, "Warning"),
(40, "Error"),
(50, "Critical"),
],
default=20,
),
),
(
"component",
models.PositiveIntegerField(
choices=[(1, "Consumer"), (2, "Mail Fetcher")]
),
),
("created", models.DateTimeField(auto_now_add=True)),
("modified", models.DateTimeField(auto_now=True)),
],
options={
'ordering': ('-modified',),
"ordering": ("-modified",),
},
),
]

View File

@ -8,21 +8,21 @@ from django.db import migrations
class Migration(migrations.Migration):
atomic = False
dependencies = [
('documents', '0010_log'),
("documents", "0010_log"),
]
operations = [
migrations.RenameModel(
old_name='Sender',
new_name='Correspondent',
old_name="Sender",
new_name="Correspondent",
),
migrations.AlterModelOptions(
name='document',
options={'ordering': ('correspondent', 'title')},
name="document",
options={"ordering": ("correspondent", "title")},
),
migrations.RenameField(
model_name='document',
old_name='sender',
new_name='correspondent',
model_name="document",
old_name="sender",
new_name="correspondent",
),
]

View File

@ -23,37 +23,40 @@ class GnuPG(object):
@classmethod
def decrypted(cls, file_handle):
return cls.gpg.decrypt_file(
file_handle, passphrase=settings.PASSPHRASE).data
return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data
@classmethod
def encrypted(cls, file_handle):
return cls.gpg.encrypt_file(
file_handle,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
file_handle, recipients=None, passphrase=settings.PASSPHRASE, symmetric=True
).data
def move_documents_and_create_thumbnails(apps, schema_editor):
os.makedirs(os.path.join(settings.MEDIA_ROOT, "documents", "originals"), exist_ok=True)
os.makedirs(os.path.join(settings.MEDIA_ROOT, "documents", "thumbnails"), exist_ok=True)
os.makedirs(
os.path.join(settings.MEDIA_ROOT, "documents", "originals"), exist_ok=True
)
os.makedirs(
os.path.join(settings.MEDIA_ROOT, "documents", "thumbnails"), exist_ok=True
)
documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))
if set(documents) == {"originals", "thumbnails"}:
return
print(colourise(
"\n\n"
" This is a one-time only migration to generate thumbnails for all of your\n"
" documents so that future UIs will have something to work with. If you have\n"
" a lot of documents though, this may take a while, so a coffee break may be\n"
" in order."
"\n", opts=("bold",)
))
print(
colourise(
"\n\n"
" This is a one-time only migration to generate thumbnails for all of your\n"
" documents so that future UIs will have something to work with. If you have\n"
" a lot of documents though, this may take a while, so a coffee break may be\n"
" in order."
"\n",
opts=("bold",),
)
)
try:
os.makedirs(settings.SCRATCH_DIR)
@ -65,16 +68,16 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
if not f.endswith("gpg"):
continue
print(" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a thumbnail for", fg="white"),
colourise(f, fg="cyan")
))
print(
" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a thumbnail for", fg="white"),
colourise(f, fg="cyan"),
)
)
thumb_temp = tempfile.mkdtemp(
prefix="paperless", dir=settings.SCRATCH_DIR)
orig_temp = tempfile.mkdtemp(
prefix="paperless", dir=settings.SCRATCH_DIR)
thumb_temp = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)
orig_temp = tempfile.mkdtemp(prefix="paperless", dir=settings.SCRATCH_DIR)
orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f)
orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))
@ -83,20 +86,24 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
with open(orig_target, "wb") as unencrypted:
unencrypted.write(GnuPG.decrypted(encrypted))
subprocess.Popen((
settings.CONVERT_BINARY,
"-scale", "500x5000",
"-alpha", "remove",
orig_target,
os.path.join(thumb_temp, "convert-%04d.png")
)).wait()
subprocess.Popen(
(
settings.CONVERT_BINARY,
"-scale",
"500x5000",
"-alpha",
"remove",
orig_target,
os.path.join(thumb_temp, "convert-%04d.png"),
)
).wait()
thumb_source = os.path.join(thumb_temp, "convert-0000.png")
thumb_target = os.path.join(
settings.MEDIA_ROOT,
"documents",
"thumbnails",
re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f),
)
with open(thumb_source, "rb") as unencrypted:
with open(thumb_target, "wb") as encrypted:
@ -113,7 +120,7 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0011_auto_20160303_1929'),
("documents", "0011_auto_20160303_1929"),
]
operations = [

View File

@ -9,27 +9,36 @@ import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('documents', '0012_auto_20160305_0040'),
("documents", "0012_auto_20160305_0040"),
]
operations = [
migrations.AddField(
model_name='correspondent',
name='match',
model_name="correspondent",
name="match",
field=models.CharField(blank=True, max_length=256),
),
migrations.AddField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.'),
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.',
),
),
migrations.AlterField(
model_name='document',
name='created',
model_name="document",
name="created",
field=models.DateTimeField(default=django.utils.timezone.now),
),
migrations.RemoveField(
model_name='log',
name='component',
model_name="log",
name="component",
),
]

View File

@ -22,16 +22,12 @@ class GnuPG(object):
@classmethod
def decrypted(cls, file_handle):
return cls.gpg.decrypt_file(
file_handle, passphrase=settings.PASSPHRASE).data
return cls.gpg.decrypt_file(file_handle, passphrase=settings.PASSPHRASE).data
@classmethod
def encrypted(cls, file_handle):
return cls.gpg.encrypt_file(
file_handle,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
file_handle, recipients=None, passphrase=settings.PASSPHRASE, symmetric=True
).data
@ -53,8 +49,7 @@ class Document(object):
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
return "{}: {} - {}".format(created, self.correspondent, self.title)
if self.correspondent or self.title:
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
@ -65,7 +60,7 @@ class Document(object):
settings.MEDIA_ROOT,
"documents",
"originals",
"{:07}.{}.gpg".format(self.pk, self.file_type)
"{:07}.{}.gpg".format(self.pk, self.file_type),
)
@property
@ -84,38 +79,62 @@ def set_checksums(apps, schema_editor):
if not document_model.objects.all().exists():
return
print(colourise(
"\n\n"
" This is a one-time only migration to generate checksums for all\n"
" of your existing documents. If you have a lot of documents\n"
" though, this may take a while, so a coffee break may be in\n"
" order."
"\n", opts=("bold",)
))
print(
colourise(
"\n\n"
" This is a one-time only migration to generate checksums for all\n"
" of your existing documents. If you have a lot of documents\n"
" though, this may take a while, so a coffee break may be in\n"
" order."
"\n",
opts=("bold",),
)
)
sums = {}
for d in document_model.objects.all():
document = Document(d)
print(" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a checksum for", fg="white"),
colourise(document.file_name, fg="cyan")
))
print(
" {} {} {}".format(
colourise("*", fg="green"),
colourise("Generating a checksum for", fg="white"),
colourise(document.file_name, fg="cyan"),
)
)
with document.source_file as encrypted:
checksum = hashlib.md5(GnuPG.decrypted(encrypted)).hexdigest()
if checksum in sums:
error = "\n{line}{p1}\n\n{doc1}\n{doc2}\n\n{p2}\n\n{code}\n\n{p3}{line}".format(
p1=colourise("It appears that you have two identical documents in your collection and \nPaperless no longer supports this (see issue #97). The documents in question\nare:", fg="yellow"),
p2=colourise("To fix this problem, you'll have to remove one of them from the database, a task\nmost easily done by running the following command in the same\ndirectory as manage.py:", fg="yellow"),
p3=colourise("When that's finished, re-run the migrate, and provided that there aren't any\nother duplicates, you should be good to go.", fg="yellow"),
doc1=colourise(" * {} (id: {})".format(sums[checksum][1], sums[checksum][0]), fg="red"),
doc2=colourise(" * {} (id: {})".format(document.file_name, document.pk), fg="red"),
code=colourise(" $ echo 'DELETE FROM documents_document WHERE id = {pk};' | ./manage.py dbshell".format(pk=document.pk), fg="green"),
line=colourise("\n{}\n".format("=" * 80), fg="white", opts=("bold",))
p1=colourise(
"It appears that you have two identical documents in your collection and \nPaperless no longer supports this (see issue #97). The documents in question\nare:",
fg="yellow",
),
p2=colourise(
"To fix this problem, you'll have to remove one of them from the database, a task\nmost easily done by running the following command in the same\ndirectory as manage.py:",
fg="yellow",
),
p3=colourise(
"When that's finished, re-run the migrate, and provided that there aren't any\nother duplicates, you should be good to go.",
fg="yellow",
),
doc1=colourise(
" * {} (id: {})".format(sums[checksum][1], sums[checksum][0]),
fg="red",
),
doc2=colourise(
" * {} (id: {})".format(document.file_name, document.pk), fg="red"
),
code=colourise(
" $ echo 'DELETE FROM documents_document WHERE id = {pk};' | ./manage.py dbshell".format(
pk=document.pk
),
fg="green",
),
line=colourise("\n{}\n".format("=" * 80), fg="white", opts=("bold",)),
)
raise RuntimeError(error)
sums[checksum] = (document.pk, document.file_name)
@ -129,33 +148,35 @@ def do_nothing(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0013_auto_20160325_2111'),
("documents", "0013_auto_20160325_2111"),
]
operations = [
migrations.AddField(
model_name='document',
name='checksum',
model_name="document",
name="checksum",
field=models.CharField(
default='-',
default="-",
db_index=True,
editable=False,
max_length=32,
help_text='The checksum of the original document (before it '
'was encrypted). We use this to prevent duplicate '
'document imports.',
help_text="The checksum of the original document (before it "
"was encrypted). We use this to prevent duplicate "
"document imports.",
),
preserve_default=False,
),
migrations.RunPython(set_checksums, do_nothing),
migrations.AlterField(
model_name='document',
name='created',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
model_name="document",
name="created",
field=models.DateTimeField(
db_index=True, default=django.utils.timezone.now
),
),
migrations.AlterField(
model_name='document',
name='modified',
model_name="document",
name="modified",
field=models.DateTimeField(auto_now=True, db_index=True),
),
]

View File

@ -8,23 +8,28 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0014_document_checksum'),
("documents", "0014_document_checksum"),
]
operations = [
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.",
max_length=32,
unique=True,
),
),
migrations.AddField(
model_name='correspondent',
name='is_insensitive',
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True),
),
migrations.AddField(
model_name='tag',
name='is_insensitive',
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True),
),
]

View File

@ -9,13 +9,17 @@ from django.conf import settings
class Migration(migrations.Migration):
dependencies = [
('documents', '0015_add_insensitive_to_match'),
("documents", "0015_add_insensitive_to_match"),
]
operations = [
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]), help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
model_name="document",
name="content",
field=models.TextField(
blank=True,
db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]),
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
),
),
]

View File

@ -8,18 +8,38 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0016_auto_20170325_1558'),
("documents", "0016_auto_20170325_1558"),
]
operations = [
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.',
),
),
]

View File

@ -9,13 +9,19 @@ import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('documents', '0017_auto_20170512_0507'),
("documents", "0017_auto_20170512_0507"),
]
operations = [
migrations.AlterField(
model_name='document',
name='correspondent',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.Correspondent'),
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.Correspondent",
),
),
]

View File

@ -16,7 +16,7 @@ def reverse_func(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0018_auto_20170715_1712'),
("documents", "0018_auto_20170715_1712"),
]
operations = [

View File

@ -14,14 +14,16 @@ def set_added_time_to_created_time(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0019_add_consumer_user'),
("documents", "0019_add_consumer_user"),
]
operations = [
migrations.AddField(
model_name='document',
name='added',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False),
model_name="document",
name="added",
field=models.DateTimeField(
db_index=True, default=django.utils.timezone.now, editable=False
),
),
migrations.RunPython(set_added_time_to_created_time)
migrations.RunPython(set_added_time_to_created_time),
]

View File

@ -8,23 +8,36 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0020_document_added'),
("documents", "0020_document_added"),
]
operations = [
# Add the field with the default GPG-encrypted value
migrations.AddField(
model_name='document',
name='storage_type',
field=models.CharField(choices=[('unencrypted', 'Unencrypted'), ('gpg', 'Encrypted with GNU Privacy Guard')], default='gpg', editable=False, max_length=11),
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="gpg",
editable=False,
max_length=11,
),
),
# Now that the field is added, change the default to unencrypted
migrations.AlterField(
model_name='document',
name='storage_type',
field=models.CharField(choices=[('unencrypted', 'Unencrypted'), ('gpg', 'Encrypted with GNU Privacy Guard')], default='unencrypted', editable=False, max_length=11),
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="unencrypted",
editable=False,
max_length=11,
),
),
]

View File

@ -15,38 +15,47 @@ def re_slug_all_the_things(apps, schema_editor):
for klass in (Tag, Correspondent):
for instance in klass.objects.all():
klass.objects.filter(
pk=instance.pk
).update(
slug=slugify(instance.slug)
)
klass.objects.filter(pk=instance.pk).update(slug=slugify(instance.slug))
class Migration(migrations.Migration):
dependencies = [
('documents', '0021_document_storage_type'),
("documents", "0021_document_storage_type"),
]
operations = [
migrations.AlterModelOptions(
name='tag',
options={'ordering': ('name',)},
name="tag",
options={"ordering": ("name",)},
),
migrations.AlterField(
model_name='correspondent',
name='slug',
model_name="correspondent",
name="slug",
field=models.SlugField(blank=True, editable=False),
),
migrations.AlterField(
model_name='document',
name='file_type',
field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
model_name="document",
name="file_type",
field=models.CharField(
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
("txt", "TXT"),
("csv", "CSV"),
("md", "MD"),
],
editable=False,
max_length=4,
),
),
migrations.AlterField(
model_name='tag',
name='slug',
model_name="tag",
name="slug",
field=models.SlugField(blank=True, editable=False),
),
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop),
]

View File

@ -20,18 +20,20 @@ def set_filename(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0022_auto_20181007_1420'),
("documents", "0022_auto_20181007_1420"),
]
operations = [
migrations.AddField(
model_name='document',
name='filename',
field=models.FilePathField(default=None,
null=True,
editable=False,
help_text='Current filename in storage',
max_length=256),
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
null=True,
editable=False,
help_text="Current filename in storage",
max_length=256,
),
),
migrations.RunPython(set_filename)
migrations.RunPython(set_filename),
]

View File

@ -6,7 +6,7 @@ import django.db.models.deletion
def logs_set_default_group(apps, schema_editor):
Log = apps.get_model('documents', 'Log')
Log = apps.get_model("documents", "Log")
for log in Log.objects.all():
if log.group is None:
log.group = uuid.uuid4()
@ -16,70 +16,132 @@ def logs_set_default_group(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '0023_document_current_filename'),
("documents", "0023_document_current_filename"),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_serial_number',
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
model_name="document",
name="archive_serial_number",
field=models.IntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
),
),
migrations.AddField(
model_name='tag',
name='is_inbox_tag',
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
model_name="tag",
name="is_inbox_tag",
field=models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
),
),
migrations.CreateModel(
name='DocumentType',
name="DocumentType",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField(blank=True, editable=False)),
('match', models.CharField(blank=True, max_length=256)),
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
('is_insensitive', models.BooleanField(default=True)),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128, unique=True)),
("slug", models.SlugField(blank=True, editable=False)),
("match", models.CharField(blank=True, max_length=256)),
(
"matching_algorithm",
models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.',
),
),
("is_insensitive", models.BooleanField(default=True)),
],
options={
'abstract': False,
'ordering': ('name',),
"abstract": False,
"ordering": ("name",),
},
),
migrations.AddField(
model_name='document',
name='document_type',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype'),
model_name="document",
name="document_type",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.documenttype",
),
),
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any"),
(2, "All"),
(3, "Literal"),
(4, "Regular Expression"),
(5, "Fuzzy Match"),
(6, "Automatic Classification"),
],
default=1,
help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.',
),
),
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
model_name="document",
name="content",
field=models.TextField(
blank=True,
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
),
),
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-created',)},
name="log",
options={"ordering": ("-created",)},
),
migrations.RemoveField(
model_name='log',
name='modified',
model_name="log",
name="modified",
),
migrations.AlterField(
model_name='log',
name='group',
model_name="log",
name="group",
field=models.UUIDField(blank=True, null=True),
),
migrations.RunPython(
code=django.db.migrations.operations.special.RunPython.noop,
reverse_code=logs_set_default_group
reverse_code=logs_set_default_group,
),
]

View File

@ -7,22 +7,28 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule(
"documents.tasks.train_classifier",
name="Train the classifier",
schedule_type=Schedule.HOURLY,
)
schedule(
"documents.tasks.index_optimize",
name="Optimize the index",
schedule_type=Schedule.DAILY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
Schedule.objects.filter(func="documents.tasks.train_classifier").delete()
Schedule.objects.filter(func="documents.tasks.index_optimize").delete()
class Migration(migrations.Migration):
dependencies = [
('documents', '1000_update_paperless_all'),
('django_q', '0013_task_attempt_count'),
("documents", "1000_update_paperless_all"),
("django_q", "0013_task_attempt_count"),
]
operations = [
RunPython(add_schedules, remove_schedules)
]
operations = [RunPython(add_schedules, remove_schedules)]

View File

@ -6,13 +6,19 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1001_auto_20201109_1636'),
("documents", "1001_auto_20201109_1636"),
]
operations = [
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True),
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
),
),
]

View File

@ -20,10 +20,7 @@ def source_path(self):
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
def add_mime_types(apps, schema_editor):
@ -49,43 +46,51 @@ def add_file_extensions(apps, schema_editor):
documents = Document.objects.all()
for d in documents:
d.file_type = os.path.splitext(d.filename)[1].strip('.')
d.file_type = os.path.splitext(d.filename)[1].strip(".")
d.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1002_auto_20201111_1105'),
("documents", "1002_auto_20201111_1105"),
]
operations = [
migrations.AddField(
model_name='document',
name='mime_type',
model_name="document",
name="mime_type",
field=models.CharField(default="-", editable=False, max_length=256),
preserve_default=False,
),
migrations.RunPython(add_mime_types, migrations.RunPython.noop),
# This operation is here so that we can revert the entire migration:
# By allowing this field to be blank and null, we can revert the
# remove operation further down and the database won't complain about
# NOT NULL violations.
migrations.AlterField(
model_name='document',
name='file_type',
model_name="document",
name="file_type",
field=models.CharField(
choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')],
choices=[
("pdf", "PDF"),
("png", "PNG"),
("jpg", "JPG"),
("gif", "GIF"),
("tiff", "TIFF"),
("txt", "TXT"),
("csv", "CSV"),
("md", "MD"),
],
editable=False,
max_length=4,
null=True,
blank=True
blank=True,
),
),
migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
migrations.RemoveField(
model_name='document',
name='file_type',
model_name="document",
name="file_type",
),
]
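
The comment in the hunk above (keeping file_type nullable and blank so the later RemoveField can be reverted) reflects the pattern used throughout these files: every data migration pairs a forward callable with a reverse callable, or with migrations.RunPython.noop, so the whole chain stays reversible. A minimal sketch of that shape; the forward step mirrors the null-character cleanup migration further down in this commit, while the dependency name here is invented:

from django.db import migrations


def forward(apps, schema_editor):
    # Resolve the historical model through `apps`; importing the real model
    # class would break once later migrations change its definition.
    Document = apps.get_model("documents", "Document")
    for doc in Document.objects.all():
        if "\0" in doc.content:
            doc.content = doc.content.replace("\0", " ")
            doc.save()


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "0000_example_dependency"),  # hypothetical
    ]

    operations = [
        # Pairing forward with noop (or with a real reverse function) is what
        # lets this step be rolled back cleanly during a downgrade.
        migrations.RunPython(forward, migrations.RunPython.noop),
    ]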

View File

@ -7,20 +7,22 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)
schedule(
"documents.tasks.sanity_check",
name="Perform sanity check",
schedule_type=Schedule.WEEKLY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func='documents.tasks.sanity_check').delete()
Schedule.objects.filter(func="documents.tasks.sanity_check").delete()
class Migration(migrations.Migration):
dependencies = [
('documents', '1003_mime_types'),
('django_q', '0013_task_attempt_count'),
("documents", "1003_mime_types"),
("django_q", "0013_task_attempt_count"),
]
operations = [
RunPython(add_schedules, remove_schedules)
]
operations = [RunPython(add_schedules, remove_schedules)]

View File

@ -6,18 +6,29 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1004_sanity_check_schedule'),
("documents", "1004_sanity_check_schedule"),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_checksum',
field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=32,
null=True,
),
),
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
unique=True,
),
),
]

View File

@ -6,20 +6,20 @@ from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('documents', '1005_checksums'),
("documents", "1005_checksums"),
]
operations = [
migrations.RemoveField(
model_name='correspondent',
name='slug',
model_name="correspondent",
name="slug",
),
migrations.RemoveField(
model_name='documenttype',
name='slug',
model_name="documenttype",
name="slug",
),
migrations.RemoveField(
model_name='tag',
name='slug',
model_name="tag",
name="slug",
),
]

View File

@ -9,29 +9,82 @@ class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('documents', '1006_auto_20201208_2209'),
("documents", "1006_auto_20201208_2209"),
]
operations = [
migrations.CreateModel(
name='SavedView',
name="SavedView",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128)),
('show_on_dashboard', models.BooleanField()),
('show_in_sidebar', models.BooleanField()),
('sort_field', models.CharField(max_length=128)),
('sort_reverse', models.BooleanField(default=False)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(max_length=128)),
("show_on_dashboard", models.BooleanField()),
("show_in_sidebar", models.BooleanField()),
("sort_field", models.CharField(max_length=128)),
("sort_reverse", models.BooleanField(default=False)),
(
"user",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
),
migrations.CreateModel(
name='SavedViewFilterRule',
name="SavedViewFilterRule",
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('rule_type', models.PositiveIntegerField(choices=[(0, 'Title contains'), (1, 'Content contains'), (2, 'ASN is'), (3, 'Correspondent is'), (4, 'Document type is'), (5, 'Is in inbox'), (6, 'Has tag'), (7, 'Has any tag'), (8, 'Created before'), (9, 'Created after'), (10, 'Created year is'), (11, 'Created month is'), (12, 'Created day is'), (13, 'Added before'), (14, 'Added after'), (15, 'Modified before'), (16, 'Modified after'), (17, 'Does not have tag')])),
('value', models.CharField(max_length=128)),
('saved_view', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview')),
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"rule_type",
models.PositiveIntegerField(
choices=[
(0, "Title contains"),
(1, "Content contains"),
(2, "ASN is"),
(3, "Correspondent is"),
(4, "Document type is"),
(5, "Is in inbox"),
(6, "Has tag"),
(7, "Has any tag"),
(8, "Created before"),
(9, "Created after"),
(10, "Created year is"),
(11, "Created month is"),
(12, "Created day is"),
(13, "Added before"),
(14, "Added after"),
(15, "Modified before"),
(16, "Modified after"),
(17, "Does not have tag"),
]
),
),
("value", models.CharField(max_length=128)),
(
"saved_view",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="filter_rules",
to="documents.savedview",
),
),
],
),
]

View File

@ -7,28 +7,28 @@ import django.db.models.functions.text
class Migration(migrations.Migration):
dependencies = [
('documents', '1007_savedview_savedviewfilterrule'),
("documents", "1007_savedview_savedviewfilterrule"),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
name="correspondent",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name='document',
options={'ordering': ('-created',)},
name="document",
options={"ordering": ("-created",)},
),
migrations.AlterModelOptions(
name='documenttype',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
name="documenttype",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
name="savedview",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
migrations.AlterModelOptions(
name='tag',
options={'ordering': (django.db.models.functions.text.Lower('name'),)},
name="tag",
options={"ordering": (django.db.models.functions.text.Lower("name"),)},
),
]

View File

@ -6,24 +6,24 @@ from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('documents', '1008_auto_20201216_1736'),
("documents", "1008_auto_20201216_1736"),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': ('name',)},
name="correspondent",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name='documenttype',
options={'ordering': ('name',)},
name="documenttype",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': ('name',)},
name="savedview",
options={"ordering": ("name",)},
),
migrations.AlterModelOptions(
name='tag',
options={'ordering': ('name',)},
name="tag",
options={"ordering": ("name",)},
),
]

View File

@ -6,13 +6,13 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1009_auto_20201216_2005'),
("documents", "1009_auto_20201216_2005"),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
model_name="savedviewfilterrule",
name="value",
field=models.CharField(blank=True, max_length=128, null=True),
),
]

View File

@ -10,241 +10,433 @@ class Migration(migrations.Migration):
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('documents', '1010_auto_20210101_2159'),
("documents", "1010_auto_20210101_2159"),
]
operations = [
migrations.AlterModelOptions(
name='correspondent',
options={'ordering': ('name',), 'verbose_name': 'correspondent', 'verbose_name_plural': 'correspondents'},
name="correspondent",
options={
"ordering": ("name",),
"verbose_name": "correspondent",
"verbose_name_plural": "correspondents",
},
),
migrations.AlterModelOptions(
name='document',
options={'ordering': ('-created',), 'verbose_name': 'document', 'verbose_name_plural': 'documents'},
name="document",
options={
"ordering": ("-created",),
"verbose_name": "document",
"verbose_name_plural": "documents",
},
),
migrations.AlterModelOptions(
name='documenttype',
options={'verbose_name': 'document type', 'verbose_name_plural': 'document types'},
name="documenttype",
options={
"verbose_name": "document type",
"verbose_name_plural": "document types",
},
),
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-created',), 'verbose_name': 'log', 'verbose_name_plural': 'logs'},
name="log",
options={
"ordering": ("-created",),
"verbose_name": "log",
"verbose_name_plural": "logs",
},
),
migrations.AlterModelOptions(
name='savedview',
options={'ordering': ('name',), 'verbose_name': 'saved view', 'verbose_name_plural': 'saved views'},
name="savedview",
options={
"ordering": ("name",),
"verbose_name": "saved view",
"verbose_name_plural": "saved views",
},
),
migrations.AlterModelOptions(
name='savedviewfilterrule',
options={'verbose_name': 'filter rule', 'verbose_name_plural': 'filter rules'},
name="savedviewfilterrule",
options={
"verbose_name": "filter rule",
"verbose_name_plural": "filter rules",
},
),
migrations.AlterModelOptions(
name='tag',
options={'verbose_name': 'tag', 'verbose_name_plural': 'tags'},
name="tag",
options={"verbose_name": "tag", "verbose_name_plural": "tags"},
),
migrations.AlterField(
model_name='correspondent',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
model_name="correspondent",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name='correspondent',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
model_name="correspondent",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
model_name="correspondent",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name='correspondent',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
model_name="correspondent",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name='document',
name='added',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, editable=False, verbose_name='added'),
model_name="document",
name="added",
field=models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
editable=False,
verbose_name="added",
),
),
migrations.AlterField(
model_name='document',
name='archive_checksum',
field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True, verbose_name='archive checksum'),
model_name="document",
name="archive_checksum",
field=models.CharField(
blank=True,
editable=False,
help_text="The checksum of the archived document.",
max_length=32,
null=True,
verbose_name="archive checksum",
),
),
migrations.AlterField(
model_name='document',
name='archive_serial_number',
field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True, verbose_name='archive serial number'),
model_name="document",
name="archive_serial_number",
field=models.IntegerField(
blank=True,
db_index=True,
help_text="The position of this document in your physical document archive.",
null=True,
unique=True,
verbose_name="archive serial number",
),
),
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True, verbose_name='checksum'),
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
help_text="The checksum of the original document.",
max_length=32,
unique=True,
verbose_name="checksum",
),
),
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.', verbose_name='content'),
model_name="document",
name="content",
field=models.TextField(
blank=True,
help_text="The raw, text-only data of the document. This field is primarily used for searching.",
verbose_name="content",
),
),
migrations.AlterField(
model_name='document',
name='correspondent',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.correspondent', verbose_name='correspondent'),
model_name="document",
name="correspondent",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.correspondent",
verbose_name="correspondent",
),
),
migrations.AlterField(
model_name='document',
name='created',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, verbose_name='created'),
model_name="document",
name="created",
field=models.DateTimeField(
db_index=True, default=django.utils.timezone.now, verbose_name="created"
),
),
migrations.AlterField(
model_name='document',
name='document_type',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype', verbose_name='document type'),
model_name="document",
name="document_type",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="documents",
to="documents.documenttype",
verbose_name="document type",
),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, verbose_name='filename'),
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
verbose_name="filename",
),
),
migrations.AlterField(
model_name='document',
name='mime_type',
field=models.CharField(editable=False, max_length=256, verbose_name='mime type'),
model_name="document",
name="mime_type",
field=models.CharField(
editable=False, max_length=256, verbose_name="mime type"
),
),
migrations.AlterField(
model_name='document',
name='modified',
field=models.DateTimeField(auto_now=True, db_index=True, verbose_name='modified'),
model_name="document",
name="modified",
field=models.DateTimeField(
auto_now=True, db_index=True, verbose_name="modified"
),
),
migrations.AlterField(
model_name='document',
name='storage_type',
field=models.CharField(choices=[('unencrypted', 'Unencrypted'), ('gpg', 'Encrypted with GNU Privacy Guard')], default='unencrypted', editable=False, max_length=11, verbose_name='storage type'),
model_name="document",
name="storage_type",
field=models.CharField(
choices=[
("unencrypted", "Unencrypted"),
("gpg", "Encrypted with GNU Privacy Guard"),
],
default="unencrypted",
editable=False,
max_length=11,
verbose_name="storage type",
),
),
migrations.AlterField(
model_name='document',
name='tags',
field=models.ManyToManyField(blank=True, related_name='documents', to='documents.Tag', verbose_name='tags'),
model_name="document",
name="tags",
field=models.ManyToManyField(
blank=True,
related_name="documents",
to="documents.Tag",
verbose_name="tags",
),
),
migrations.AlterField(
model_name='document',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=128, verbose_name='title'),
model_name="document",
name="title",
field=models.CharField(
blank=True, db_index=True, max_length=128, verbose_name="title"
),
),
migrations.AlterField(
model_name='documenttype',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
model_name="documenttype",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name='documenttype',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
model_name="documenttype",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name='documenttype',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
model_name="documenttype",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name='documenttype',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
model_name="documenttype",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
migrations.AlterField(
model_name='log',
name='created',
field=models.DateTimeField(auto_now_add=True, verbose_name='created'),
model_name="log",
name="created",
field=models.DateTimeField(auto_now_add=True, verbose_name="created"),
),
migrations.AlterField(
model_name='log',
name='group',
field=models.UUIDField(blank=True, null=True, verbose_name='group'),
model_name="log",
name="group",
field=models.UUIDField(blank=True, null=True, verbose_name="group"),
),
migrations.AlterField(
model_name='log',
name='level',
field=models.PositiveIntegerField(choices=[(10, 'debug'), (20, 'information'), (30, 'warning'), (40, 'error'), (50, 'critical')], default=20, verbose_name='level'),
model_name="log",
name="level",
field=models.PositiveIntegerField(
choices=[
(10, "debug"),
(20, "information"),
(30, "warning"),
(40, "error"),
(50, "critical"),
],
default=20,
verbose_name="level",
),
),
migrations.AlterField(
model_name='log',
name='message',
field=models.TextField(verbose_name='message'),
model_name="log",
name="message",
field=models.TextField(verbose_name="message"),
),
migrations.AlterField(
model_name='savedview',
name='name',
field=models.CharField(max_length=128, verbose_name='name'),
model_name="savedview",
name="name",
field=models.CharField(max_length=128, verbose_name="name"),
),
migrations.AlterField(
model_name='savedview',
name='show_in_sidebar',
field=models.BooleanField(verbose_name='show in sidebar'),
model_name="savedview",
name="show_in_sidebar",
field=models.BooleanField(verbose_name="show in sidebar"),
),
migrations.AlterField(
model_name='savedview',
name='show_on_dashboard',
field=models.BooleanField(verbose_name='show on dashboard'),
model_name="savedview",
name="show_on_dashboard",
field=models.BooleanField(verbose_name="show on dashboard"),
),
migrations.AlterField(
model_name='savedview',
name='sort_field',
field=models.CharField(max_length=128, verbose_name='sort field'),
model_name="savedview",
name="sort_field",
field=models.CharField(max_length=128, verbose_name="sort field"),
),
migrations.AlterField(
model_name='savedview',
name='sort_reverse',
field=models.BooleanField(default=False, verbose_name='sort reverse'),
model_name="savedview",
name="sort_reverse",
field=models.BooleanField(default=False, verbose_name="sort reverse"),
),
migrations.AlterField(
model_name='savedview',
name='user',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL, verbose_name='user'),
model_name="savedview",
name="user",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag')], verbose_name='rule type'),
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
],
verbose_name="rule type",
),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='saved_view',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='filter_rules', to='documents.savedview', verbose_name='saved view'),
model_name="savedviewfilterrule",
name="saved_view",
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="filter_rules",
to="documents.savedview",
verbose_name="saved view",
),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
field=models.CharField(blank=True, max_length=128, null=True, verbose_name='value'),
model_name="savedviewfilterrule",
name="value",
field=models.CharField(
blank=True, max_length=128, null=True, verbose_name="value"
),
),
migrations.AlterField(
model_name='tag',
name='colour',
field=models.PositiveIntegerField(choices=[(1, '#a6cee3'), (2, '#1f78b4'), (3, '#b2df8a'), (4, '#33a02c'), (5, '#fb9a99'), (6, '#e31a1c'), (7, '#fdbf6f'), (8, '#ff7f00'), (9, '#cab2d6'), (10, '#6a3d9a'), (11, '#b15928'), (12, '#000000'), (13, '#cccccc')], default=1, verbose_name='color'),
model_name="tag",
name="colour",
field=models.PositiveIntegerField(
choices=[
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc"),
],
default=1,
verbose_name="color",
),
),
migrations.AlterField(
model_name='tag',
name='is_inbox_tag',
field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.', verbose_name='is inbox tag'),
model_name="tag",
name="is_inbox_tag",
field=models.BooleanField(
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.",
verbose_name="is inbox tag",
),
),
migrations.AlterField(
model_name='tag',
name='is_insensitive',
field=models.BooleanField(default=True, verbose_name='is insensitive'),
model_name="tag",
name="is_insensitive",
field=models.BooleanField(default=True, verbose_name="is insensitive"),
),
migrations.AlterField(
model_name='tag',
name='match',
field=models.CharField(blank=True, max_length=256, verbose_name='match'),
model_name="tag",
name="match",
field=models.CharField(blank=True, max_length=256, verbose_name="match"),
),
migrations.AlterField(
model_name='tag',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any word'), (2, 'All words'), (3, 'Exact match'), (4, 'Regular expression'), (5, 'Fuzzy word'), (6, 'Automatic')], default=1, verbose_name='matching algorithm'),
model_name="tag",
name="matching_algorithm",
field=models.PositiveIntegerField(
choices=[
(1, "Any word"),
(2, "All words"),
(3, "Exact match"),
(4, "Regular expression"),
(5, "Fuzzy word"),
(6, "Automatic"),
],
default=1,
verbose_name="matching algorithm",
),
),
migrations.AlterField(
model_name='tag',
name='name',
field=models.CharField(max_length=128, unique=True, verbose_name='name'),
model_name="tag",
name="name",
field=models.CharField(max_length=128, unique=True, verbose_name="name"),
),
]
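
This migration records the verbose_name labels that the models.py hunk near the end of this commit wraps in gettext_lazy, which is what makes field and model names translatable in the Django admin and anywhere the model metadata is displayed. A small sketch of that pattern, assuming it lives in an installed app's models.py (a model class like this will not import outside a configured Django project):

from django.db import models
from django.utils.translation import gettext_lazy as _


class ExampleTag(models.Model):
    # The first positional argument of most field types is verbose_name;
    # gettext_lazy defers the actual translation until the label is rendered.
    name = models.CharField(_("name"), max_length=128, unique=True)
    is_insensitive = models.BooleanField(_("is insensitive"), default=True)

    class Meta:
        verbose_name = _("tag")
        verbose_name_plural = _("tags")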

View File

@ -20,6 +20,7 @@ logger = logging.getLogger("paperless.migrations")
# This is code copied straight paperless before the change.
###############################################################################
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
@ -30,10 +31,7 @@ def archive_path_old(doc):
else:
fname = "{:07}.pdf".format(doc.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
return os.path.join(settings.ARCHIVE_DIR, fname)
STORAGE_TYPE_GPG = "gpg"
@ -41,10 +39,7 @@ STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
return os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename))
else:
return None
@ -57,10 +52,7 @@ def source_path(doc):
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
def generate_unique_filename(doc, archive_filename=False):
@ -75,7 +67,8 @@ def generate_unique_filename(doc, archive_filename=False):
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
doc, counter, archive_filename=archive_filename
)
if new_filename == old_filename:
# still the same as before.
return new_filename
@ -91,14 +84,11 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
",".join(sorted([tag.name for tag in doc.tags.all()])),
replacement_text="-",
)
if doc.correspondent:
@ -116,20 +106,21 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
document_type = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_month=f"{doc.created.month:02}"
if doc.created
else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list
tag_list=tag_list,
).strip()
path = path.strip(os.sep)
@ -137,7 +128,8 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default"
)
counter_str = f"_{counter:02}" if counter else ""
@ -166,29 +158,29 @@ def parse_wrapper(parser, path, mime_type, file_name):
def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \
ParseError
logger.info(
f"Regenerating archive document for document ID:{doc.id}"
from documents.parsers import (
get_parser_class_for_mime_type,
DocumentParser,
ParseError,
)
logger.info(f"Regenerating archive document for document ID:{doc.id}")
parser_class = get_parser_class_for_mime_type(doc.mime_type)
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
parse_wrapper(parser, source_path(doc), doc.mime_type,
os.path.basename(doc.filename))
parse_wrapper(
parser, source_path(doc), doc.mime_type, os.path.basename(doc.filename)
)
doc.content = parser.get_text()
if parser.get_archive_path() and os.path.isfile(
parser.get_archive_path()):
if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
doc.archive_filename = generate_unique_filename(
doc, archive_filename=True)
doc, archive_filename=True
)
with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
os.makedirs(os.path.dirname(archive_path_new(doc)),
exist_ok=True)
os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
@ -241,8 +233,8 @@ def move_old_to_new_locations(apps, schema_editor):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
f"Archived document ID:{doc.id} does not exist at: " f"{old_path}"
)
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
@ -253,7 +245,8 @@ def move_old_to_new_locations(apps, schema_editor):
if not parser_class:
raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
f"but no parsers are available. Cannot migrate."
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
@ -261,9 +254,7 @@ def move_old_to_new_locations(apps, schema_editor):
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
logger.debug(
f"Removing {old_path}"
)
logger.debug(f"Removing {old_path}")
os.unlink(old_path)
else:
# Set archive path for unaffected files
@ -290,7 +281,8 @@ def move_new_to_old_locations(apps, schema_editor):
raise ValueError(
f"Cannot migrate: Archive file name {old_archive_path} of "
f"document {doc.filename} would clash with another archive "
f"filename.")
f"filename."
)
old_archive_paths.add(old_archive_path)
if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
raise ValueError(
@ -309,22 +301,35 @@ def move_new_to_old_locations(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '1011_auto_20210101_2340'),
("documents", "1011_auto_20210101_2340"),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_filename',
field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
model_name="document",
name="archive_filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current archive filename in storage",
max_length=1024,
null=True,
unique=True,
verbose_name="archive filename",
),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
model_name="document",
name="filename",
field=models.FilePathField(
default=None,
editable=False,
help_text="Current filename in storage",
max_length=1024,
null=True,
unique=True,
verbose_name="filename",
),
),
migrations.RunPython(move_old_to_new_locations, move_new_to_old_locations),
]
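
The generate_filename hunks earlier in this file show how PAPERLESS_FILENAME_FORMAT is expanded with str.format placeholders such as {title}, {correspondent}, {created_year} and {tag_list}, then stripped of leading separators; on a bad template the code logs a warning and falls back to the default name. A stand-alone sketch of that expansion with an invented template and values (the real code also sanitizes every component with pathvalidate):

import datetime

# Hypothetical template in the style of PAPERLESS_FILENAME_FORMAT.
FILENAME_FORMAT = "{created_year}/{correspondent}/{title}"

created = datetime.date(2021, 3, 14)
path = FILENAME_FORMAT.format(
    title="quarterly-report",
    correspondent="acme",
    created_year=created.year if created else "none",
).strip("/")

print(path)  # -> 2021/acme/quarterly-report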

View File

@ -20,7 +20,7 @@ COLOURS_OLD = {
def forward(apps, schema_editor):
Tag = apps.get_model('documents', 'Tag')
Tag = apps.get_model("documents", "Tag")
for tag in Tag.objects.all():
colour_old_id = tag.colour_old
@ -30,7 +30,7 @@ def forward(apps, schema_editor):
def reverse(apps, schema_editor):
Tag = apps.get_model('documents', 'Tag')
Tag = apps.get_model("documents", "Tag")
def _get_colour_id(rdb):
for idx, rdbx in COLOURS_OLD.items():
@ -48,23 +48,25 @@ def reverse(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '1012_fix_archive_files'),
("documents", "1012_fix_archive_files"),
]
operations = [
migrations.RenameField(
model_name='tag',
old_name='colour',
new_name='colour_old',
model_name="tag",
old_name="colour",
new_name="colour_old",
),
migrations.AddField(
model_name='tag',
name='color',
field=models.CharField(default='#a6cee3', max_length=7, verbose_name='color'),
model_name="tag",
name="color",
field=models.CharField(
default="#a6cee3", max_length=7, verbose_name="color"
),
),
migrations.RunPython(forward, reverse),
migrations.RemoveField(
model_name='tag',
name='colour_old',
)
model_name="tag",
name="colour_old",
),
]

View File

@ -6,13 +6,37 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1013_migrate_tag_colour'),
("documents", "1013_migrate_tag_colour"),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains')], verbose_name='rule type'),
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
],
verbose_name="rule type",
),
),
]

View File

@ -8,20 +8,20 @@ logger = logging.getLogger("paperless.migrations")
def remove_null_characters(apps, schema_editor):
Document = apps.get_model('documents', 'Document')
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
content: str = doc.content
if '\0' in content:
if "\0" in content:
logger.info(f"Removing null characters from document {doc}...")
doc.content = content.replace('\0', ' ')
doc.content = content.replace("\0", " ")
doc.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1014_auto_20210228_1614'),
("documents", "1014_auto_20210228_1614"),
]
operations = [

View File

@ -6,18 +6,46 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1015_remove_null_characters'),
("documents", "1015_remove_null_characters"),
]
operations = [
migrations.AlterField(
model_name='savedview',
name='sort_field',
field=models.CharField(blank=True, max_length=128, null=True, verbose_name='sort field'),
model_name="savedview",
name="sort_field",
field=models.CharField(
blank=True, max_length=128, null=True, verbose_name="sort field"
),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains'), (20, 'fulltext query'), (21, 'more like this')], verbose_name='rule type'),
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
],
verbose_name="rule type",
),
),
]

View File

@ -37,23 +37,15 @@ class MatchingModel(models.Model):
(MATCH_AUTO, _("Automatic")),
)
name = models.CharField(
_("name"),
max_length=128, unique=True)
name = models.CharField(_("name"), max_length=128, unique=True)
match = models.CharField(
_("match"),
max_length=256, blank=True)
match = models.CharField(_("match"), max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
_("matching algorithm"),
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY
_("matching algorithm"), choices=MATCHING_ALGORITHMS, default=MATCH_ANY
)
is_insensitive = models.BooleanField(
_("is insensitive"),
default=True)
is_insensitive = models.BooleanField(_("is insensitive"), default=True)
class Meta:
abstract = True
@ -64,7 +56,6 @@ class MatchingModel(models.Model):
class Correspondent(MatchingModel):
class Meta:
ordering = ("name",)
verbose_name = _("correspondent")
@ -73,17 +64,15 @@ class Correspondent(MatchingModel):
class Tag(MatchingModel):
color = models.CharField(
_("color"),
max_length=7,
default="#a6cee3"
)
color = models.CharField(_("color"), max_length=7, default="#a6cee3")
is_inbox_tag = models.BooleanField(
_("is inbox tag"),
default=False,
help_text=_("Marks this tag as an inbox tag: All newly consumed "
"documents will be tagged with inbox tags.")
help_text=_(
"Marks this tag as an inbox tag: All newly consumed "
"documents will be tagged with inbox tags."
),
)
class Meta:
@ -92,7 +81,6 @@ class Tag(MatchingModel):
class DocumentType(MatchingModel):
class Meta:
verbose_name = _("document type")
verbose_name_plural = _("document types")
@ -104,7 +92,7 @@ class Document(models.Model):
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (
(STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")),
(STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard"))
(STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard")),
)
correspondent = models.ForeignKey(
@ -113,12 +101,10 @@ class Document(models.Model):
null=True,
related_name="documents",
on_delete=models.SET_NULL,
verbose_name=_("correspondent")
verbose_name=_("correspondent"),
)
title = models.CharField(
_("title"),
max_length=128, blank=True, db_index=True)
title = models.CharField(_("title"), max_length=128, blank=True, db_index=True)
document_type = models.ForeignKey(
DocumentType,
@ -126,25 +112,22 @@ class Document(models.Model):
null=True,
related_name="documents",
on_delete=models.SET_NULL,
verbose_name=_("document type")
verbose_name=_("document type"),
)
content = models.TextField(
_("content"),
blank=True,
help_text=_("The raw, text-only data of the document. This field is "
"primarily used for searching.")
help_text=_(
"The raw, text-only data of the document. This field is "
"primarily used for searching."
),
)
mime_type = models.CharField(
_("mime type"),
max_length=256,
editable=False
)
mime_type = models.CharField(_("mime type"), max_length=256, editable=False)
tags = models.ManyToManyField(
Tag, related_name="documents", blank=True,
verbose_name=_("tags")
Tag, related_name="documents", blank=True, verbose_name=_("tags")
)
checksum = models.CharField(
@ -152,7 +135,7 @@ class Document(models.Model):
max_length=32,
editable=False,
unique=True,
help_text=_("The checksum of the original document.")
help_text=_("The checksum of the original document."),
)
archive_checksum = models.CharField(
@ -161,28 +144,26 @@ class Document(models.Model):
editable=False,
blank=True,
null=True,
help_text=_("The checksum of the archived document.")
help_text=_("The checksum of the archived document."),
)
created = models.DateTimeField(
_("created"),
default=timezone.now, db_index=True)
created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)
modified = models.DateTimeField(
_("modified"),
auto_now=True, editable=False, db_index=True)
_("modified"), auto_now=True, editable=False, db_index=True
)
storage_type = models.CharField(
_("storage type"),
max_length=11,
choices=STORAGE_TYPES,
default=STORAGE_TYPE_UNENCRYPTED,
editable=False
editable=False,
)
added = models.DateTimeField(
_("added"),
default=timezone.now, editable=False, db_index=True)
_("added"), default=timezone.now, editable=False, db_index=True
)
filename = models.FilePathField(
_("filename"),
@ -191,7 +172,7 @@ class Document(models.Model):
default=None,
unique=True,
null=True,
help_text=_("Current filename in storage")
help_text=_("Current filename in storage"),
)
archive_filename = models.FilePathField(
@ -201,7 +182,7 @@ class Document(models.Model):
default=None,
unique=True,
null=True,
help_text=_("Current archive filename in storage")
help_text=_("Current archive filename in storage"),
)
archive_serial_number = models.IntegerField(
@ -210,8 +191,9 @@ class Document(models.Model):
null=True,
unique=True,
db_index=True,
help_text=_("The position of this document in your physical document "
"archive.")
help_text=_(
"The position of this document in your physical document " "archive."
),
)
class Meta:
@ -238,10 +220,7 @@ class Document(models.Model):
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
@property
def source_file(self):
@ -254,10 +233,7 @@ class Document(models.Model):
@property
def archive_path(self):
if self.has_archive_version:
return os.path.join(
settings.ARCHIVE_DIR,
str(self.archive_filename)
)
return os.path.join(settings.ARCHIVE_DIR, str(self.archive_filename))
else:
return None
@ -291,10 +267,7 @@ class Document(models.Model):
if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.THUMBNAIL_DIR,
file_name
)
return os.path.join(settings.THUMBNAIL_DIR, file_name)
@property
def thumbnail_file(self):
@ -311,15 +284,13 @@ class Log(models.Model):
(logging.CRITICAL, _("critical")),
)
group = models.UUIDField(
_("group"),
blank=True, null=True)
group = models.UUIDField(_("group"), blank=True, null=True)
message = models.TextField(_("message"))
level = models.PositiveIntegerField(
_("level"),
choices=LEVELS, default=logging.INFO)
_("level"), choices=LEVELS, default=logging.INFO
)
created = models.DateTimeField(_("created"), auto_now_add=True)
@ -333,18 +304,14 @@ class Log(models.Model):
class SavedView(models.Model):
class Meta:
ordering = ("name",)
verbose_name = _("saved view")
verbose_name_plural = _("saved views")
user = models.ForeignKey(User, on_delete=models.CASCADE,
verbose_name=_("user"))
name = models.CharField(
_("name"),
max_length=128)
user = models.ForeignKey(User, on_delete=models.CASCADE, verbose_name=_("user"))
name = models.CharField(_("name"), max_length=128)
show_on_dashboard = models.BooleanField(
_("show on dashboard"),
@ -354,14 +321,9 @@ class SavedView(models.Model):
)
sort_field = models.CharField(
_("sort field"),
max_length=128,
null=True,
blank=True
_("sort field"), max_length=128, null=True, blank=True
)
sort_reverse = models.BooleanField(
_("sort reverse"),
default=False)
sort_reverse = models.BooleanField(_("sort reverse"), default=False)
class SavedViewFilterRule(models.Model):
@ -388,25 +350,19 @@ class SavedViewFilterRule(models.Model):
(19, _("title or content contains")),
(20, _("fulltext query")),
(21, _("more like this")),
(22, _("has tags in"))
(22, _("has tags in")),
]
saved_view = models.ForeignKey(
SavedView,
on_delete=models.CASCADE,
related_name="filter_rules",
verbose_name=_("saved view")
verbose_name=_("saved view"),
)
rule_type = models.PositiveIntegerField(
_("rule type"),
choices=RULE_TYPES)
rule_type = models.PositiveIntegerField(_("rule type"), choices=RULE_TYPES)
value = models.CharField(
_("value"),
max_length=128,
blank=True,
null=True)
value = models.CharField(_("value"), max_length=128, blank=True, null=True)
class Meta:
verbose_name = _("filter rule")
@ -416,20 +372,23 @@ class SavedViewFilterRule(models.Model):
# TODO: why is this in the models file?
class FileInfo:
REGEXES = OrderedDict([
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)$",
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)$",
flags=re.IGNORECASE
))
])
REGEXES = OrderedDict(
[
(
"created-title",
re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)$",
flags=re.IGNORECASE,
),
),
("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)),
]
)
def __init__(self, created=None, correspondent=None, title=None, tags=(),
extension=None):
def __init__(
self, created=None, correspondent=None, title=None, tags=(), extension=None
):
self.created = created
self.title = title
@ -451,9 +410,7 @@ class FileInfo:
@classmethod
def _mangle_property(cls, properties, name):
if name in properties:
properties[name] = getattr(cls, "_get_{}".format(name))(
properties[name]
)
properties[name] = getattr(cls, "_get_{}".format(name))(properties[name])
@classmethod
def from_filename(cls, filename):

View File

@ -27,11 +27,11 @@ from documents.signals import document_consumer_declaration
# TODO: isnt there a date parsing library for this?
DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|" # NOQA: E501
r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # NOQA: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # NOQA: E501
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))"
)
@ -93,8 +93,7 @@ def get_parser_class_for_mime_type(mime_type):
return None
# Return the parser with the highest weight.
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def get_parser_class(path):
@ -107,18 +106,20 @@ def get_parser_class(path):
return get_parser_class_for_mime_type(mime_type)
def run_convert(input_file,
output_file,
density=None,
scale=None,
alpha=None,
strip=False,
trim=False,
type=None,
depth=None,
auto_orient=False,
extra=None,
logging_group=None):
def run_convert(
input_file,
output_file,
density=None,
scale=None,
alpha=None,
strip=False,
trim=False,
type=None,
depth=None,
auto_orient=False,
extra=None,
logging_group=None,
):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
@ -127,17 +128,17 @@ def run_convert(input_file,
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
args = [settings.CONVERT_BINARY]
args += ['-density', str(density)] if density else []
args += ['-scale', str(scale)] if scale else []
args += ['-alpha', str(alpha)] if alpha else []
args += ['-strip'] if strip else []
args += ['-trim'] if trim else []
args += ['-type', str(type)] if type else []
args += ['-depth', str(depth)] if depth else []
args += ['-auto-orient'] if auto_orient else []
args += ["-density", str(density)] if density else []
args += ["-scale", str(scale)] if scale else []
args += ["-alpha", str(alpha)] if alpha else []
args += ["-strip"] if strip else []
args += ["-trim"] if trim else []
args += ["-type", str(type)] if type else []
args += ["-depth", str(depth)] if depth else []
args += ["-auto-orient"] if auto_orient else []
args += [input_file, output_file]
logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
if not subprocess.Popen(args, env=environment).wait() == 0:
raise ParseError("Convert failed at {}".format(args))
@ -155,27 +156,25 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
extra={"group": logging_group},
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
try:
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group,
)
return out_path
@ -191,18 +190,19 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(in_path),
output_file=out_path,
logging_group=logging_group)
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(in_path),
output_file=out_path,
logging_group=logging_group,
)
except ParseError:
out_path = make_thumbnail_from_pdf_gs_fallback(
in_path, temp_dir, logging_group)
out_path = make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group)
return out_path
@ -223,15 +223,17 @@ def parse_date(filename, text):
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True
}
"RETURN_AS_TIMEZONE_AWARE": True,
},
)
def __filter(date):
if date and date.year > 1900 and \
date <= timezone.now() and \
date.date() not in settings.IGNORE_DATES:
if (
date
and date.year > 1900
and date <= timezone.now()
and date.date() not in settings.IGNORE_DATES
):
return date
return None
@ -285,8 +287,7 @@ class DocumentParser(LoggingMixin):
super().__init__()
self.logging_group = logging_group
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(
prefix="paperless-", dir=settings.SCRATCH_DIR)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.archive_path = None
self.text = None
@ -312,18 +313,21 @@ class DocumentParser(LoggingMixin):
"""
raise NotImplementedError()
def get_optimised_thumbnail(self,
document_path,
mime_type,
file_name=None):
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
args = (settings.OPTIPNG_BINARY,
"-silent", "-o5", thumbnail, "-out", out_path)
args = (
settings.OPTIPNG_BINARY,
"-silent",
"-o5",
thumbnail,
"-out",
out_path,
)
self.log('debug', f"Execute: {' '.join(args)}")
self.log("debug", f"Execute: {' '.join(args)}")
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))

View File

@ -9,7 +9,6 @@ from documents.models import Document
class SanityCheckMessages:
def __init__(self):
self._messages = []
@ -29,7 +28,7 @@ class SanityCheckMessages:
logger.info("Sanity checker detected no issues.")
else:
for msg in self._messages:
logger.log(msg['level'], msg['message'])
logger.log(msg["level"], msg["message"])
def __len__(self):
return len(self._messages)
@ -38,10 +37,10 @@ class SanityCheckMessages:
return self._messages[item]
def has_error(self):
return any([msg['level'] == logging.ERROR for msg in self._messages])
return any([msg["level"] == logging.ERROR for msg in self._messages])
def has_warning(self):
return any([msg['level'] == logging.WARNING for msg in self._messages])
return any([msg["level"] == logging.WARNING for msg in self._messages])
class SanityCheckFailedException(Exception):
@ -71,9 +70,7 @@ def check_sanity(progress=False):
with doc.thumbnail_file as f:
f.read()
except OSError as e:
messages.error(
f"Cannot read thumbnail file of document {doc.pk}: {e}"
)
messages.error(f"Cannot read thumbnail file of document {doc.pk}: {e}")
# Check sanity of the original file
# TODO: extract method
@ -86,8 +83,7 @@ def check_sanity(progress=False):
with doc.source_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.error(
f"Cannot read original file of document {doc.pk}: {e}")
messages.error(f"Cannot read original file of document {doc.pk}: {e}")
else:
if not checksum == doc.checksum:
messages.error(
@ -108,9 +104,7 @@ def check_sanity(progress=False):
)
elif doc.has_archive_version:
if not os.path.isfile(doc.archive_path):
messages.error(
f"Archived version of document {doc.pk} does not exist."
)
messages.error(f"Archived version of document {doc.pk} does not exist.")
else:
if os.path.normpath(doc.archive_path) in present_files:
present_files.remove(os.path.normpath(doc.archive_path))

View File

@ -7,8 +7,15 @@ from rest_framework import serializers
from rest_framework.fields import SerializerMethodField
from . import bulk_edit
from .models import Correspondent, Tag, Document, DocumentType, \
SavedView, SavedViewFilterRule, MatchingModel
from .models import (
Correspondent,
Tag,
Document,
DocumentType,
SavedView,
SavedViewFilterRule,
MatchingModel,
)
from .parsers import is_mime_type_supported
from django.utils.translation import gettext as _
@ -23,7 +30,7 @@ class DynamicFieldsModelSerializer(serializers.ModelSerializer):
def __init__(self, *args, **kwargs):
# Don't pass the 'fields' arg up to the superclass
fields = kwargs.pop('fields', None)
fields = kwargs.pop("fields", None)
# Instantiate the superclass normally
super(DynamicFieldsModelSerializer, self).__init__(*args, **kwargs)
@ -42,16 +49,19 @@ class MatchingModelSerializer(serializers.ModelSerializer):
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
def validate_match(self, match):
if 'matching_algorithm' in self.initial_data and self.initial_data['matching_algorithm'] == MatchingModel.MATCH_REGEX: # NOQA: E501
if (
"matching_algorithm" in self.initial_data
and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX
): # NOQA: E501
try:
re.compile(match)
except Exception as e:
raise serializers.ValidationError(
_("Invalid regular expression: %(error)s") %
{'error': str(e)}
_("Invalid regular expression: %(error)s") % {"error": str(e)}
)
return match
@ -70,12 +80,11 @@ class CorrespondentSerializer(MatchingModelSerializer):
"matching_algorithm",
"is_insensitive",
"document_count",
"last_correspondence"
"last_correspondence",
)
class DocumentTypeSerializer(MatchingModelSerializer):
class Meta:
model = DocumentType
fields = (
@ -85,7 +94,7 @@ class DocumentTypeSerializer(MatchingModelSerializer):
"match",
"matching_algorithm",
"is_insensitive",
"document_count"
"document_count",
)
@ -104,7 +113,7 @@ class ColorField(serializers.Field):
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
(13, "#cccccc"),
)
def to_internal_value(self, data):
@ -122,7 +131,7 @@ class ColorField(serializers.Field):
class TagSerializerVersion1(MatchingModelSerializer):
colour = ColorField(source='color', default="#a6cee3")
colour = ColorField(source="color", default="#a6cee3")
class Meta:
model = Tag
@ -135,20 +144,19 @@ class TagSerializerVersion1(MatchingModelSerializer):
"matching_algorithm",
"is_insensitive",
"is_inbox_tag",
"document_count"
"document_count",
)
class TagSerializer(MatchingModelSerializer):
def get_text_color(self, obj):
try:
h = obj.color.lstrip('#')
rgb = tuple(int(h[i:i + 2], 16)/256 for i in (0, 2, 4))
h = obj.color.lstrip("#")
rgb = tuple(int(h[i : i + 2], 16) / 256 for i in (0, 2, 4))
luminance = math.sqrt(
0.299 * math.pow(rgb[0], 2) +
0.587 * math.pow(rgb[1], 2) +
0.114 * math.pow(rgb[2], 2)
0.299 * math.pow(rgb[0], 2)
+ 0.587 * math.pow(rgb[1], 2)
+ 0.114 * math.pow(rgb[2], 2)
)
return "#ffffff" if luminance < 0.53 else "#000000"
except ValueError:
@ -168,7 +176,7 @@ class TagSerializer(MatchingModelSerializer):
"matching_algorithm",
"is_insensitive",
"is_inbox_tag",
"document_count"
"document_count",
)
def validate_color(self, color):
@ -231,7 +239,6 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
class SavedViewFilterRuleSerializer(serializers.ModelSerializer):
class Meta:
model = SavedViewFilterRule
fields = ["rule_type", "value"]
@ -244,28 +251,33 @@ class SavedViewSerializer(serializers.ModelSerializer):
class Meta:
model = SavedView
depth = 1
fields = ["id", "name", "show_on_dashboard", "show_in_sidebar",
"sort_field", "sort_reverse", "filter_rules"]
fields = [
"id",
"name",
"show_on_dashboard",
"show_in_sidebar",
"sort_field",
"sort_reverse",
"filter_rules",
]
def update(self, instance, validated_data):
if 'filter_rules' in validated_data:
rules_data = validated_data.pop('filter_rules')
if "filter_rules" in validated_data:
rules_data = validated_data.pop("filter_rules")
else:
rules_data = None
super(SavedViewSerializer, self).update(instance, validated_data)
if rules_data is not None:
SavedViewFilterRule.objects.filter(saved_view=instance).delete()
for rule_data in rules_data:
SavedViewFilterRule.objects.create(
saved_view=instance, **rule_data)
SavedViewFilterRule.objects.create(saved_view=instance, **rule_data)
return instance
def create(self, validated_data):
rules_data = validated_data.pop('filter_rules')
rules_data = validated_data.pop("filter_rules")
saved_view = SavedView.objects.create(**validated_data)
for rule_data in rules_data:
SavedViewFilterRule.objects.create(
saved_view=saved_view, **rule_data)
SavedViewFilterRule.objects.create(saved_view=saved_view, **rule_data)
return saved_view
@ -275,20 +287,19 @@ class DocumentListSerializer(serializers.Serializer):
required=True,
label="Documents",
write_only=True,
child=serializers.IntegerField()
child=serializers.IntegerField(),
)
def _validate_document_id_list(self, documents, name="documents"):
if not type(documents) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in documents]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
raise serializers.ValidationError(f"{name} must be a list of integers")
count = Document.objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
f"Some documents in {name} don't exist or were "
f"specified twice.")
f"Some documents in {name} don't exist or were " f"specified twice."
)
def validate_documents(self, documents):
self._validate_document_id_list(documents)
@ -304,7 +315,7 @@ class BulkEditSerializer(DocumentListSerializer):
"add_tag",
"remove_tag",
"modify_tags",
"delete"
"delete",
],
label="Method",
write_only=True,
@ -316,12 +327,12 @@ class BulkEditSerializer(DocumentListSerializer):
if not type(tags) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in tags]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
raise serializers.ValidationError(f"{name} must be a list of integers")
count = Tag.objects.filter(id__in=tags).count()
if not count == len(tags):
raise serializers.ValidationError(
f"Some tags in {name} don't exist or were specified twice.")
f"Some tags in {name} don't exist or were specified twice."
)
def validate_method(self, method):
if method == "set_correspondent":
@ -340,8 +351,8 @@ class BulkEditSerializer(DocumentListSerializer):
raise serializers.ValidationError("Unsupported method.")
def _validate_parameters_tags(self, parameters):
if 'tag' in parameters:
tag_id = parameters['tag']
if "tag" in parameters:
tag_id = parameters["tag"]
try:
Tag.objects.get(id=tag_id)
except Tag.DoesNotExist:
@ -350,48 +361,45 @@ class BulkEditSerializer(DocumentListSerializer):
raise serializers.ValidationError("tag not specified")
def _validate_parameters_document_type(self, parameters):
if 'document_type' in parameters:
document_type_id = parameters['document_type']
if "document_type" in parameters:
document_type_id = parameters["document_type"]
if document_type_id is None:
# None is ok
return
try:
DocumentType.objects.get(id=document_type_id)
except DocumentType.DoesNotExist:
raise serializers.ValidationError(
"Document type does not exist")
raise serializers.ValidationError("Document type does not exist")
else:
raise serializers.ValidationError("document_type not specified")
def _validate_parameters_correspondent(self, parameters):
if 'correspondent' in parameters:
correspondent_id = parameters['correspondent']
if "correspondent" in parameters:
correspondent_id = parameters["correspondent"]
if correspondent_id is None:
return
try:
Correspondent.objects.get(id=correspondent_id)
except Correspondent.DoesNotExist:
raise serializers.ValidationError(
"Correspondent does not exist")
raise serializers.ValidationError("Correspondent does not exist")
else:
raise serializers.ValidationError("correspondent not specified")
def _validate_parameters_modify_tags(self, parameters):
if "add_tags" in parameters:
self._validate_tag_id_list(parameters['add_tags'], "add_tags")
self._validate_tag_id_list(parameters["add_tags"], "add_tags")
else:
raise serializers.ValidationError("add_tags not specified")
if "remove_tags" in parameters:
self._validate_tag_id_list(parameters['remove_tags'],
"remove_tags")
self._validate_tag_id_list(parameters["remove_tags"], "remove_tags")
else:
raise serializers.ValidationError("remove_tags not specified")
def validate(self, attrs):
method = attrs['method']
parameters = attrs['parameters']
method = attrs["method"]
parameters = attrs["parameters"]
if method == bulk_edit.set_correspondent:
self._validate_parameters_correspondent(parameters)
@ -448,8 +456,7 @@ class PostDocumentSerializer(serializers.Serializer):
if not is_mime_type_supported(mime_type):
raise serializers.ValidationError(
_("File type %(type)s not supported") %
{'type': mime_type}
_("File type %(type)s not supported") % {"type": mime_type}
)
return document.name, document_data
@ -476,13 +483,11 @@ class PostDocumentSerializer(serializers.Serializer):
class BulkDownloadSerializer(DocumentListSerializer):
content = serializers.ChoiceField(
choices=["archive", "originals", "both"],
default="archive"
choices=["archive", "originals", "both"], default="archive"
)
compression = serializers.ChoiceField(
choices=["none", "deflated", "bzip2", "lzma"],
default="none"
choices=["none", "deflated", "bzip2", "lzma"], default="none"
)
def validate_compression(self, compression):
@ -492,5 +497,5 @@ class BulkDownloadSerializer(DocumentListSerializer):
"none": zipfile.ZIP_STORED,
"deflated": zipfile.ZIP_DEFLATED,
"bzip2": zipfile.ZIP_BZIP2,
"lzma": zipfile.ZIP_LZMA
"lzma": zipfile.ZIP_LZMA,
}[compression]

View File

@ -13,9 +13,11 @@ from django.utils import termcolors, timezone
from filelock import FileLock
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, \
generate_unique_filename
from ..file_handling import (
delete_empty_directories,
create_source_path_directory,
generate_unique_filename,
)
from ..models import Document, Tag, MatchingModel
@ -27,21 +29,22 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
document.tags.add(*inbox_tags)
def set_correspondent(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
def set_correspondent(
sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs,
):
if document.correspondent and not replace:
return
potential_correspondents = matching.match_correspondents(document,
classifier)
potential_correspondents = matching.match_correspondents(document, classifier)
potential_count = len(potential_correspondents)
if potential_correspondents:
@ -53,13 +56,13 @@ def set_correspondent(sender,
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"so we've opted for {selected}",
extra={'group': logging_group}
extra={"group": logging_group},
)
else:
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"not assigning any correspondent",
extra={'group': logging_group}
extra={"group": logging_group},
)
return
@ -67,7 +70,7 @@ def set_correspondent(sender,
if suggest:
if base_url:
print(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
)
@ -75,37 +78,39 @@ def set_correspondent(sender,
else:
print(
(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
) + f" [{document.pk}]"
)
+ f" [{document.pk}]"
)
print(f"Suggest correspondent {selected}")
else:
logger.info(
f"Assigning correspondent {selected} to {document}",
extra={'group': logging_group}
extra={"group": logging_group},
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
def set_document_type(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
def set_document_type(
sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs,
):
if document.document_type and not replace:
return
potential_document_type = matching.match_document_types(document,
classifier)
potential_document_type = matching.match_document_types(document, classifier)
potential_count = len(potential_document_type)
if potential_document_type:
@ -118,13 +123,13 @@ def set_document_type(sender,
logger.info(
f"Detected {potential_count} potential document types, "
f"so we've opted for {selected}",
extra={'group': logging_group}
extra={"group": logging_group},
)
else:
logger.info(
f"Detected {potential_count} potential document types, "
f"not assigning any document type",
extra={'group': logging_group}
extra={"group": logging_group},
)
return
@ -132,7 +137,7 @@ def set_document_type(sender,
if suggest:
if base_url:
print(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
)
@ -140,35 +145,39 @@ def set_document_type(sender,
else:
print(
(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
) + f" [{document.pk}]"
)
+ f" [{document.pk}]"
)
print(f"Suggest document type {selected}")
else:
logger.info(
f"Assigning document type {selected} to {document}",
extra={'group': logging_group}
extra={"group": logging_group},
)
document.document_type = selected
document.save(update_fields=("document_type",))
def set_tags(sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
suggest=False,
base_url=None,
color=False,
**kwargs):
def set_tags(
sender,
document=None,
logging_group=None,
classifier=None,
replace=False,
suggest=False,
base_url=None,
color=False,
**kwargs,
):
if replace:
Document.tags.through.objects.filter(document=document).exclude(
Q(tag__is_inbox_tag=True)).exclude(
Q(tag__is_inbox_tag=True)
).exclude(
Q(tag__match="") & ~Q(tag__matching_algorithm=Tag.MATCH_AUTO)
).delete()
@ -181,14 +190,13 @@ def set_tags(sender,
if suggest:
extra_tags = current_tags - set(matched_tags)
extra_tags = [
t for t in extra_tags
if t.matching_algorithm == MatchingModel.MATCH_AUTO
t for t in extra_tags if t.matching_algorithm == MatchingModel.MATCH_AUTO
]
if not relevant_tags and not extra_tags:
return
if base_url:
print(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
)
@ -196,15 +204,14 @@ def set_tags(sender,
else:
print(
(
termcolors.colorize(str(document), fg='green')
termcolors.colorize(str(document), fg="green")
if color
else str(document)
) + f" [{document.pk}]"
)
+ f" [{document.pk}]"
)
if relevant_tags:
print(
"Suggest tags: " + ", ".join([t.name for t in relevant_tags])
)
print("Suggest tags: " + ", ".join([t.name for t in relevant_tags]))
if extra_tags:
print("Extra tags: " + ", ".join([t.name for t in extra_tags]))
else:
@ -213,10 +220,8 @@ def set_tags(sender,
message = 'Tagging "{}" with "{}"'
logger.info(
message.format(
document, ", ".join([t.name for t in relevant_tags])
),
extra={'group': logging_group}
message.format(document, ", ".join([t.name for t in relevant_tags])),
extra={"group": logging_group},
)
document.tags.add(*relevant_tags)
@ -235,9 +240,7 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
while True:
new_file_path = os.path.join(
settings.TRASH_DIR,
old_filebase +
(f"_{counter:02}" if counter else "") +
old_fileext
old_filebase + (f"_{counter:02}" if counter else "") + old_fileext,
)
if os.path.exists(new_file_path):
@ -245,8 +248,7 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
else:
break
logger.debug(
f"Moving {instance.source_path} to trash at {new_file_path}")
logger.debug(f"Moving {instance.source_path} to trash at {new_file_path}")
try:
os.rename(instance.source_path, new_file_path)
except OSError as e:
@ -256,14 +258,15 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
)
return
for filename in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
for filename in (
instance.source_path,
instance.archive_path,
instance.thumbnail_path,
):
if filename and os.path.isfile(filename):
try:
os.unlink(filename)
logger.debug(
f"Deleted file {filename}.")
logger.debug(f"Deleted file {filename}.")
except OSError as e:
logger.warning(
f"While deleting document {str(instance)}, the file "
@ -271,14 +274,12 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
)
delete_empty_directories(
os.path.dirname(instance.source_path),
root=settings.ORIGINALS_DIR
os.path.dirname(instance.source_path), root=settings.ORIGINALS_DIR
)
if instance.has_archive_version:
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
os.path.dirname(instance.archive_path), root=settings.ARCHIVE_DIR
)
@ -289,15 +290,15 @@ class CannotMoveFilesException(Exception):
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logger.fatal(
f"Document {str(instance)}: File {old_path} has gone.")
logger.fatal(f"Document {str(instance)}: File {old_path} has gone.")
raise CannotMoveFilesException()
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logger.warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
f"since target path {new_path} already exists."
)
raise CannotMoveFilesException()
@ -333,7 +334,9 @@ def update_filename_and_move_files(sender, instance, **kwargs):
instance, archive_filename=True
)
move_archive = old_archive_filename != instance.archive_filename # NOQA: E501
move_archive = (
old_archive_filename != instance.archive_filename
) # NOQA: E501
else:
move_archive = False
@ -347,8 +350,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
os.rename(old_source_path, instance.source_path)
if move_archive:
validate_move(
instance, old_archive_path, instance.archive_path)
validate_move(instance, old_archive_path, instance.archive_path)
create_source_path_directory(instance.archive_path)
os.rename(old_archive_path, instance.archive_path)
@ -390,12 +392,16 @@ def update_filename_and_move_files(sender, instance, **kwargs):
# finally, remove any empty sub folders. This will do nothing if
# something has failed above.
if not os.path.isfile(old_source_path):
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
delete_empty_directories(
os.path.dirname(old_source_path), root=settings.ORIGINALS_DIR
)
if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
if instance.has_archive_version and not os.path.isfile(
old_archive_path
): # NOQA: E501
delete_empty_directories(
os.path.dirname(old_archive_path), root=settings.ARCHIVE_DIR
)
def set_log_entry(sender, document=None, logging_group=None, **kwargs):

View File

@ -31,12 +31,11 @@ def index_reindex(progress_bar_disable=False):
def train_classifier():
if (not Tag.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not DocumentType.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not Correspondent.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists()):
if (
not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
):
return
@ -48,28 +47,25 @@ def train_classifier():
try:
if classifier.train():
logger.info(
"Saving updated classifier model to {}...".format(
settings.MODEL_FILE)
"Saving updated classifier model to {}...".format(settings.MODEL_FILE)
)
classifier.save()
else:
logger.debug(
"Training data unchanged."
)
logger.debug("Training data unchanged.")
except Exception as e:
logger.warning(
"Classifier error: " + str(e)
)
logger.warning("Classifier error: " + str(e))
def consume_file(path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None,
task_id=None):
def consume_file(
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None,
task_id=None,
):
document = Consumer().try_consume_file(
path,
@ -78,16 +74,16 @@ def consume_file(path,
override_correspondent_id=override_correspondent_id,
override_document_type_id=override_document_type_id,
override_tag_ids=override_tag_ids,
task_id=task_id
task_id=task_id,
)
if document:
return "Success. New document id {} created".format(
document.pk
)
return "Success. New document id {} created".format(document.pk)
else:
raise ConsumerError("Unknown error: Returned document was null, but "
"no error message was given.")
raise ConsumerError(
"Unknown error: Returned document was null, but "
"no error message was given."
)
def sanity_check():
@ -96,8 +92,7 @@ def sanity_check():
messages.log_messages()
if messages.has_error():
raise SanityCheckFailedException(
"Sanity check failed with errors. See log.")
raise SanityCheckFailedException("Sanity check failed with errors. See log.")
elif messages.has_warning():
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:

View File

@ -5,7 +5,6 @@ from ..models import Document, Correspondent
class CorrespondentFactory(DjangoModelFactory):
class Meta:
model = Correspondent
@ -13,6 +12,5 @@ class CorrespondentFactory(DjangoModelFactory):
class DocumentFactory(DjangoModelFactory):
class Meta:
model = Document

View File

@ -11,7 +11,6 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
@ -27,7 +26,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
self.assertEqual(self.get_document_from_index(doc)['id'], doc.id)
self.assertEqual(self.get_document_from_index(doc)["id"], doc.id)
def test_delete_model(self):
doc = Document.objects.create(title="test")
@ -42,7 +41,9 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
def test_delete_queryset(self):
docs = []
for i in range(42):
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(
title="Many documents with the same title", checksum=f"{i:02}"
)
docs.append(doc)
index.add_or_update_document(doc)
@ -59,5 +60,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
doc = Document.objects.create(
title="test", created=timezone.datetime(2020, 4, 12)
)
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

File diff suppressed because it is too large

View File

@ -11,7 +11,6 @@ from ..models import Document
class ChecksTestCase(TestCase):
def test_changed_password_check_empty_db(self):
self.assertEqual(changed_password_check(None), [])
@ -23,8 +22,15 @@ class ChecksTestCase(TestCase):
self.assertEqual(parser_check(None), [])
with mock.patch('documents.checks.document_consumer_declaration.send') as m:
with mock.patch("documents.checks.document_consumer_declaration.send") as m:
m.return_value = []
self.assertEqual(parser_check(None), [Error("No parsers found. This is a bug. The consumer won't be "
"able to consume any documents without parsers.")])
self.assertEqual(
parser_check(None),
[
Error(
"No parsers found. This is a bug. The consumer won't be "
"able to consume any documents without parsers."
)
],
)

View File

@ -7,30 +7,60 @@ import pytest
from django.conf import settings
from django.test import TestCase, override_settings
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
from documents.classifier import (
DocumentClassifier,
IncompatibleClassifierVersionError,
load_classifier,
)
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin
class TestClassifier(DirectoriesMixin, TestCase):
def setUp(self):
super(TestClassifier, self).setUp()
self.classifier = DocumentClassifier()
def generate_test_data(self):
self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
self.c1 = Correspondent.objects.create(
name="c1", matching_algorithm=Correspondent.MATCH_AUTO
)
self.c2 = Correspondent.objects.create(name="c2")
self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO)
self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO)
self.c3 = Correspondent.objects.create(
name="c3", matching_algorithm=Correspondent.MATCH_AUTO
)
self.t1 = Tag.objects.create(
name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12
)
self.t2 = Tag.objects.create(
name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True
)
self.t3 = Tag.objects.create(
name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45
)
self.dt = DocumentType.objects.create(
name="dt", matching_algorithm=DocumentType.MATCH_AUTO
)
self.dt2 = DocumentType.objects.create(
name="dt2", matching_algorithm=DocumentType.MATCH_AUTO
)
self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
self.doc_inbox = Document.objects.create(title="doc235", content="aa", checksum="C")
self.doc1 = Document.objects.create(
title="doc1",
content="this is a document from c1",
correspondent=self.c1,
checksum="A",
document_type=self.dt,
)
self.doc2 = Document.objects.create(
title="doc1",
content="this is another document, but from c2",
correspondent=self.c2,
checksum="B",
)
self.doc_inbox = Document.objects.create(
title="doc235", content="aa", checksum="C"
)
self.doc1.tags.add(self.t1)
self.doc2.tags.add(self.t1)
@ -59,17 +89,29 @@ class TestClassifier(DirectoriesMixin, TestCase):
def testTrain(self):
self.generate_test_data()
self.classifier.train()
self.assertListEqual(list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk])
self.assertListEqual(list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk])
self.assertListEqual(
list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk]
)
self.assertListEqual(
list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk]
)
def testPredict(self):
self.generate_test_data()
self.classifier.train()
self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
self.assertEqual(
self.classifier.predict_correspondent(self.doc1.content), self.c1.pk
)
self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk])
self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk])
self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
self.assertListEqual(
self.classifier.predict_tags(self.doc1.content), [self.t1.pk]
)
self.assertListEqual(
self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk]
)
self.assertEqual(
self.classifier.predict_document_type(self.doc1.content), self.dt.pk
)
self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
def testDatasetHashing(self):
@ -90,7 +132,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
classifier2 = DocumentClassifier()
current_ver = DocumentClassifier.FORMAT_VERSION
with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
with mock.patch(
"documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver + 1
):
# assure that we won't load old classifiers.
self.assertRaises(IncompatibleClassifierVersionError, classifier2.load)
@ -112,7 +156,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
new_classifier.load()
self.assertFalse(new_classifier.train())
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
@override_settings(
MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle")
)
def test_load_and_classify(self):
self.generate_test_data()
@ -122,38 +168,67 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
def test_one_correspondent_predict(self):
c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
c1 = Correspondent.objects.create(
name="c1", matching_algorithm=Correspondent.MATCH_AUTO
)
doc1 = Document.objects.create(
title="doc1",
content="this is a document from c1",
correspondent=c1,
checksum="A",
)
self.classifier.train()
self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)
def test_one_correspondent_predict_manydocs(self):
c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B")
c1 = Correspondent.objects.create(
name="c1", matching_algorithm=Correspondent.MATCH_AUTO
)
doc1 = Document.objects.create(
title="doc1",
content="this is a document from c1",
correspondent=c1,
checksum="A",
)
doc2 = Document.objects.create(
title="doc2", content="this is a document from noone", checksum="B"
)
self.classifier.train()
self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)
self.assertIsNone(self.classifier.predict_correspondent(doc2.content))
def test_one_type_predict(self):
dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
dt = DocumentType.objects.create(
name="dt", matching_algorithm=DocumentType.MATCH_AUTO
)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
checksum="A", document_type=dt)
doc1 = Document.objects.create(
title="doc1",
content="this is a document from c1",
checksum="A",
document_type=dt,
)
self.classifier.train()
self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)
def test_one_type_predict_manydocs(self):
dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
dt = DocumentType.objects.create(
name="dt", matching_algorithm=DocumentType.MATCH_AUTO
)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
checksum="A", document_type=dt)
doc1 = Document.objects.create(
title="doc1",
content="this is a document from c1",
checksum="A",
document_type=dt,
)
doc2 = Document.objects.create(title="doc1", content="this is a document from c2",
checksum="B")
doc2 = Document.objects.create(
title="doc1", content="this is a document from c2", checksum="B"
)
self.classifier.train()
self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)
@ -162,7 +237,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
def test_one_tag_predict(self):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
doc1 = Document.objects.create(
title="doc1", content="this is a document from c1", checksum="A"
)
doc1.tags.add(t1)
self.classifier.train()
@ -171,7 +248,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
def test_one_tag_predict_unassigned(self):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
doc1 = Document.objects.create(
title="doc1", content="this is a document from c1", checksum="A"
)
self.classifier.train()
self.assertListEqual(self.classifier.predict_tags(doc1.content), [])
@ -180,7 +259,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)
doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")
doc4 = Document.objects.create(
title="doc1", content="this is a document from c4", checksum="D"
)
doc4.tags.add(t1)
doc4.tags.add(t2)
@ -191,10 +272,18 @@ class TestClassifier(DirectoriesMixin, TestCase):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B")
doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C")
doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")
doc1 = Document.objects.create(
title="doc1", content="this is a document from c1", checksum="A"
)
doc2 = Document.objects.create(
title="doc1", content="this is a document from c2", checksum="B"
)
doc3 = Document.objects.create(
title="doc1", content="this is a document from c3", checksum="C"
)
doc4 = Document.objects.create(
title="doc1", content="this is a document from c4", checksum="D"
)
doc1.tags.add(t1)
doc2.tags.add(t2)
@ -210,8 +299,12 @@ class TestClassifier(DirectoriesMixin, TestCase):
def test_one_tag_predict_multi(self):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")
doc1 = Document.objects.create(
title="doc1", content="this is a document from c1", checksum="A"
)
doc2 = Document.objects.create(
title="doc2", content="this is a document from c2", checksum="B"
)
doc1.tags.add(t1)
doc2.tags.add(t1)
@ -222,8 +315,12 @@ class TestClassifier(DirectoriesMixin, TestCase):
def test_one_tag_predict_multi_2(self):
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")
doc1 = Document.objects.create(
title="doc1", content="this is a document from c1", checksum="A"
)
doc2 = Document.objects.create(
title="doc2", content="this is a document from c2", checksum="B"
)
doc1.tags.add(t1)
self.classifier.train()
@ -240,9 +337,15 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertIsNotNone(load_classifier())
load.assert_called_once()
@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
@pytest.mark.skip(reason="Disabled caching due to high memory usage - need to investigate.")
@override_settings(
CACHES={"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}}
)
@override_settings(
MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle")
)
@pytest.mark.skip(
reason="Disabled caching due to high memory usage - need to investigate."
)
def test_load_classifier_cached(self):
classifier = load_classifier()
self.assertIsNotNone(classifier)

View File

@ -31,21 +31,14 @@ class TestAttributes(TestCase):
self.assertEqual(tuple([t.name for t in file_info.tags]), tags, filename)
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
self._test_guess_attributes_from_name(
'- weird but should not break.pdf',
None,
'- weird but should not break',
()
"- weird but should not break.pdf", None, "- weird but should not break", ()
)
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
self._test_guess_attributes_from_name(
'weird but should not break -.pdf',
None,
'weird but should not break -',
()
"weird but should not break -.pdf", None, "weird but should not break -", ()
)
@ -55,19 +48,13 @@ class TestFieldPermutations(TestCase):
"20150102030405Z",
"20150102Z",
)
valid_correspondents = [
"timmy",
"Dr. McWheelie",
"Dash Gor-don",
"ο Θερμαστής",
""
]
valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "ο Θερμαστής", ""]
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
def _test_guessed_attributes(self, filename, created=None,
correspondent=None, title=None,
tags=None):
def _test_guessed_attributes(
self, filename, created=None, correspondent=None, title=None, tags=None
):
info = FileInfo.from_filename(filename)
@ -92,13 +79,10 @@ class TestFieldPermutations(TestCase):
if tags is None:
self.assertEqual(info.tags, (), filename)
else:
self.assertEqual(
[t.name for t in info.tags], tags.split(','),
filename
)
self.assertEqual([t.name for t in info.tags], tags.split(","), filename)
def test_just_title(self):
template = '{title}.pdf'
template = "{title}.pdf"
for title in self.valid_titles:
spec = dict(title=title)
filename = template.format(**spec)
@ -109,12 +93,8 @@ class TestFieldPermutations(TestCase):
for created in self.valid_dates:
for title in self.valid_titles:
spec = {
"created": created,
"title": title
}
self._test_guessed_attributes(
template.format(**spec), **spec)
spec = {"created": created, "title": title}
self._test_guessed_attributes(template.format(**spec), **spec)
def test_invalid_date_format(self):
info = FileInfo.from_filename("06112017Z - title.pdf")
@ -127,7 +107,7 @@ class TestFieldPermutations(TestCase):
all_patt = re.compile("^.*$")
none_patt = re.compile("$a")
exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
repl1 = " - \\4 - \\1." # (empty) corrspondent, title and tags
repl1 = " - \\4 - \\1." # (empty) corrspondent, title and tags
repl2 = "\\2Z - " + repl1 # creation date + repl1
# No transformations configured (= default)
@ -137,36 +117,37 @@ class TestFieldPermutations(TestCase):
self.assertIsNone(info.created)
# Pattern doesn't match (filename unaltered)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
# Simple transformation (match all)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
# Multiple transformations configured (first pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(all_patt, "all.gif"),
(all_patt, "anotherall.gif")]):
FILENAME_PARSE_TRANSFORMS=[
(all_patt, "all.gif"),
(all_patt, "anotherall.gif"),
]
):
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
# Multiple transformations configured (second pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(none_patt, "none.gif"),
(all_patt, "anotherall.gif")]):
FILENAME_PARSE_TRANSFORMS=[
(none_patt, "none.gif"),
(all_patt, "anotherall.gif"),
]
):
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "anotherall")
class DummyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
@ -184,7 +165,6 @@ class DummyParser(DocumentParser):
class CopyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
@ -202,7 +182,6 @@ class CopyParser(DocumentParser):
class FaultyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
@ -233,8 +212,15 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):
def _assert_first_last_send_progress(self, first_status="STARTING", last_status="SUCCESS", first_progress=0, first_progress_max=100, last_progress=100, last_progress_max=100):
def _assert_first_last_send_progress(
self,
first_status="STARTING",
last_status="SUCCESS",
first_progress=0,
first_progress_max=100,
last_progress=100,
last_progress_max=100,
):
self._send_progress.assert_called()
@ -243,13 +229,17 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(args[1], first_progress_max)
self.assertEqual(args[2], first_status)
args, kwargs = self._send_progress.call_args_list[len(self._send_progress.call_args_list) - 1]
args, kwargs = self._send_progress.call_args_list[
len(self._send_progress.call_args_list) - 1
]
self.assertEqual(args[0], last_progress)
self.assertEqual(args[1], last_progress_max)
self.assertEqual(args[2], last_status)
def make_dummy_parser(self, logging_group, progress_callback=None):
return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())
return DummyParser(
logging_group, self.dirs.scratch_dir, self.get_test_archive_file()
)
def make_faulty_parser(self, logging_group, progress_callback=None):
return FaultyParser(logging_group, self.dirs.scratch_dir)
@ -259,11 +249,16 @@ class TestConsumer(DirectoriesMixin, TestCase):
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
m.return_value = [
(
None,
{
"parser": self.make_dummy_parser,
"mime_types": {"application/pdf": ".pdf"},
"weight": 0,
},
)
]
self.addCleanup(patcher.stop)
# this prevents websocket message reports during testing.
@ -274,13 +269,21 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.consumer = Consumer()
def get_test_file(self):
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
src = os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000001.pdf",
)
dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
shutil.copy(src, dst)
return dst
def get_test_archive_file(self):
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
src = os.path.join(
os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"
)
dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
shutil.copy(src, dst)
return dst
@ -292,23 +295,19 @@ class TestConsumer(DirectoriesMixin, TestCase):
document = self.consumer.try_consume_file(filename)
self.assertEqual(document.content, "The Text")
self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0])
self.assertEqual(
document.title, os.path.splitext(os.path.basename(filename))[0]
)
self.assertIsNone(document.correspondent)
self.assertIsNone(document.document_type)
self.assertEqual(document.filename, "0000001.pdf")
self.assertEqual(document.archive_filename, "0000001.pdf")
self.assertTrue(os.path.isfile(
document.source_path
))
self.assertTrue(os.path.isfile(document.source_path))
self.assertTrue(os.path.isfile(
document.thumbnail_path
))
self.assertTrue(os.path.isfile(document.thumbnail_path))
self.assertTrue(os.path.isfile(
document.archive_path
))
self.assertTrue(os.path.isfile(document.archive_path))
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
@ -330,40 +329,45 @@ class TestConsumer(DirectoriesMixin, TestCase):
document = self.consumer.try_consume_file(filename)
self.assertTrue(os.path.isfile(
document.source_path
))
self.assertTrue(os.path.isfile(document.source_path))
self.assertFalse(os.path.isfile(shadow_file))
self.assertFalse(os.path.isfile(filename))
def testOverrideFilename(self):
filename = self.get_test_file()
override_filename = "Statement for November.pdf"
document = self.consumer.try_consume_file(filename, override_filename=override_filename)
document = self.consumer.try_consume_file(
filename, override_filename=override_filename
)
self.assertEqual(document.title, "Statement for November")
self._assert_first_last_send_progress()
def testOverrideTitle(self):
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
document = self.consumer.try_consume_file(
self.get_test_file(), override_title="Override Title"
)
self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress()
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
document = self.consumer.try_consume_file(
self.get_test_file(), override_correspondent_id=c.pk
)
self.assertEqual(document.correspondent.id, c.id)
self._assert_first_last_send_progress()
def testOverrideDocumentType(self):
dt = DocumentType.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
document = self.consumer.try_consume_file(
self.get_test_file(), override_document_type_id=dt.pk
)
self.assertEqual(document.document_type.id, dt.id)
self._assert_first_last_send_progress()
@ -371,7 +375,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
t1 = Tag.objects.create(name="t1")
t2 = Tag.objects.create(name="t2")
t3 = Tag.objects.create(name="t3")
document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id])
document = self.consumer.try_consume_file(
self.get_test_file(), override_tag_ids=[t1.id, t3.id]
)
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
@ -384,7 +390,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
ConsumerError,
"File not found",
self.consumer.try_consume_file,
"non-existing-file"
"non-existing-file",
)
self._assert_first_last_send_progress(last_status="FAILED")
@ -396,7 +402,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_file()
self.get_test_file(),
)
self._assert_first_last_send_progress(last_status="FAILED")
@ -408,7 +414,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_archive_file()
self.get_test_archive_file(),
)
self._assert_first_last_send_progress(last_status="FAILED")
@ -425,25 +431,29 @@ class TestConsumer(DirectoriesMixin, TestCase):
ConsumerError,
"sample.pdf: Unsupported mime type application/pdf",
self.consumer.try_consume_file,
self.get_test_file()
self.get_test_file(),
)
self._assert_first_last_send_progress(last_status="FAILED")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
m.return_value = [
(
None,
{
"parser": self.make_faulty_parser,
"mime_types": {"application/pdf": ".pdf"},
"weight": 0,
},
)
]
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: Error while consuming document sample.pdf: Does not compute.",
self.consumer.try_consume_file,
self.get_test_file()
self.get_test_file(),
)
self._assert_first_last_send_progress(last_status="FAILED")
@ -457,7 +467,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
ConsumerError,
"sample.pdf: The following error occured while consuming sample.pdf: NO.",
self.consumer.try_consume_file,
filename
filename,
)
self._assert_first_last_send_progress(last_status="FAILED")
@ -491,7 +501,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
filenames.insert(0, f)
return f
m.side_effect = lambda f, archive_filename = False: get_filename()
m.side_effect = lambda f, archive_filename=False: get_filename()
filename = self.get_test_file()
@ -565,17 +575,37 @@ class TestConsumer(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_similar_filenames(self, m):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
m.return_value = [(None, {
"parser": CopyParser,
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
"weight": 0
})]
doc1 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
doc2 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
doc3 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"),
)
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.png"),
os.path.join(settings.CONSUMPTION_DIR, "simple.png"),
)
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png"),
os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"),
)
m.return_value = [
(
None,
{
"parser": CopyParser,
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
"weight": 0,
},
)
]
doc1 = self.consumer.try_consume_file(
os.path.join(settings.CONSUMPTION_DIR, "simple.png")
)
doc2 = self.consumer.try_consume_file(
os.path.join(settings.CONSUMPTION_DIR, "simple.pdf")
)
doc3 = self.consumer.try_consume_file(
os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf")
)
self.assertEqual(doc1.filename, "simple.png")
self.assertEqual(doc1.archive_filename, "simple.pdf")
@ -588,7 +618,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
class PreConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen")
@override_settings(PRE_CONSUME_SCRIPT=None)
def test_no_pre_consume_script(self, m):
@ -625,7 +654,6 @@ class PreConsumeTestCase(TestCase):
class PostConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
@ -662,7 +690,9 @@ class PostConsumeTestCase(TestCase):
with tempfile.NamedTemporaryFile() as script:
with override_settings(POST_CONSUME_SCRIPT=script.name):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
doc = Document.objects.create(
title="Test", mime_type="application/pdf", correspondent=c
)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)


@ -12,7 +12,9 @@ from documents.parsers import parse_date
class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
SAMPLE_FILES = os.path.join(
os.path.dirname(__file__), "../../paperless_tesseract/tests/samples"
)
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
def setUp(self):
@ -38,24 +40,15 @@ class TestDate(TestCase):
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_5(self):
text = (
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
"ipsum"
)
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " "ipsum"
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_6(self):
@ -73,18 +66,11 @@ class TestDate(TestCase):
self.assertEqual(parse_date("", text), None)
def test_date_format_7(self):
text = (
"lorem ipsum\n"
"März 2019\n"
"lorem ipsum"
)
text = "lorem ipsum\n" "März 2019\n" "lorem ipsum"
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_8(self):
@ -102,26 +88,15 @@ class TestDate(TestCase):
)
self.assertEqual(
parse_date("", text),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_9(self):
text = (
"lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum"
)
text = "lorem ipsum\n" "27. Nullmonth 2020\n" "März 2020\n" "lorem ipsum"
self.assertEqual(
parse_date("", text),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_crazy_date_past(self, *args):
@ -135,19 +110,17 @@ class TestDate(TestCase):
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_invalid(self, *args):
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
@override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
def test_ignored_dates(self, *args):
text = (
"lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
"ipsum"
self.assertIsNone(
parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")
)
@override_settings(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
)
def test_ignored_dates(self, *args):
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum"
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)


@ -10,7 +10,6 @@ from ..models import Document, Correspondent
class TestDocument(TestCase):
def setUp(self) -> None:
self.originals_dir = tempfile.mkdtemp()
self.thumb_dir = tempfile.mkdtemp()
@ -30,7 +29,7 @@ class TestDocument(TestCase):
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf"
mime_type="application/pdf",
)
file_path = document.source_path
@ -47,20 +46,36 @@ class TestDocument(TestCase):
def test_file_name(self):
doc = Document(mime_type="application/pdf", title="test", created=timezone.datetime(2020, 12, 25))
doc = Document(
mime_type="application/pdf",
title="test",
created=timezone.datetime(2020, 12, 25),
)
self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf")
def test_file_name_jpg(self):
doc = Document(mime_type="image/jpeg", title="test", created=timezone.datetime(2020, 12, 25))
doc = Document(
mime_type="image/jpeg",
title="test",
created=timezone.datetime(2020, 12, 25),
)
self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg")
def test_file_name_unknown(self):
doc = Document(mime_type="application/zip", title="test", created=timezone.datetime(2020, 12, 25))
doc = Document(
mime_type="application/zip",
title="test",
created=timezone.datetime(2020, 12, 25),
)
self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip")
def test_file_name_invalid_type(self):
doc = Document(mime_type="image/jpegasd", title="test", created=timezone.datetime(2020, 12, 25))
doc = Document(
mime_type="image/jpegasd",
title="test",
created=timezone.datetime(2020, 12, 25),
)
self.assertEqual(doc.get_public_filename(), "2020-12-25 test")


@ -13,13 +13,16 @@ from django.test import TestCase, override_settings
from django.utils import timezone
from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories, \
generate_unique_filename
from ..file_handling import (
generate_filename,
create_source_path_directory,
delete_empty_directories,
generate_unique_filename,
)
from ..models import Document, Correspondent, Tag, DocumentType
class TestFileHandling(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
@ -30,8 +33,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(document), "{:07d}.pdf".format(document.pk))
document.storage_type = Document.STORAGE_TYPE_GPG
self.assertEqual(generate_filename(document),
"{:07d}.pdf.gpg".format(document.pk))
self.assertEqual(
generate_filename(document), "{:07d}.pdf.gpg".format(document.pk)
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self):
@ -41,7 +45,10 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Test default source_path
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk))
self.assertEqual(
document.source_path,
settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk),
)
document.filename = generate_filename(document)
@ -51,8 +58,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Enable encryption and check again
document.storage_type = Document.STORAGE_TYPE_GPG
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf.gpg")
self.assertEqual(document.filename, "none/none.pdf.gpg")
document.save()
@ -68,7 +74,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test.pdf.gpg"), True)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/test/test.pdf.gpg"), True
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
@ -79,13 +87,14 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf")
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Test source_path
self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none.pdf")
self.assertEqual(
document.source_path, settings.ORIGINALS_DIR + "/none/none.pdf"
)
# Make the folder read- and execute-only (no writing and no renaming)
os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)
@ -95,7 +104,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Check proper handling of files
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True
)
self.assertEqual(document.filename, "none/none.pdf")
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@ -103,7 +114,11 @@ class TestFileHandling(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):
document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document1 = Document.objects.create(
mime_type="application/pdf",
storage_type=Document.STORAGE_TYPE_UNENCRYPTED,
checksum="AAAAA",
)
document = Document()
document.mime_type = "application/pdf"
@ -113,8 +128,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf")
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
@ -122,8 +136,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(document.source_path))
# Set a correspondent and save the document
document.correspondent = Correspondent.objects.get_or_create(
name="test")[0]
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
m.side_effect = DatabaseError()
@ -131,7 +144,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Check proper handling of files
self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), True
)
self.assertEqual(document.filename, "none/none.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@ -143,8 +158,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf")
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
@ -152,10 +166,15 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure file deletion after delete
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False
)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}", TRASH_DIR=tempfile.mkdtemp())
@override_settings(
PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}",
TRASH_DIR=tempfile.mkdtemp(),
)
def test_document_delete_trash(self):
document = Document()
document.mime_type = "application/pdf"
@ -164,8 +183,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf")
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
@ -173,7 +191,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure file was moved to trash after delete
self.assertEqual(os.path.isfile(settings.TRASH_DIR + "/none/none.pdf"), False)
document.delete()
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/none/none.pdf"), False
)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isfile(settings.TRASH_DIR + "/none.pdf"), True)
self.assertEqual(os.path.isfile(settings.TRASH_DIR + "/none_01.pdf"), False)
@ -207,8 +227,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename,
"none/none.pdf")
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
@ -238,8 +257,18 @@ class TestFileHandling(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{asn} - {title}")
def test_asn(self):
d1 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=652, checksum="A")
d2 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=None, checksum="B")
d1 = Document.objects.create(
title="the_doc",
mime_type="application/pdf",
archive_serial_number=652,
checksum="A",
)
d2 = Document.objects.create(
title="the_doc",
mime_type="application/pdf",
archive_serial_number=None,
checksum="B",
)
self.assertEqual(generate_filename(d1), "652 - the_doc.pdf")
self.assertEqual(generate_filename(d2), "none - the_doc.pdf")
@ -256,8 +285,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Ensure that filename is properly generated
self.assertEqual(generate_filename(document),
"demo.pdf")
self.assertEqual(generate_filename(document), "demo.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
@ -272,8 +300,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Ensure that filename is properly generated
self.assertEqual(generate_filename(document),
"demo.pdf")
self.assertEqual(generate_filename(document), "demo.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
@ -288,8 +315,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Ensure that filename is properly generated
self.assertEqual(generate_filename(document),
"none.pdf")
self.assertEqual(generate_filename(document), "none.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
@ -303,8 +329,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Ensure that filename is properly generated
self.assertEqual(generate_filename(document),
"demo.pdf")
self.assertEqual(generate_filename(document), "demo.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self):
@ -318,8 +343,7 @@ class TestFileHandling(DirectoriesMixin, TestCase):
document.save()
# Ensure that filename is properly generated
self.assertEqual(generate_filename(document),
"none.pdf")
self.assertEqual(generate_filename(document), "none.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags}")
def test_tags_without_args(self):
@ -338,7 +362,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(doc), "doc1 tag1,tag2.pdf")
doc = Document.objects.create(title="doc2", checksum="B", mime_type="application/pdf")
doc = Document.objects.create(
title="doc2", checksum="B", mime_type="application/pdf"
)
self.assertEqual(generate_filename(doc), "doc2.pdf")
@ -348,12 +374,19 @@ class TestFileHandling(DirectoriesMixin, TestCase):
doc.filename = generate_filename(doc)
doc.save()
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "etc", "something", "doc1.pdf"))
self.assertEqual(
doc.source_path,
os.path.join(settings.ORIGINALS_DIR, "etc", "something", "doc1.pdf"),
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}-{created_month}-{created_day}")
@override_settings(
PAPERLESS_FILENAME_FORMAT="{created_year}-{created_month}-{created_day}"
)
def test_created_year_month_day(self):
d1 = timezone.make_aware(datetime.datetime(2020, 3, 6, 1, 1, 1))
doc1 = Document.objects.create(title="doc1", mime_type="application/pdf", created=d1)
doc1 = Document.objects.create(
title="doc1", mime_type="application/pdf", created=d1
)
self.assertEqual(generate_filename(doc1), "2020-03-06.pdf")
@ -361,10 +394,14 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{added_year}-{added_month}-{added_day}")
@override_settings(
PAPERLESS_FILENAME_FORMAT="{added_year}-{added_month}-{added_day}"
)
def test_added_year_month_day(self):
d1 = timezone.make_aware(datetime.datetime(232, 1, 9, 1, 1, 1))
doc1 = Document.objects.create(title="doc1", mime_type="application/pdf", added=d1)
doc1 = Document.objects.create(
title="doc1", mime_type="application/pdf", added=d1
)
self.assertEqual(generate_filename(doc1), "232-01-09.pdf")
@ -372,7 +409,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
@override_settings(
PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}"
)
def test_nested_directory_cleanup(self):
document = Document()
document.mime_type = "application/pdf"
@ -391,7 +430,9 @@ class TestFileHandling(DirectoriesMixin, TestCase):
pk = document.pk
document.delete()
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none.pdf"), False)
self.assertEqual(
os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none.pdf"), False
)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)
@ -414,12 +455,12 @@ class TestFileHandling(DirectoriesMixin, TestCase):
Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
delete_empty_directories(
os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR
)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
self.assertEqual(os.path.isdir(
os.path.join(tmp, "notempty", "empty")), False)
self.assertEqual(os.path.isfile(os.path.join(tmp, "notempty", "file")), True)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty", "empty")), False)
@override_settings(PAPERLESS_FILENAME_FORMAT="{created/[title]")
def test_invalid_format(self):
@ -441,8 +482,12 @@ class TestFileHandling(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_duplicates(self):
document = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="A", pk=1)
document2 = Document.objects.create(mime_type="application/pdf", title="qwe", checksum="B", pk=2)
document = Document.objects.create(
mime_type="application/pdf", title="qwe", checksum="A", pk=1
)
document2 = Document.objects.create(
mime_type="application/pdf", title="qwe", checksum="B", pk=2
)
Path(document.source_path).touch()
Path(document2.source_path).touch()
document.filename = "0000001.pdf"
@ -480,11 +525,17 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(document2.filename, "qwe.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.signals.handlers.Document.objects.filter")
def test_no_update_without_change(self, m):
doc = Document.objects.create(title="document", filename="document.pdf", archive_filename="document.pdf", checksum="A", archive_checksum="B", mime_type="application/pdf")
doc = Document.objects.create(
title="document",
filename="document.pdf",
archive_filename="document.pdf",
checksum="A",
archive_checksum="B",
mime_type="application/pdf",
)
Path(doc.source_path).touch()
Path(doc.archive_path).touch()
@ -493,16 +544,20 @@ class TestFileHandling(DirectoriesMixin, TestCase):
m.assert_not_called()
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def test_create_no_format(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
doc = Document.objects.create(
mime_type="application/pdf",
filename="0000001.pdf",
checksum="A",
archive_filename="0000001.pdf",
archive_checksum="B",
)
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@ -515,21 +570,39 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc.pdf"))
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf"))
self.assertEqual(
doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc.pdf")
)
self.assertEqual(
doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_move_archive_gone(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
self.assertTrue(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@ -545,7 +618,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
Path(archive).touch()
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
Path(existing_archive_file).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@ -561,8 +641,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document_01.pdf", checksum="A",
archive_checksum="B", archive_filename="document.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="document",
filename="document_01.pdf",
checksum="A",
archive_checksum="B",
archive_filename="document.pdf",
)
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
@ -577,8 +663,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document.pdf", checksum="A",
archive_checksum="B", archive_filename="document_01.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="document",
filename="document.pdf",
checksum="A",
archive_checksum="B",
archive_filename="document_01.pdf",
)
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
@ -589,7 +681,6 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.os.rename")
def test_move_archive_error(self, m):
def fake_rename(src, dst):
if "archive" in src:
raise OSError()
@ -603,7 +694,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
m.assert_called()
self.assertTrue(os.path.isfile(original))
@ -615,9 +713,16 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
def test_move_file_gone(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
#Path(original).touch()
# Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
archive_filename="0000001.pdf",
checksum="A",
archive_checksum="B",
)
self.assertFalse(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@ -627,7 +732,6 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.os.rename")
def test_move_file_error(self, m):
def fake_rename(src, dst):
if "original" in src:
raise OSError()
@ -641,7 +745,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
archive_filename="0000001.pdf",
checksum="A",
archive_checksum="B",
)
m.assert_called()
self.assertTrue(os.path.isfile(original))
@ -655,7 +766,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc = Document.objects.create(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@ -678,8 +796,20 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
Path(original2).touch()
Path(archive).touch()
doc1 = Document.objects.create(mime_type="image/png", title="document", filename="document.png", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc2 = Document.objects.create(mime_type="application/pdf", title="0000001", filename="0000001.pdf", checksum="C")
doc1 = Document.objects.create(
mime_type="image/png",
title="document",
filename="document.png",
checksum="A",
archive_checksum="B",
archive_filename="0000001.pdf",
)
doc2 = Document.objects.create(
mime_type="application/pdf",
title="0000001",
filename="0000001.pdf",
checksum="C",
)
self.assertTrue(os.path.isfile(doc1.source_path))
self.assertTrue(os.path.isfile(doc1.archive_path))
@ -698,7 +828,14 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
doc = Document(
mime_type="application/pdf",
title="my_doc",
filename="0000001.pdf",
checksum="A",
archive_filename="0000001.pdf",
archive_checksum="B",
)
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
m.side_effect = DatabaseError()
doc.save()
@ -710,28 +847,38 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
class TestFilenameGeneration(TestCase):
@override_settings(
PAPERLESS_FILENAME_FORMAT="{title}"
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_invalid_characters(self):
doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1")
doc = Document.objects.create(
title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1"
)
self.assertEqual(generate_filename(doc), "This. is the title.pdf")
doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2")
doc = Document.objects.create(
title="my\\invalid/../title:yay",
mime_type="application/pdf",
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay.pdf")
@override_settings(
PAPERLESS_FILENAME_FORMAT="{created}"
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{created}")
def test_date(self):
doc = Document.objects.create(title="does not matter", created=timezone.make_aware(datetime.datetime(2020,5,21, 7,36,51, 153)), mime_type="application/pdf", pk=2, checksum="2")
doc = Document.objects.create(
title="does not matter",
created=timezone.make_aware(datetime.datetime(2020, 5, 21, 7, 36, 51, 153)),
mime_type="application/pdf",
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), "2020-05-21.pdf")
def run():
doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow")
doc = Document.objects.create(
checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow"
)
doc.filename = generate_unique_filename(doc)
Path(doc.thumbnail_path).touch()
with open(doc.source_path, "w") as f:


@ -6,14 +6,14 @@ from ..management.commands.document_importer import Command
class TestImporter(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
def test_check_manifest_exists(self):
cmd = Command()
self.assertRaises(
CommandError, cmd._check_manifest_exists, "/tmp/manifest.json")
CommandError, cmd._check_manifest_exists, "/tmp/manifest.json"
)
def test_check_manifest(self):
@ -23,15 +23,14 @@ class TestImporter(TestCase):
cmd.manifest = [{"model": "documents.document"}]
with self.assertRaises(CommandError) as cm:
cmd._check_manifest()
self.assertTrue(
'The manifest file contains a record' in str(cm.exception))
self.assertTrue("The manifest file contains a record" in str(cm.exception))
cmd.manifest = [{
"model": "documents.document",
EXPORTER_FILE_NAME: "noexist.pdf"
}]
cmd.manifest = [
{"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"}
]
# self.assertRaises(CommandError, cmd._check_manifest)
with self.assertRaises(CommandError) as cm:
cmd._check_manifest()
self.assertTrue(
'The manifest file refers to "noexist.pdf"' in str(cm.exception))
'The manifest file refers to "noexist.pdf"' in str(cm.exception)
)


@ -6,10 +6,11 @@ from documents.tests.utils import DirectoriesMixin
class TestAutoComplete(DirectoriesMixin, TestCase):
def test_auto_complete(self):
doc1 = Document.objects.create(title="doc1", checksum="A", content="test test2 test3")
doc1 = Document.objects.create(
title="doc1", checksum="A", content="test test2 test3"
)
doc2 = Document.objects.create(title="doc2", checksum="B", content="test test2")
doc3 = Document.objects.create(title="doc3", checksum="C", content="test2")
@ -19,7 +20,11 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
ix = index.open_index()
self.assertListEqual(index.autocomplete(ix, "tes"), [b"test3", b"test", b"test2"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=3), [b"test3", b"test", b"test2"])
self.assertListEqual(
index.autocomplete(ix, "tes"), [b"test3", b"test", b"test2"]
)
self.assertListEqual(
index.autocomplete(ix, "tes", limit=3), [b"test3", b"test", b"test2"]
)
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])


@ -22,21 +22,29 @@ sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestArchiver(DirectoriesMixin, TestCase):
def make_models(self):
return Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf")
return Document.objects.create(
checksum="A",
title="A",
content="first document",
mime_type="application/pdf",
)
def test_archiver(self):
doc = self.make_models()
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
shutil.copy(
sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf")
)
call_command('document_archiver')
call_command("document_archiver")
def test_handle_document(self):
doc = self.make_models()
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"))
shutil.copy(
sample_file, os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf")
)
handle_document(doc.pk)
@ -66,10 +74,24 @@ class TestArchiver(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_naming_priorities(self):
doc1 = Document.objects.create(checksum="A", title="document", content="first document", mime_type="application/pdf", filename="document.pdf")
doc2 = Document.objects.create(checksum="B", title="document", content="second document", mime_type="application/pdf", filename="document_01.pdf")
doc1 = Document.objects.create(
checksum="A",
title="document",
content="first document",
mime_type="application/pdf",
filename="document.pdf",
)
doc2 = Document.objects.create(
checksum="B",
title="document",
content="second document",
mime_type="application/pdf",
filename="document_01.pdf",
)
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document.pdf"))
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document_01.pdf"))
shutil.copy(
sample_file, os.path.join(self.dirs.originals_dir, f"document_01.pdf")
)
handle_document(doc2.pk)
handle_document(doc1.pk)
@ -82,12 +104,11 @@ class TestArchiver(DirectoriesMixin, TestCase):
class TestDecryptDocuments(TestCase):
@override_settings(
ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
PASSPHRASE="test",
PAPERLESS_FILENAME_FORMAT=None
PAPERLESS_FILENAME_FORMAT=None,
)
@mock.patch("documents.management.commands.decrypt_documents.input")
def test_decrypt(self, m):
@ -99,17 +120,39 @@ class TestDecryptDocuments(TestCase):
os.makedirs(thumb_dir, exist_ok=True)
override_settings(
ORIGINALS_DIR=originals_dir,
THUMBNAIL_DIR=thumb_dir,
PASSPHRASE="test"
ORIGINALS_DIR=originals_dir, THUMBNAIL_DIR=thumb_dir, PASSPHRASE="test"
).enable()
doc = Document.objects.create(checksum="82186aaa94f0b98697d704b90fd1c072", title="wow", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
doc = Document.objects.create(
checksum="82186aaa94f0b98697d704b90fd1c072",
title="wow",
filename="0000004.pdf.gpg",
mime_type="application/pdf",
storage_type=Document.STORAGE_TYPE_GPG,
)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000004.pdf.gpg"), os.path.join(originals_dir, "0000004.pdf.gpg"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", f"0000004.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"))
shutil.copy(
os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000004.pdf.gpg",
),
os.path.join(originals_dir, "0000004.pdf.gpg"),
)
shutil.copy(
os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"thumbnails",
f"0000004.png.gpg",
),
os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"),
)
call_command('decrypt_documents')
call_command("decrypt_documents")
doc.refresh_from_db()
@ -126,7 +169,6 @@ class TestDecryptDocuments(TestCase):
class TestMakeIndex(TestCase):
@mock.patch("documents.management.commands.document_index.index_reindex")
def test_reindex(self, m):
call_command("document_index", "reindex")
@ -139,7 +181,6 @@ class TestMakeIndex(TestCase):
class TestRenamer(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_rename(self):
doc = Document.objects.create(title="test", mime_type="image/jpeg")
@ -164,8 +205,9 @@ class TestRenamer(DirectoriesMixin, TestCase):
class TestCreateClassifier(TestCase):
@mock.patch("documents.management.commands.document_create_classifier.train_classifier")
@mock.patch(
"documents.management.commands.document_create_classifier.train_classifier"
)
def test_create_classifier(self, m):
call_command("document_create_classifier")
@ -173,7 +215,6 @@ class TestCreateClassifier(TestCase):
class TestSanityChecker(DirectoriesMixin, TestCase):
def test_no_issues(self):
with self.assertLogs() as capture:
call_command("document_sanity_checker")
@ -182,7 +223,9 @@ class TestSanityChecker(DirectoriesMixin, TestCase):
self.assertIn("Sanity checker detected no issues.", capture.output[0])
def test_errors(self):
doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc")
doc = Document.objects.create(
title="test", content="test", filename="test.pdf", checksum="abc"
)
Path(doc.source_path).touch()
Path(doc.thumbnail_path).touch()


@ -16,7 +16,6 @@ from documents.tests.utils import DirectoriesMixin
class ConsumerThread(Thread):
def __init__(self):
super().__init__()
self.cmd = document_consumer.Command()
@ -31,7 +30,7 @@ class ConsumerThread(Thread):
def chunked(size, source):
for i in range(0, len(source), size):
yield source[i:i+size]
yield source[i : i + size]
class ConsumerMixin:
@ -41,7 +40,9 @@ class ConsumerMixin:
def setUp(self) -> None:
super(ConsumerMixin, self).setUp()
self.t = None
patcher = mock.patch("documents.management.commands.document_consumer.async_task")
patcher = mock.patch(
"documents.management.commands.document_consumer.async_task"
)
self.task_mock = patcher.start()
self.addCleanup(patcher.stop)
@ -81,13 +82,13 @@ class ConsumerMixin:
print("Consumed a perfectly valid file.")
def slow_write_file(self, target, incomplete=False):
with open(self.sample_file, 'rb') as f:
with open(self.sample_file, "rb") as f:
pdf_bytes = f.read()
if incomplete:
pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100]
pdf_bytes = pdf_bytes[: len(pdf_bytes) - 100]
with open(target, 'wb') as f:
with open(target, "wb") as f:
# this will take 2 seconds, since the file is about 20k.
print("Start writing file.")
for b in chunked(1000, pdf_bytes):
@ -97,7 +98,6 @@ class ConsumerMixin:
class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
def test_consume_file(self):
self.t_start()
@ -195,23 +195,35 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(CONSUMPTION_DIR="does_not_exist")
def test_consumption_directory_invalid(self):
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")
@override_settings(CONSUMPTION_DIR="")
def test_consumption_directory_unset(self):
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")
def test_mac_write(self):
self.task_mock.side_effect = self.bogus_task
self.t_start()
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, ".DS_STORE"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_second_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_second_file.pdf"))
shutil.copy(
self.sample_file, os.path.join(self.dirs.consumption_dir, ".DS_STORE")
)
shutil.copy(
self.sample_file, os.path.join(self.dirs.consumption_dir, "my_file.pdf")
)
shutil.copy(
self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_file.pdf")
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "my_second_file.pdf"),
)
shutil.copy(
self.sample_file,
os.path.join(self.dirs.consumption_dir, "._my_second_file.pdf"),
)
sleep(5)
@ -219,15 +231,20 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.assertEqual(2, self.task_mock.call_count)
fnames = [os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list]
fnames = [
os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list
]
self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
def test_is_ignored(self):
test_paths = [
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
(os.path.join(self.dirs.consumption_dir, "foo","bar.pdf"), False),
(os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"), True),
(
os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
True,
),
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
@ -236,10 +253,13 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.assertEqual(
expected_ignored,
document_consumer._is_ignored(file_path),
f'_is_ignored("{file_path}") != {expected_ignored}')
f'_is_ignored("{file_path}") != {expected_ignored}',
)
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=3, CONSUMER_POLLING_RETRY_COUNT=20)
@override_settings(
CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=3, CONSUMER_POLLING_RETRY_COUNT=20
)
class TestConsumerPolling(TestConsumer):
# just do all the tests with polling
pass
@ -251,21 +271,27 @@ class TestConsumerRecursive(TestConsumer):
pass
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=3, CONSUMER_POLLING_RETRY_COUNT=20)
@override_settings(
CONSUMER_RECURSIVE=True,
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
class TestConsumerRecursivePolling(TestConsumer):
# just do all the tests with polling and recursive mode
pass
class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_SUBDIRS_AS_TAGS=True)
def test_consume_file_with_path_tags(self):
tag_names = ("existingTag", "Space Tag")
# Create a Tag prior to consuming a file using it in path
tag_ids = [Tag.objects.create(name="existingtag").pk,]
tag_ids = [
Tag.objects.create(name="existingtag").pk,
]
self.t_start()
@ -292,6 +318,8 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
# their order.
self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
@override_settings(
CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20
)
def test_consume_file_with_path_tags_polling(self):
self.test_consume_file_with_path_tags()

View File

@ -17,15 +17,41 @@ from documents.tests.utils import DirectoriesMixin, paperless_environment
class TestExportImport(DirectoriesMixin, TestCase):
def setUp(self) -> None:
self.target = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, self.target)
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")
self.d4 = Document.objects.create(content="Content", checksum="82186aaa94f0b98697d704b90fd1c072", title="wow_dec", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b",
title="wow1",
filename="0000001.pdf",
mime_type="application/pdf",
archive_filename="0000001.pdf",
)
self.d2 = Document.objects.create(
content="Content",
checksum="9c9691e51741c1f4f41a20896af31770",
title="wow2",
filename="0000002.pdf",
mime_type="application/pdf",
)
self.d3 = Document.objects.create(
content="Content",
checksum="d38d7ed02e988e072caf924e0f3fcb76",
title="wow2",
filename="0000003.pdf",
mime_type="application/pdf",
)
self.d4 = Document.objects.create(
content="Content",
checksum="82186aaa94f0b98697d704b90fd1c072",
title="wow_dec",
filename="0000004.pdf.gpg",
mime_type="application/pdf",
storage_type=Document.STORAGE_TYPE_GPG,
)
self.t1 = Tag.objects.create(name="t")
self.dt1 = DocumentType.objects.create(name="dt")
@ -38,17 +64,21 @@ class TestExportImport(DirectoriesMixin, TestCase):
super(TestExportImport, self).setUp()
def _get_document_from_manifest(self, manifest, id):
f = list(filter(lambda d: d['model'] == "documents.document" and d['pk'] == id, manifest))
f = list(
filter(
lambda d: d["model"] == "documents.document" and d["pk"] == id, manifest
)
)
if len(f) == 1:
return f[0]
else:
raise ValueError(f"document with id {id} does not exist in manifest")
@override_settings(
PASSPHRASE="test"
)
def _do_export(self, use_filename_format=False, compare_checksums=False, delete=False):
args = ['document_exporter', self.target]
@override_settings(PASSPHRASE="test")
def _do_export(
self, use_filename_format=False, compare_checksums=False, delete=False
):
args = ["document_exporter", self.target]
if use_filename_format:
args += ["--use-filename-format"]
if compare_checksums:
@ -65,39 +95,69 @@ class TestExportImport(DirectoriesMixin, TestCase):
def test_exporter(self, use_filename_format=False):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 8)
self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 4)
self.assertEqual(
len(list(filter(lambda e: e["model"] == "documents.document", manifest))), 4
)
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
self.assertEqual(self._get_document_from_manifest(manifest, self.d1.id)['fields']['title'], "wow1")
self.assertEqual(self._get_document_from_manifest(manifest, self.d2.id)['fields']['title'], "wow2")
self.assertEqual(self._get_document_from_manifest(manifest, self.d3.id)['fields']['title'], "wow2")
self.assertEqual(self._get_document_from_manifest(manifest, self.d4.id)['fields']['title'], "wow_dec")
self.assertEqual(
self._get_document_from_manifest(manifest, self.d1.id)["fields"]["title"],
"wow1",
)
self.assertEqual(
self._get_document_from_manifest(manifest, self.d2.id)["fields"]["title"],
"wow2",
)
self.assertEqual(
self._get_document_from_manifest(manifest, self.d3.id)["fields"]["title"],
"wow2",
)
self.assertEqual(
self._get_document_from_manifest(manifest, self.d4.id)["fields"]["title"],
"wow_dec",
)
for element in manifest:
if element['model'] == 'documents.document':
fname = os.path.join(self.target, element[document_exporter.EXPORTER_FILE_NAME])
if element["model"] == "documents.document":
fname = os.path.join(
self.target, element[document_exporter.EXPORTER_FILE_NAME]
)
self.assertTrue(os.path.exists(fname))
self.assertTrue(os.path.exists(os.path.join(self.target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))
self.assertTrue(
os.path.exists(
os.path.join(
self.target,
element[document_exporter.EXPORTER_THUMBNAIL_NAME],
)
)
)
with open(fname, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element['fields']['checksum'])
self.assertEqual(checksum, element["fields"]["checksum"])
self.assertEqual(element['fields']['storage_type'], Document.STORAGE_TYPE_UNENCRYPTED)
self.assertEqual(
element["fields"]["storage_type"], Document.STORAGE_TYPE_UNENCRYPTED
)
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
fname = os.path.join(self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
fname = os.path.join(
self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME]
)
self.assertTrue(os.path.exists(fname))
with open(fname, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element['fields']['archive_checksum'])
self.assertEqual(checksum, element["fields"]["archive_checksum"])
with paperless_environment() as dirs:
self.assertEqual(Document.objects.count(), 4)
@ -107,7 +167,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
Tag.objects.all().delete()
self.assertEqual(Document.objects.count(), 0)
call_command('document_importer', self.target)
call_command("document_importer", self.target)
self.assertEqual(Document.objects.count(), 4)
self.assertEqual(Tag.objects.count(), 1)
self.assertEqual(Correspondent.objects.count(), 1)
@ -122,21 +182,31 @@ class TestExportImport(DirectoriesMixin, TestCase):
def test_exporter_with_filename_format(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
with override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"):
with override_settings(
PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"
):
self.test_exporter(use_filename_format=True)
def test_update_export_changed_time(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
self._do_export()
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
st_mtime_1 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2"
) as m:
self._do_export()
m.assert_not_called()
@ -145,7 +215,9 @@ class TestExportImport(DirectoriesMixin, TestCase):
Path(self.d1.source_path).touch()
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2"
) as m:
self._do_export()
self.assertEqual(m.call_count, 1)
@ -157,13 +229,18 @@ class TestExportImport(DirectoriesMixin, TestCase):
def test_update_export_changed_checksum(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
self._do_export()
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2"
) as m:
self._do_export()
m.assert_not_called()
@ -172,7 +249,9 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.d2.checksum = "asdfasdgf3"
self.d2.save()
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2"
) as m:
self._do_export(compare_checksums=True)
self.assertEqual(m.call_count, 1)
@ -180,28 +259,48 @@ class TestExportImport(DirectoriesMixin, TestCase):
def test_update_export_deleted_document(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
manifest = self._do_export()
self.assertTrue(len(manifest), 7)
doc_from_manifest = self._get_document_from_manifest(manifest, self.d3.id)
self.assertTrue(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))
self.assertTrue(
os.path.isfile(
os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])
)
)
self.d3.delete()
manifest = self._do_export()
self.assertRaises(ValueError, self._get_document_from_manifest, manifest, self.d3.id)
self.assertTrue(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))
self.assertRaises(
ValueError, self._get_document_from_manifest, manifest, self.d3.id
)
self.assertTrue(
os.path.isfile(
os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])
)
)
manifest = self._do_export(delete=True)
self.assertFalse(os.path.isfile(os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])))
self.assertFalse(
os.path.isfile(
os.path.join(self.target, doc_from_manifest[EXPORTER_FILE_NAME])
)
)
self.assertTrue(len(manifest), 6)
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}/{correspondent}")
def test_update_export_changed_location(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
m = self._do_export(use_filename_format=True)
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
@ -216,11 +315,18 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(os.path.join(self.target, "new_title", "c.pdf")))
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow2", "none.pdf")))
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow2", "none_01.pdf")))
self.assertTrue(
os.path.isfile(os.path.join(self.target, "wow2", "none_01.pdf"))
)
def test_export_missing_files(self):
target = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, target)
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", mime_type="application/pdf")
self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
Document.objects.create(
checksum="AAAAAAAAAAAAAAAAA",
title="wow",
filename="0000004.pdf",
mime_type="application/pdf",
)
self.assertRaises(FileNotFoundError, call_command, "document_exporter", target)

View File

@ -6,44 +6,64 @@ from documents.tests.utils import DirectoriesMixin
class TestRetagger(DirectoriesMixin, TestCase):
def make_models(self):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
self.d4 = Document.objects.create(checksum="D", title="D", content="auto document")
self.d1 = Document.objects.create(
checksum="A", title="A", content="first document"
)
self.d2 = Document.objects.create(
checksum="B", title="B", content="second document"
)
self.d3 = Document.objects.create(
checksum="C", title="C", content="unrelated document"
)
self.d4 = Document.objects.create(
checksum="D", title="D", content="auto document"
)
self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
self.tag_first = Tag.objects.create(
name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY
)
self.tag_second = Tag.objects.create(
name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY
)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")
self.tag_auto = Tag.objects.create(name="tagauto", matching_algorithm=Tag.MATCH_AUTO)
self.tag_auto = Tag.objects.create(
name="tagauto", matching_algorithm=Tag.MATCH_AUTO
)
self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)
self.d4.tags.add(self.tag_auto)
self.correspondent_first = Correspondent.objects.create(
name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY
)
self.correspondent_second = Correspondent.objects.create(
name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY)
name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY
)
self.doctype_first = DocumentType.objects.create(
name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY)
name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY
)
self.doctype_second = DocumentType.objects.create(
name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)
name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY
)
def get_updated_docs(self):
return Document.objects.get(title="A"), Document.objects.get(title="B"), \
Document.objects.get(title="C"), Document.objects.get(title="D")
return (
Document.objects.get(title="A"),
Document.objects.get(title="B"),
Document.objects.get(title="C"),
Document.objects.get(title="D"),
)
def setUp(self) -> None:
super(TestRetagger, self).setUp()
self.make_models()
def test_add_tags(self):
call_command('document_retagger', '--tags')
call_command("document_retagger", "--tags")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 1)
@ -55,14 +75,14 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_second.tags.first(), self.tag_second)
def test_add_type(self):
call_command('document_retagger', '--document_type')
call_command("document_retagger", "--document_type")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, self.doctype_first)
self.assertEqual(d_second.document_type, self.doctype_second)
def test_add_correspondent(self):
call_command('document_retagger', '--correspondent')
call_command("document_retagger", "--correspondent")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, self.correspondent_first)
@ -71,19 +91,26 @@ class TestRetagger(DirectoriesMixin, TestCase):
def test_overwrite_preserve_inbox(self):
self.d1.tags.add(self.tag_second)
call_command('document_retagger', '--tags', '--overwrite')
call_command("document_retagger", "--tags", "--overwrite")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])
self.assertCountEqual(
[tag.id for tag in d_first.tags.all()], [self.tag_first.id]
)
self.assertCountEqual(
[tag.id for tag in d_second.tags.all()], [self.tag_second.id]
)
self.assertCountEqual(
[tag.id for tag in d_unrelated.tags.all()],
[self.tag_inbox.id, self.tag_no_match.id],
)
self.assertEqual(d_auto.tags.count(), 0)
def test_add_tags_suggest(self):
call_command('document_retagger', '--tags', '--suggest')
call_command("document_retagger", "--tags", "--suggest")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
@ -91,21 +118,23 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest(self):
call_command('document_retagger', '--document_type', '--suggest')
call_command("document_retagger", "--document_type", "--suggest")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest(self):
call_command('document_retagger', '--correspondent', '--suggest')
call_command("document_retagger", "--correspondent", "--suggest")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)
def test_add_tags_suggest_url(self):
call_command('document_retagger', '--tags', '--suggest', '--base-url=http://localhost')
call_command(
"document_retagger", "--tags", "--suggest", "--base-url=http://localhost"
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
@ -113,14 +142,24 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest_url(self):
call_command('document_retagger', '--document_type', '--suggest', '--base-url=http://localhost')
call_command(
"document_retagger",
"--document_type",
"--suggest",
"--base-url=http://localhost",
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest_url(self):
call_command('document_retagger', '--correspondent', '--suggest', '--base-url=http://localhost')
call_command(
"document_retagger",
"--correspondent",
"--suggest",
"--base-url=http://localhost",
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)

View File

@ -12,7 +12,6 @@ from documents.tests.utils import DirectoriesMixin
class TestManageSuperUser(DirectoriesMixin, TestCase):
def reset_environment(self):
if "PAPERLESS_ADMIN_USER" in os.environ:
del os.environ["PAPERLESS_ADMIN_USER"]

View File

@ -11,13 +11,30 @@ from documents.tests.utils import DirectoriesMixin
class TestMakeThumbnails(DirectoriesMixin, TestCase):
def make_models(self):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document", mime_type="application/pdf", filename="test.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.d1.source_path)
self.d1 = Document.objects.create(
checksum="A",
title="A",
content="first document",
mime_type="application/pdf",
filename="test.pdf",
)
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
self.d1.source_path,
)
self.d2 = Document.objects.create(checksum="Ass", title="A", content="first document", mime_type="application/pdf", filename="test2.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.d2.source_path)
self.d2 = Document.objects.create(
checksum="Ass",
title="A",
content="first document",
mime_type="application/pdf",
filename="test2.pdf",
)
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
self.d2.source_path,
)
def setUp(self) -> None:
super(TestMakeThumbnails, self).setUp()
@ -40,13 +57,13 @@ class TestMakeThumbnails(DirectoriesMixin, TestCase):
def test_command(self):
self.assertFalse(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))
call_command('document_thumbnails')
call_command("document_thumbnails")
self.assertTrue(os.path.isfile(self.d1.thumbnail_path))
self.assertTrue(os.path.isfile(self.d2.thumbnail_path))
def test_command_documentid(self):
self.assertFalse(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))
call_command('document_thumbnails', '-d', f"{self.d1.id}")
call_command("document_thumbnails", "-d", f"{self.d1.id}")
self.assertTrue(os.path.isfile(self.d1.thumbnail_path))
self.assertFalse(os.path.isfile(self.d2.thumbnail_path))

View File

@ -12,25 +12,24 @@ from ..signals import document_consumption_finished
class TestMatching(TestCase):
def _test_matching(self, text, algorithm, true, false):
for klass in (Tag, Correspondent, DocumentType):
instance = klass.objects.create(
name=str(randint(10000, 99999)),
match=text,
matching_algorithm=getattr(klass, algorithm)
matching_algorithm=getattr(klass, algorithm),
)
for string in true:
doc = Document(content=string)
self.assertTrue(
matching.matches(instance, doc),
'"%s" should match "%s" but it does not' % (text, string)
'"%s" should match "%s" but it does not' % (text, string),
)
for string in false:
doc = Document(content=string)
self.assertFalse(
matching.matches(instance, doc),
'"%s" should not match "%s" but it does' % (text, string)
'"%s" should not match "%s" but it does' % (text, string),
)
def test_match_all(self):
@ -47,15 +46,13 @@ class TestMatching(TestCase):
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
"I have bravo in me",
)
),
)
self._test_matching(
"12 34 56",
"MATCH_ALL",
(
"I have 12 34, and 56 in me",
),
("I have 12 34, and 56 in me",),
(
"I have 12 in me",
"I have 34 in me",
@ -64,7 +61,7 @@ class TestMatching(TestCase):
"I have 120, 34, and 56 in me",
"I have 123456 in me",
"I have 01234567 in me",
)
),
)
self._test_matching(
@ -79,7 +76,7 @@ class TestMatching(TestCase):
"the quick brown wolf jumped over the lazy dogs",
"the quick brown fox jumped over the fat dogs",
"the quick brown fox jumped over the lazy... dogs",
)
),
)
def test_match_any(self):
@ -97,7 +94,7 @@ class TestMatching(TestCase):
(
"I have alphas in me",
"I have bravo in me",
)
),
)
self._test_matching(
@ -114,7 +111,7 @@ class TestMatching(TestCase):
(
"I have 123456 in me",
"I have 01234567 in me",
)
),
)
self._test_matching(
@ -124,9 +121,7 @@ class TestMatching(TestCase):
"the quick brown fox",
"jumped over the lazy dogs.",
),
(
"the lazy fox jumped over the brown dogs",
)
("the lazy fox jumped over the brown dogs",),
)
def test_match_literal(self):
@ -134,9 +129,7 @@ class TestMatching(TestCase):
self._test_matching(
"alpha charlie gamma",
"MATCH_LITERAL",
(
"I have 'alpha charlie gamma' in me",
),
("I have 'alpha charlie gamma' in me",),
(
"I have alpha in me",
"I have charlie in me",
@ -146,15 +139,13 @@ class TestMatching(TestCase):
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
"I have bravo in me",
)
),
)
self._test_matching(
"12 34 56",
"MATCH_LITERAL",
(
"I have 12 34 56 in me",
),
("I have 12 34 56 in me",),
(
"I have 12 in me",
"I have 34 in me",
@ -165,7 +156,7 @@ class TestMatching(TestCase):
"I have 120, 340, and 560 in me",
"I have 123456 in me",
"I have 01234567 in me",
)
),
)
def test_match_regex(self):
@ -186,18 +177,11 @@ class TestMatching(TestCase):
"I have alpha, charlie, and gamma in me",
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
)
),
)
def test_tach_invalid_regex(self):
self._test_matching(
"[[",
"MATCH_REGEX",
[],
[
"Don't match this"
]
)
self._test_matching("[[", "MATCH_REGEX", [], ["Don't match this"])
def test_match_fuzzy(self):
@ -210,9 +194,7 @@ class TestMatching(TestCase):
"1220 Main Street, Springfeld, Miss.",
"1220 Main Street Springfield Miss",
),
(
"1220 Main Street, Springfield, Mich.",
)
("1220 Main Street, Springfield, Mich.",),
)
@ -225,9 +207,10 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
def setUp(self):
TestCase.setUp(self)
User.objects.create_user(username='test_consumer', password='12345')
User.objects.create_user(username="test_consumer", password="12345")
self.doc_contains = Document.objects.create(
content="I contain the keyword.", mime_type="application/pdf")
content="I contain the keyword.", mime_type="application/pdf"
)
self.index_dir = tempfile.mkdtemp()
# TODO: we should not need the index here.
@ -238,40 +221,43 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
def test_tag_applied_any(self):
t1 = Tag.objects.create(
name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
sender=self.__class__, document=self.doc_contains
)
self.assertTrue(list(self.doc_contains.tags.all()) == [t1])
def test_tag_not_applied(self):
Tag.objects.create(
name="test", match="no-match", matching_algorithm=Tag.MATCH_ANY)
name="test", match="no-match", matching_algorithm=Tag.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
sender=self.__class__, document=self.doc_contains
)
self.assertTrue(list(self.doc_contains.tags.all()) == [])
def test_correspondent_applied(self):
correspondent = Correspondent.objects.create(
name="test",
match="keyword",
matching_algorithm=Correspondent.MATCH_ANY
name="test", match="keyword", matching_algorithm=Correspondent.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
sender=self.__class__, document=self.doc_contains
)
self.assertTrue(self.doc_contains.correspondent == correspondent)
def test_correspondent_not_applied(self):
Tag.objects.create(
name="test",
match="no-match",
matching_algorithm=Correspondent.MATCH_ANY
name="test", match="no-match", matching_algorithm=Correspondent.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
sender=self.__class__, document=self.doc_contains
)
self.assertEqual(self.doc_contains.correspondent, None)
def test_logentry_created(self):
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
sender=self.__class__, document=self.doc_contains
)
self.assertEqual(LogEntry.objects.count(), 1)

View File

@ -24,20 +24,14 @@ def archive_path_old(self):
else:
fname = "{:07}.pdf".format(self.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
return os.path.join(settings.ARCHIVE_DIR, fname)
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
if doc.archive_filename is not None:
return os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename))
else:
return None
def source_path(doc):
@ -48,10 +42,7 @@ def source_path(doc):
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
def thumbnail_path(doc):
@ -59,13 +50,18 @@ def thumbnail_path(doc):
if doc.storage_type == STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.THUMBNAIL_DIR,
file_name
)
return os.path.join(settings.THUMBNAIL_DIR, file_name)
def make_test_document(document_class, title: str, mime_type: str, original: str, original_filename: str, archive: str = None, archive_filename: str = None):
def make_test_document(
document_class,
title: str,
mime_type: str,
original: str,
original_filename: str,
archive: str = None,
archive_filename: str = None,
):
doc = document_class()
doc.filename = original_filename
doc.title = title
@ -96,8 +92,12 @@ def make_test_document(document_class, title: str, mime_type: str, original: str
simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf")
simple_pdf3 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000003.pdf")
simple_pdf2 = os.path.join(
os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf"
)
simple_pdf3 = os.path.join(
os.path.dirname(__file__), "samples", "documents", "originals", "0000003.pdf"
)
simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt")
simple_png = os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png")
simple_png2 = os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@ -106,26 +106,52 @@ simple_png2 = os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
migrate_from = "1011_auto_20210101_2340"
migrate_to = "1012_fix_archive_files"
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf3, "unrelated.pdf", simple_pdf)
self.no_text = make_test_document(Document, "no-text", "image/png", simple_png2, "no-text.png", simple_pdf)
self.doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
self.clash1 = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
self.clash2 = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf)
self.clash3 = make_test_document(Document, "clash", "image/png", simple_png, "clash.png", simple_pdf)
self.clash4 = make_test_document(Document, "clash.png", "application/pdf", simple_pdf2, "clash.png.pdf", simple_pdf2)
self.unrelated = make_test_document(
Document,
"unrelated",
"application/pdf",
simple_pdf3,
"unrelated.pdf",
simple_pdf,
)
self.no_text = make_test_document(
Document, "no-text", "image/png", simple_png2, "no-text.png", simple_pdf
)
self.doc_no_archive = make_test_document(
Document, "no_archive", "text/plain", simple_txt, "no_archive.txt"
)
self.clash1 = make_test_document(
Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf
)
self.clash2 = make_test_document(
Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf
)
self.clash3 = make_test_document(
Document, "clash", "image/png", simple_png, "clash.png", simple_pdf
)
self.clash4 = make_test_document(
Document,
"clash.png",
"application/pdf",
simple_pdf2,
"clash.png.pdf",
simple_pdf2,
)
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash2))
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash3))
self.assertNotEqual(archive_path_old(self.clash1), archive_path_old(self.clash4))
self.assertNotEqual(
archive_path_old(self.clash1), archive_path_old(self.clash4)
)
def testArchiveFilesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
Document = self.apps.get_model("documents", "Document")
for doc in Document.objects.all():
if doc.archive_checksum:
@ -144,31 +170,65 @@ class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 6)
self.assertEqual(
Document.objects.filter(archive_checksum__isnull=False).count(), 6
)
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, f"{self.clash1.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, f"{self.clash2.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, f"{self.clash3.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
Document = self.apps.get_model("documents", "Document")
self.assertEqual(
Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf"
)
self.assertEqual(
Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf"
)
self.assertEqual(
Document.objects.get(id=self.doc_no_archive.id).archive_filename, None
)
self.assertEqual(
Document.objects.get(id=self.clash1.id).archive_filename,
f"{self.clash1.id:07}.pdf",
)
self.assertEqual(
Document.objects.get(id=self.clash2.id).archive_filename,
f"{self.clash2.id:07}.pdf",
)
self.assertEqual(
Document.objects.get(id=self.clash3.id).archive_filename,
f"{self.clash3.id:07}.pdf",
)
self.assertEqual(
Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf"
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, "none/clash.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, "none/clash_01.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, "none/clash_02.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
Document = self.apps.get_model("documents", "Document")
self.assertEqual(
Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf"
)
self.assertEqual(
Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf"
)
self.assertEqual(
Document.objects.get(id=self.doc_no_archive.id).archive_filename, None
)
self.assertEqual(
Document.objects.get(id=self.clash1.id).archive_filename, "none/clash.pdf"
)
self.assertEqual(
Document.objects.get(id=self.clash2.id).archive_filename,
"none/clash_01.pdf",
)
self.assertEqual(
Document.objects.get(id=self.clash3.id).archive_filename,
"none/clash_02.pdf",
)
self.assertEqual(
Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf"
)
def fake_parse_wrapper(parser, path, mime_type, file_name):
@ -179,34 +239,63 @@ def fake_parse_wrapper(parser, path, mime_type, file_name):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
migrate_from = "1011_auto_20210101_2340"
migrate_to = "1012_fix_archive_files"
auto_migrate = False
def test_archive_missing(self):
Document = self.apps.get_model("documents", "Document")
doc = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
doc = make_test_document(
Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf
)
os.unlink(archive_path_old(doc))
self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
self.assertRaisesMessage(
ValueError, "does not exist at: ", self.performMigration
)
def test_parser_missing(self):
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
doc1 = make_test_document(
Document,
"document",
"invalid/typesss768",
simple_png,
"document.png",
simple_pdf,
)
doc2 = make_test_document(
Document,
"document",
"invalid/typesss768",
simple_jpg,
"document.jpg",
simple_pdf,
)
self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
self.assertRaisesMessage(
ValueError, "no parsers are available", self.performMigration
)
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_error(self, m):
m.side_effect = ParseError()
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
doc1 = make_test_document(
Document, "document", "image/png", simple_png, "document.png", simple_pdf
)
doc2 = make_test_document(
Document,
"document",
"application/pdf",
simple_jpg,
"document.jpg",
simple_pdf,
)
self.assertIsNotNone(doc1.archive_checksum)
self.assertIsNotNone(doc2.archive_checksum)
@ -217,12 +306,29 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
self.assertEqual(m.call_count, 6)
self.assertEqual(
len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
4)
len(
list(
filter(
lambda log: "Parse error, will try again in 5 seconds" in log,
capture.output,
)
)
),
4,
)
self.assertEqual(
len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
2)
len(
list(
filter(
lambda log: "Unable to regenerate archive document for ID:"
in log,
capture.output,
)
)
),
2,
)
Document = self.apps.get_model("documents", "Document")
@ -240,15 +346,33 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
doc1 = make_test_document(
Document, "document", "image/png", simple_png, "document.png", simple_pdf
)
doc2 = make_test_document(
Document,
"document",
"application/pdf",
simple_jpg,
"document.jpg",
simple_pdf,
)
with self.assertLogs() as capture:
self.performMigration()
self.assertEqual(
len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
2)
len(
list(
filter(
lambda log: "Parser did not return an archive document for document"
in log,
capture.output,
)
)
),
2,
)
Document = self.apps.get_model("documents", "Document")
@ -264,19 +388,37 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
migrate_from = "1012_fix_archive_files"
migrate_to = "1011_auto_20210101_2340"
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc_unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf2, "unrelated.txt", simple_pdf2, "unrelated.pdf")
doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_02.pdf")
doc_unrelated = make_test_document(
Document,
"unrelated",
"application/pdf",
simple_pdf2,
"unrelated.txt",
simple_pdf2,
"unrelated.pdf",
)
doc_no_archive = make_test_document(
Document, "no_archive", "text/plain", simple_txt, "no_archive.txt"
)
clashB = make_test_document(
Document,
"clash",
"image/jpeg",
simple_jpg,
"clash.jpg",
simple_pdf,
"clash_02.pdf",
)
def testArchiveFilesReverted(self):
Document = self.apps.get_model('documents', 'Document')
Document = self.apps.get_model("documents", "Document")
for doc in Document.objects.all():
if doc.archive_checksum:
@ -291,35 +433,77 @@ class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 2)
self.assertEqual(
Document.objects.filter(archive_checksum__isnull=False).count(), 2
)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesBackwardsWithFilenameFormat(TestMigrateArchiveFilesBackwards):
class TestMigrateArchiveFilesBackwardsWithFilenameFormat(
TestMigrateArchiveFilesBackwards
):
pass
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwardsErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
migrate_from = "1012_fix_archive_files"
migrate_to = "1011_auto_20210101_2340"
auto_migrate = False
def test_filename_clash(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash_02.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.clashA = make_test_document(
Document,
"clash",
"application/pdf",
simple_pdf,
"clash.pdf",
simple_pdf,
"clash_02.pdf",
)
self.clashB = make_test_document(
Document,
"clash",
"image/jpeg",
simple_jpg,
"clash.jpg",
simple_pdf,
"clash_01.pdf",
)
self.assertRaisesMessage(ValueError, "would clash with another archive filename", self.performMigration)
self.assertRaisesMessage(
ValueError,
"would clash with another archive filename",
self.performMigration,
)
def test_filename_exists(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.clashA = make_test_document(
Document,
"clash",
"application/pdf",
simple_pdf,
"clash.pdf",
simple_pdf,
"clash.pdf",
)
self.clashB = make_test_document(
Document,
"clash",
"image/jpeg",
simple_jpg,
"clash.jpg",
simple_pdf,
"clash_01.pdf",
)
self.assertRaisesMessage(ValueError, "file already exists.", self.performMigration)
self.assertRaisesMessage(
ValueError, "file already exists.", self.performMigration
)

View File

@ -19,10 +19,7 @@ def source_path_before(self):
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
def file_type_after(self):
@ -37,30 +34,43 @@ def source_path_after(doc):
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
return os.path.join(settings.ORIGINALS_DIR, fname)
@override_settings(PASSPHRASE="test")
class TestMigrateMimeType(DirectoriesMixin, TestMigrations):
migrate_from = '1002_auto_20201111_1105'
migrate_to = '1003_mime_types'
migrate_from = "1002_auto_20201111_1105"
migrate_to = "1003_mime_types"
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc = Document.objects.create(title="test", file_type="pdf", filename="file1.pdf")
doc = Document.objects.create(
title="test", file_type="pdf", filename="file1.pdf"
)
self.doc_id = doc.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_before(doc))
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
source_path_before(doc),
)
doc2 = Document.objects.create(checksum="B", file_type="pdf", storage_type=STORAGE_TYPE_GPG)
doc2 = Document.objects.create(
checksum="B", file_type="pdf", storage_type=STORAGE_TYPE_GPG
)
self.doc2_id = doc2.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000004.pdf.gpg"), source_path_before(doc2))
shutil.copy(
os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000004.pdf.gpg",
),
source_path_before(doc2),
)
def testMimeTypesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
Document = self.apps.get_model("documents", "Document")
doc = Document.objects.get(id=self.doc_id)
self.assertEqual(doc.mime_type, "application/pdf")
@ -72,17 +82,22 @@ class TestMigrateMimeType(DirectoriesMixin, TestMigrations):
@override_settings(PASSPHRASE="test")
class TestMigrateMimeTypeBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1003_mime_types'
migrate_to = '1002_auto_20201111_1105'
migrate_from = "1003_mime_types"
migrate_to = "1002_auto_20201111_1105"
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc = Document.objects.create(title="test", mime_type="application/pdf", filename="file1.pdf")
doc = Document.objects.create(
title="test", mime_type="application/pdf", filename="file1.pdf"
)
self.doc_id = doc.id
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), source_path_after(doc))
shutil.copy(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
source_path_after(doc),
)
def testMimeTypesReverted(self):
Document = self.apps.get_model('documents', 'Document')
Document = self.apps.get_model("documents", "Document")
doc = Document.objects.get(id=self.doc_id)
self.assertEqual(doc.file_type, "pdf")

View File

@ -3,13 +3,13 @@ from documents.tests.utils import DirectoriesMixin, TestMigrations
class TestMigrateNullCharacters(DirectoriesMixin, TestMigrations):
migrate_from = '1014_auto_20210228_1614'
migrate_to = '1015_remove_null_characters'
migrate_from = "1014_auto_20210228_1614"
migrate_to = "1015_remove_null_characters"
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.doc = Document.objects.create(content="aaa\0bbb")
def testMimeTypesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
Document = self.apps.get_model("documents", "Document")
self.assertNotIn("\0", Document.objects.get(id=self.doc.id).content)

Some files were not shown because too many files have changed in this diff