Merge branch 'dev'
Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-07-28 18:24:38 -05:00)
@@ -5,6 +5,7 @@ import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional

@@ -201,16 +202,25 @@ def scan_file_for_barcodes(
return DocumentBarcodeInfo(pdf_filepath, barcodes)


def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
"""
Search the parsed barcodes for separators
and returns a list of page numbers, which
separate the file into new files.
and returns a dict of page numbers, which
separate the file into new files, together
with the information whether to keep the page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages

return list({bc.page for bc in barcodes if bc.is_separator})
# add the page numbers of the ASN barcodes
# (except for first page, that might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
}
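A minimal sketch (hypothetical values, not taken from the test samples) of the shape the function now returns:

    # Page index -> keep flag: a PATCHT separator page is dropped (False),
    # a page that only carries an ASN barcode is kept (True).
    pages_to_split_on = {2: False, 4: True, 8: True}
    drop_pages = [page for page, keep in pages_to_split_on.items() if not keep]
    assert drop_pages == [2]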
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:

@@ -242,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
return asn

def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
The pages which are defined by the keys in page_numbers
will be removed if the corresponding value is false.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""

@@ -261,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)

# Start with an empty document
current_document: List[Page] = []
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
documents: List[List[Page]] = [current_document]

for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
current_document.append(page)
continue

# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)

documents = [x for x in documents if len(x)]
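A short usage sketch of the rewritten splitting logic, assuming a hypothetical six-page input file; the mapping is what get_separating_barcodes would produce for a separator at index 2 and an ASN barcode at index 4:

    # Index 2 is a separator page and is dropped; index 4 starts a new document
    # and is kept because it only carries an ASN barcode.
    split_docs = barcodes.separate_pages("/tmp/input.pdf", {2: False, 4: True})
    # Expected result: three temporary PDFs containing pages 0-1, page 3, and pages 4-5.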
@@ -312,11 +325,10 @@ def save_to_dir(
Optionally rename the file.
"""
if os.path.isfile(filepath) and os.path.isdir(target_dir):
dst = shutil.copy(filepath, target_dir)
logging.debug(f"saved {str(filepath)} to {str(dst)}")
if newname:
dst_new = os.path.join(target_dir, newname)
logger.debug(f"moving {str(dst)} to {str(dst_new)}")
os.rename(dst, dst_new)
dest = target_dir
if newname is not None:
dest = os.path.join(dest, newname)
shutil.copy(filepath, dest)
logging.debug(f"saved {str(filepath)} to {str(dest)}")
else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
@@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
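The bounds in the range check above come from the ARCHIVE_SERIAL_NUMBER_MIN/MAX constants added to the Document model later in this diff; a standalone sketch of the same test:

    ARCHIVE_SERIAL_NUMBER_MIN = 0
    ARCHIVE_SERIAL_NUMBER_MAX = 0xFF_FF_FF_FF  # 4,294,967,295, the largest value Whoosh can index

    def asn_in_range(asn: int) -> bool:
        # True for any ASN the consumer will accept as an override.
        return ARCHIVE_SERIAL_NUMBER_MIN <= asn <= ARCHIVE_SERIAL_NUMBER_MAX

    assert asn_in_range(0) and asn_in_range(0xFF_FF_FF_FF)
    assert not asn_in_range(0xFF_FF_FF_FF + 1)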
@@ -337,6 +342,7 @@ class Consumer(LoggingMixin):
mime_type,
)
if not parser_class:
tempdir.cleanup()
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")

# Notify all listeners that we're going to do some work.

@@ -395,6 +401,7 @@ class Consumer(LoggingMixin):

except ParseError as e:
document_parser.cleanup()
tempdir.cleanup()
self._fail(
str(e),
f"Error while consuming document {self.filename}: {e}",
@@ -5,6 +5,7 @@ from contextlib import contextmanager

from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from documents.models import Comment
from documents.models import Document
from whoosh import classify

@@ -89,10 +90,22 @@ def open_index_searcher():
searcher.close()


def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
writer.update_document(
id=doc.pk,
title=doc.title,

@@ -108,7 +121,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,

@@ -262,7 +275,7 @@ class DelayedFullTextQuery(DelayedQuery):
["content", "title", "correspondent", "tag", "type", "comments"],
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())
qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
q = qp.parse(q_str)

corrected = self.searcher.correct_query(q, q_str)
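A minimal sketch of why basedate matters in the change above (toy schema, not the project's real index): relative expressions such as "added:[-1 week to now]" are resolved against basedate, so passing an aware "now" anchors the range to the configured timezone instead of a naive default.

    from datetime import datetime, timezone as dt_timezone
    from whoosh.fields import DATETIME, TEXT, Schema
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    schema = Schema(content=TEXT, added=DATETIME)
    parser = QueryParser("content", schema=schema)
    parser.add_plugin(DateParserPlugin(basedate=datetime.now(dt_timezone.utc)))
    print(parser.parse("added:[-1 week to now]"))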
@@ -1,5 +1,6 @@
import logging
import os
from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event

@@ -7,6 +8,7 @@ from threading import Thread
from time import monotonic
from time import sleep
from typing import Final
from typing import Set

from django.conf import settings
from django.core.management.base import BaseCommand

@@ -25,15 +27,15 @@ except ImportError:  # pragma: nocover
logger = logging.getLogger("paperless.management.consumer")


def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
def _tags_from_path(filepath) -> Set[Tag]:
"""
Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.

Returns set of Tag models
"""
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
)
tag_ids = set()
path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
@@ -43,14 +45,41 @@ def _tags_from_path(filepath):


def _is_ignored(filepath: str) -> bool:
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
"""
Checks if the given file should be ignored, based on configured
patterns.

Returns True if the file is ignored, False otherwise
"""
filepath = os.path.abspath(
os.path.normpath(filepath),
)
filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)

# Trim out the consume directory, leaving only the filename and its
# path relative to the consume directory
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)

# March through the components of the path, including directories and the filename
# looking for anything matching
# foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
parts = []
for part in filepath_relative.parts:
# If the part is not the name (ie, it's a dir)
# Need to append the trailing slash or fnmatch doesn't match
# fnmatch("dir", "dir/*") == False
# fnmatch("dir/", "dir/*") == True
if part != filepath_relative.name:
part = part + "/"
parts.append(part)

for pattern in settings.CONSUMER_IGNORE_PATTERNS:
if len(filter(parts, pattern)):
return True

return False
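A tiny demonstration of the trailing-slash detail the comments above describe, using the same fnmatch-based matching the loop relies on (the paths are illustrative):

    from fnmatch import filter as fnmatch_filter, fnmatch

    assert not fnmatch("dir", "dir/*")    # directory component without a slash does not match
    assert fnmatch("dir/", "dir/*")       # with the slash appended it does

    parts = [".DS_STORE/", "foo.pdf"]     # parts of ".DS_STORE/foo.pdf", directories get a slash
    assert len(fnmatch_filter(parts, ".DS_STORE/*")) == 1   # -> the file is ignored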
def _consume(filepath):
def _consume(filepath: str) -> None:
if os.path.isdir(filepath) or _is_ignored(filepath):
return

@@ -103,7 +132,13 @@ def _consume(filepath):
logger.exception("Error while consuming document")


def _consume_wait_unmodified(file):
def _consume_wait_unmodified(file: str) -> None:
"""
Waits for the given file to appear unmodified based on file size
and modification time. Will wait a configured number of seconds
and retry a configured number of times before either consuming or
giving up
"""
if _is_ignored(file):
return

@@ -311,8 +311,8 @@ class Command(BaseCommand):
|
||||
archive_target = None
|
||||
|
||||
# 3.4. write files to target folder
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
if document.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
|
||||
original_target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with document.source_file as out_file:
|
||||
|
@@ -0,0 +1,23 @@
|
||||
# Generated by Django 4.1.5 on 2023-02-03 21:53
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("documents", "1029_alter_document_archive_serial_number"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="paperlesstask",
|
||||
name="task_file_name",
|
||||
field=models.CharField(
|
||||
help_text="Name of the file which the Task was run for",
|
||||
max_length=255,
|
||||
null=True,
|
||||
verbose_name="Task Filename",
|
||||
),
|
||||
),
|
||||
]
|
@@ -3,6 +3,7 @@ import logging
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from typing import Final
|
||||
from typing import Optional
|
||||
|
||||
import dateutil.parser
|
||||
@@ -229,6 +230,9 @@ class Document(models.Model):
|
||||
help_text=_("The original name of the file when it was uploaded"),
|
||||
)
|
||||
|
||||
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
|
||||
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
|
||||
|
||||
archive_serial_number = models.PositiveIntegerField(
|
||||
_("archive serial number"),
|
||||
blank=True,
|
||||
@@ -236,8 +240,8 @@ class Document(models.Model):
|
||||
unique=True,
|
||||
db_index=True,
|
||||
validators=[
|
||||
MaxValueValidator(0xFF_FF_FF_FF),
|
||||
MinValueValidator(0),
|
||||
MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
|
||||
MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
|
||||
],
|
||||
help_text=_(
|
||||
"The position of this document in your physical document " "archive.",
|
||||
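The new validators only take effect when model validation runs (full_clean() or a DRF serializer); a bare save() can still store an out-of-range value, which is exactly what the index-side clamping earlier in this diff guards against. A small standalone sketch of the bound being enforced:

    from django.conf import settings
    from django.core.exceptions import ValidationError
    from django.core.validators import MaxValueValidator

    if not settings.configured:
        settings.configure()   # minimal config so the validator can be exercised standalone

    try:
        MaxValueValidator(0xFF_FF_FF_FF)(0xFF_FF_FF_FF + 1)
    except ValidationError:
        print("ASN above the uint32 maximum is rejected")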
@@ -555,7 +559,7 @@ class PaperlessTask(models.Model):
|
||||
task_file_name = models.CharField(
|
||||
null=True,
|
||||
max_length=255,
|
||||
verbose_name=_("Task Name"),
|
||||
verbose_name=_("Task Filename"),
|
||||
help_text=_("Name of the file which the Task was run for"),
|
||||
)
|
||||
|
||||
|
@@ -599,11 +599,17 @@ class StoragePathSerializer(MatchingModelSerializer):
|
||||
document_type="document_type",
|
||||
created="created",
|
||||
created_year="created_year",
|
||||
created_year_short="created_year_short",
|
||||
created_month="created_month",
|
||||
created_month_name="created_month_name",
|
||||
created_month_name_short="created_month_name_short",
|
||||
created_day="created_day",
|
||||
added="added",
|
||||
added_year="added_year",
|
||||
added_year_short="added_year_short",
|
||||
added_month="added_month",
|
||||
added_month_name="added_month_name",
|
||||
added_month_name_short="added_month_name_short",
|
||||
added_day="added_day",
|
||||
asn="asn",
|
||||
tags="tags",
|
||||
|
@@ -128,6 +128,18 @@ def consume_file(
|
||||
)
|
||||
|
||||
if document_list:
|
||||
|
||||
# If the file is an upload, it's in the scratch directory
|
||||
# Move it to consume directory to be picked up
|
||||
# Otherwise, use the current parent to keep possible tags
|
||||
# from subdirectories
|
||||
try:
|
||||
# is_relative_to would be nicer, but new in 3.9
|
||||
_ = path.relative_to(settings.SCRATCH_DIR)
|
||||
save_to_dir = settings.CONSUMPTION_DIR
|
||||
except ValueError:
|
||||
save_to_dir = path.parent
|
||||
|
||||
for n, document in enumerate(document_list):
|
||||
# save to consumption dir
|
||||
# rename it to the original filename with number prefix
|
||||
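The try/except above is a Python 3.8-compatible stand-in for Path.is_relative_to(); a sketch with hypothetical paths showing the equivalent decision:

    from pathlib import Path

    path = Path("/tmp/paperless/scratch/split-001.pdf")   # hypothetical split document
    scratch_dir = Path("/tmp/paperless/scratch")
    consumption_dir = Path("/tmp/paperless/consume")

    try:
        path.relative_to(scratch_dir)          # raises ValueError if not under scratch
        save_to_dir = consumption_dir          # uploads go back to the consume dir
    except ValueError:
        save_to_dir = path.parent              # otherwise keep directory-tag context
    print(save_to_dir)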
@@ -136,23 +148,18 @@ def consume_file(
|
||||
else:
|
||||
newname = None
|
||||
|
||||
# If the file is an upload, it's in the scratch directory
|
||||
# Move it to consume directory to be picked up
|
||||
# Otherwise, use the current parent to keep possible tags
|
||||
# from subdirectories
|
||||
try:
|
||||
# is_relative_to would be nicer, but new in 3.9
|
||||
_ = path.relative_to(settings.SCRATCH_DIR)
|
||||
save_to_dir = settings.CONSUMPTION_DIR
|
||||
except ValueError:
|
||||
save_to_dir = path.parent
|
||||
|
||||
barcodes.save_to_dir(
|
||||
document,
|
||||
newname=newname,
|
||||
target_dir=save_to_dir,
|
||||
)
|
||||
|
||||
# Split file has been copied safely, remove it
|
||||
os.remove(document)
|
||||
|
||||
# And clean up the directory as well, now it's empty
|
||||
shutil.rmtree(os.path.dirname(document_list[0]))
|
||||
|
||||
# Delete the PDF file which was split
|
||||
os.remove(doc_barcode_info.pdf_path)
|
||||
|
||||
@@ -164,7 +171,7 @@ def consume_file(
|
||||
# notify the sender, otherwise the progress bar
|
||||
# in the UI stays stuck
|
||||
payload = {
|
||||
"filename": override_filename,
|
||||
"filename": override_filename or path.name,
|
||||
"task_id": task_id,
|
||||
"current_progress": 100,
|
||||
"max_progress": 100,
|
||||
|
BIN src/documents/tests/samples/barcodes/split-by-asn-1.pdf (new file; binary file not shown)
BIN src/documents/tests/samples/barcodes/split-by-asn-2.pdf (new file; binary file not shown)
@@ -7,6 +7,7 @@ import tempfile
|
||||
import urllib.request
|
||||
import uuid
|
||||
import zipfile
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
@@ -23,6 +24,7 @@ from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import override_settings
|
||||
from django.utils import timezone
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from documents import bulk_edit
|
||||
from documents import index
|
||||
from documents.models import Correspondent
|
||||
@@ -119,28 +121,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
response = self.client.get("/api/documents/", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results_full = response.data["results"]
|
||||
self.assertTrue("content" in results_full[0])
|
||||
self.assertTrue("id" in results_full[0])
|
||||
self.assertIn("content", results_full[0])
|
||||
self.assertIn("id", results_full[0])
|
||||
|
||||
response = self.client.get("/api/documents/?fields=id", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertFalse("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=content", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertTrue("content" in results[0])
|
||||
self.assertIn("content", results[0])
|
||||
self.assertFalse("id" in results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=id,content", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertTrue("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("content", results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 2)
|
||||
|
||||
response = self.client.get(
|
||||
@@ -150,7 +152,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertFalse("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=", format="json")
|
||||
@@ -505,6 +507,270 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
|
||||
self.assertEqual(response.status_code, 404)
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="UTC",
|
||||
)
|
||||
def test_search_added_in_last_week(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Three documents added right now
|
||||
- The timezone is UTC time
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- All three recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
# Expect 3 documents returned
|
||||
self.assertEqual(len(results), 3)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Chicago",
|
||||
)
|
||||
def test_search_added_in_last_week_with_timezone_behind(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Two documents added right now
|
||||
- One document added over a week ago
|
||||
- The timezone is behind UTC time (-6)
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="Europe/Sofia",
|
||||
)
|
||||
def test_search_added_in_last_week_with_timezone_ahead(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Two documents added right now
|
||||
- One document added over a week ago
|
||||
- The timezone is ahead of UTC time (+2)
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
def test_search_added_in_last_month(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- One document added right now
|
||||
- One document added about a week ago
- One document added over 1 month ago
|
||||
WHEN:
|
||||
- Query for documents added in the last month
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
# 1 month, 1 day ago
|
||||
added=timezone.now() - relativedelta(months=1, days=1),
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Denver",
|
||||
)
|
||||
def test_search_added_in_last_month_timezone_behind(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- One document added right now
|
||||
- One document added about a week ago
- One document added over 1 month ago
|
||||
- The timezone is behind UTC time (-6 or -7)
|
||||
WHEN:
|
||||
- Query for documents added in the last month
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
# 1 month, 1 day ago
|
||||
added=timezone.now() - relativedelta(months=1, days=1),
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@mock.patch("documents.index.autocomplete")
|
||||
def test_search_autocomplete(self, m):
|
||||
m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
|
||||
@@ -2933,8 +3199,32 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
self.assertEqual(StoragePath.objects.count(), 1)
|
||||
|
||||
def test_api_storage_path_placeholders(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- API request to create a storage path with placeholders
|
||||
- Storage path is valid
|
||||
WHEN:
|
||||
- API is called
|
||||
THEN:
|
||||
- Correct HTTP response
|
||||
- New storage path is created
|
||||
"""
|
||||
response = self.client.post(
|
||||
self.ENDPOINT,
|
||||
json.dumps(
|
||||
{
|
||||
"name": "Storage path with placeholders",
|
||||
"path": "{title}/{correspondent}/{document_type}/{created}/{created_year}/{created_year_short}/{created_month}/{created_month_name}/{created_month_name_short}/{created_day}/{added}/{added_year}/{added_year_short}/{added_month}/{added_month_name}/{added_month_name_short}/{added_day}/{asn}/{tags}/{tag_list}/",
|
||||
},
|
||||
),
|
||||
content_type="application/json",
|
||||
)
|
||||
self.assertEqual(response.status_code, 201)
|
||||
self.assertEqual(StoragePath.objects.count(), 2)
|
||||
|
||||
class TestTasks(APITestCase):
|
||||
|
||||
class TestTasks(DirectoriesMixin, APITestCase):
|
||||
ENDPOINT = "/api/tasks/"
|
||||
ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"
|
||||
|
||||
|
@@ -294,7 +294,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_none_present(self):
|
||||
"""
|
||||
@@ -314,7 +314,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_middle_page(self):
|
||||
"""
|
||||
@@ -337,7 +337,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_multiple_pages(self):
|
||||
"""
|
||||
@@ -360,7 +360,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [2, 5])
|
||||
self.assertDictEqual(separator_page_numbers, {2: False, 5: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_upside_down(self):
|
||||
"""
|
||||
@@ -384,7 +384,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_fax_decode(self):
|
||||
"""
|
||||
@@ -407,7 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_qr_barcodes(self):
|
||||
"""
|
||||
@@ -431,7 +431,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_barcodes(self):
|
||||
@@ -456,7 +456,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_qr_barcodes(self):
|
||||
@@ -482,7 +482,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_128_barcodes(self):
|
||||
@@ -508,7 +508,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
def test_scan_file_for_separating_wrong_qr_barcodes(self):
|
||||
"""
|
||||
@@ -533,7 +533,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
|
||||
def test_scan_file_for_separating_qr_barcodes(self):
|
||||
@@ -558,7 +558,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertGreater(len(doc_barcode_info.barcodes), 0)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_separate_pages(self):
|
||||
"""
|
||||
@@ -573,7 +573,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"patch-code-t-middle.pdf",
|
||||
)
|
||||
documents = barcodes.separate_pages(test_file, [1])
|
||||
documents = barcodes.separate_pages(test_file, {1: False})
|
||||
|
||||
self.assertEqual(len(documents), 2)
|
||||
|
||||
@@ -591,7 +591,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"patch-code-t-double.pdf",
|
||||
)
|
||||
pages = barcodes.separate_pages(test_file, [1, 2])
|
||||
pages = barcodes.separate_pages(test_file, {1: False, 2: False})
|
||||
|
||||
self.assertEqual(len(pages), 2)
|
||||
|
||||
@@ -610,7 +610,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
"patch-code-t-middle.pdf",
|
||||
)
|
||||
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
|
||||
pages = barcodes.separate_pages(test_file, [])
|
||||
pages = barcodes.separate_pages(test_file, {})
|
||||
self.assertEqual(pages, [])
|
||||
self.assertEqual(
|
||||
cm.output,
|
||||
@@ -858,7 +858,88 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
@override_settings(
|
||||
CONSUMER_ENABLE_BARCODES=True,
|
||||
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||
)
|
||||
def test_separate_pages_by_asn_barcodes_and_patcht(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11
|
||||
WHEN:
|
||||
- Input file is split on barcodes
|
||||
THEN:
|
||||
- Correct number of files produced, split correctly by correct pages
|
||||
"""
|
||||
test_file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"split-by-asn-2.pdf",
|
||||
)
|
||||
|
||||
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||
test_file,
|
||||
)
|
||||
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||
doc_barcode_info.barcodes,
|
||||
)
|
||||
|
||||
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||
self.assertDictEqual(
|
||||
separator_page_numbers,
|
||||
{
|
||||
2: False,
|
||||
4: True,
|
||||
5: True,
|
||||
8: True,
|
||||
10: True,
|
||||
},
|
||||
)
|
||||
|
||||
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||
self.assertEqual(len(document_list), 6)
|
||||
|
||||
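A quick check of how the expected mapping follows from the docstring above: pages are 1-based in the description but 0-based in the dict, and an ASN barcode on the very first page is deliberately skipped:

    asn_pages = [1, 5, 6, 9, 11]       # from the test description above
    patch_pages = [3]
    expected = {p - 1: False for p in patch_pages}
    expected.update({p - 1: True for p in asn_pages if p - 1 != 0})
    assert expected == {2: False, 4: True, 5: True, 8: True, 10: True}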
@override_settings(
|
||||
CONSUMER_ENABLE_BARCODES=True,
|
||||
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||
)
|
||||
def test_separate_pages_by_asn_barcodes(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Input PDF with ASN barcodes on pages 1,3,4,7,9
|
||||
WHEN:
|
||||
- Input file is split on barcodes
|
||||
THEN:
|
||||
- Correct number of files produced, split correctly by correct pages
|
||||
"""
|
||||
test_file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"split-by-asn-1.pdf",
|
||||
)
|
||||
|
||||
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||
test_file,
|
||||
)
|
||||
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||
doc_barcode_info.barcodes,
|
||||
)
|
||||
|
||||
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||
self.assertDictEqual(
|
||||
separator_page_numbers,
|
||||
{
|
||||
2: True,
|
||||
3: True,
|
||||
6: True,
|
||||
8: True,
|
||||
},
|
||||
)
|
||||
|
||||
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||
self.assertEqual(len(document_list), 5)
|
||||
|
||||
|
||||
class TestAsnBarcodes(DirectoriesMixin, TestCase):
|
||||
|
@@ -847,13 +847,11 @@ class PreConsumeTestCase(TestCase):
|
||||
self.assertEqual(command[0], script.name)
|
||||
self.assertEqual(command[1], "path-to-file")
|
||||
|
||||
self.assertDictContainsSubset(
|
||||
{
|
||||
"DOCUMENT_SOURCE_PATH": c.original_path,
|
||||
"DOCUMENT_WORKING_PATH": c.path,
|
||||
},
|
||||
environment,
|
||||
)
|
||||
subset = {
|
||||
"DOCUMENT_SOURCE_PATH": c.original_path,
|
||||
"DOCUMENT_WORKING_PATH": c.path,
|
||||
}
|
||||
self.assertDictEqual(environment, {**environment, **subset})
|
||||
|
||||
@mock.patch("documents.consumer.Consumer.log")
|
||||
def test_script_with_output(self, mocked_log):
|
||||
@@ -983,16 +981,15 @@ class PostConsumeTestCase(TestCase):
|
||||
self.assertEqual(command[7], "my_bank")
|
||||
self.assertCountEqual(command[8].split(","), ["a", "b"])
|
||||
|
||||
self.assertDictContainsSubset(
|
||||
{
|
||||
"DOCUMENT_ID": str(doc.pk),
|
||||
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
|
||||
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
|
||||
"DOCUMENT_CORRESPONDENT": "my_bank",
|
||||
"DOCUMENT_TAGS": "a,b",
|
||||
},
|
||||
environment,
|
||||
)
|
||||
subset = {
|
||||
"DOCUMENT_ID": str(doc.pk),
|
||||
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
|
||||
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
|
||||
"DOCUMENT_CORRESPONDENT": "my_bank",
|
||||
"DOCUMENT_TAGS": "a,b",
|
||||
}
|
||||
|
||||
self.assertDictEqual(environment, {**environment, **subset})
|
||||
|
||||
def test_script_exit_non_zero(self):
|
||||
"""
|
||||
|
@@ -25,7 +25,7 @@ class TestImporter(TestCase):
|
||||
cmd.manifest = [{"model": "documents.document"}]
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue("The manifest file contains a record" in str(cm.exception))
|
||||
self.assertIn("The manifest file contains a record", str(cm.exception))
|
||||
|
||||
cmd.manifest = [
|
||||
{"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
|
||||
@@ -33,6 +33,7 @@ class TestImporter(TestCase):
|
||||
# self.assertRaises(CommandError, cmd._check_manifest)
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue(
|
||||
'The manifest file refers to "noexist.pdf"' in str(cm.exception),
|
||||
self.assertIn(
|
||||
'The manifest file refers to "noexist.pdf"',
|
||||
str(cm.exception),
|
||||
)
|
||||
|
@@ -1,3 +1,5 @@
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from documents import index
|
||||
from documents.models import Document
|
||||
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
|
||||
)
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
|
||||
|
||||
def test_archive_serial_number_ranging(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with an archive serial number above schema allowed size
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- Error is logged
|
||||
- Document ASN is reset to 0 for the index
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
# yes, this is allowed, unless full_clean is run
|
||||
# DRF does call the validators, this test won't
|
||||
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
|
||||
)
|
||||
with self.assertLogs("paperless.index", level="ERROR") as cm:
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertEqual(kwargs["asn"], 0)
|
||||
|
||||
error_str = cm.output[0]
|
||||
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
|
||||
self.assertIn(expected_str, error_str)
|
||||
|
||||
def test_archive_serial_number_is_none(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with no archive serial number
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- ASN isn't touched
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
)
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertIsNone(kwargs["asn"])
|
||||
|
@@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
||||
|
||||
def test_is_ignored(self):
|
||||
test_paths = [
|
||||
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
|
||||
(os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
|
||||
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
|
||||
(
|
||||
os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
|
||||
True,
|
||||
),
|
||||
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
|
||||
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
|
||||
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
"foo",
|
||||
".DS_STORE",
|
||||
"bar.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
".DS_STORE",
|
||||
"foo",
|
||||
"bar.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
".stversions",
|
||||
"foo.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
"@eaDir",
|
||||
"SYNO@.fileindexdb",
|
||||
"_1jk.fnm",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
]
|
||||
for file_path, expected_ignored in test_paths:
|
||||
for test_setup in test_paths:
|
||||
filepath = test_setup["path"]
|
||||
expected_ignored_result = test_setup["ignore"]
|
||||
self.assertEqual(
|
||||
expected_ignored,
|
||||
document_consumer._is_ignored(file_path),
|
||||
f'_is_ignored("{file_path}") != {expected_ignored}',
|
||||
expected_ignored_result,
|
||||
document_consumer._is_ignored(filepath),
|
||||
f'_is_ignored("{filepath}") != {expected_ignored_result}',
|
||||
)
|
||||
|
||||
@mock.patch("documents.management.commands.document_consumer.open")
|
||||
|
@@ -1,6 +1,8 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import override_settings
|
||||
from django.test import TestCase
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
@@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase):
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
def test_file_extensions(self):
|
||||
|
||||
def test_tesseract_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The Tesseract based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("application/pdf", ".pdf"),
|
||||
("image/png", ".png"),
|
||||
("image/jpeg", ".jpg"),
|
||||
("image/tiff", ".tif"),
|
||||
("image/webp", ".webp"),
|
||||
]
|
||||
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
def test_text_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types of a text form
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The text based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("text/plain", ".txt"),
|
||||
("text/csv", ".csv"),
|
||||
]
|
||||
@@ -141,23 +173,55 @@ class TestParserAvailability(TestCase):
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
def test_tika_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types of an office document form
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The Tika/Gotenberg based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("application/vnd.oasis.opendocument.text", ".odt"),
|
||||
("text/rtf", ".rtf"),
|
||||
("application/msword", ".doc"),
|
||||
(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
".docx",
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self):
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
def test_default_extension(self):
|
||||
# Test no parser declared still returns an extension
|
||||
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
|
||||
|
||||
# Test invalid mimetype returns no extension
|
||||
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
|
||||
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type("application/pdf")(logging_group=None),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type("text/plain")(logging_group=None),
|
||||
TextDocumentParser,
|
||||
)
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
def test_file_extension_support(self):
|
||||
self.assertTrue(is_file_ext_supported(".pdf"))
|
||||
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
||||
self.assertFalse(is_file_ext_supported(""))
|
||||
|
@@ -3,7 +3,7 @@ msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2022-11-09 21:50+0000\n"
|
||||
"PO-Revision-Date: 2023-01-23 12:37\n"
|
||||
"PO-Revision-Date: 2023-01-27 19:22\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: Dutch\n"
|
||||
"Language: nl_NL\n"
|
||||
@@ -368,15 +368,15 @@ msgstr "heeft tags in"
|
||||
|
||||
#: documents/models.py:410
|
||||
msgid "ASN greater than"
|
||||
msgstr ""
|
||||
msgstr "ASN groter dan"
|
||||
|
||||
#: documents/models.py:411
|
||||
msgid "ASN less than"
|
||||
msgstr ""
|
||||
msgstr "ASN kleiner dan"
|
||||
|
||||
#: documents/models.py:412
|
||||
msgid "storage path is"
|
||||
msgstr ""
|
||||
msgstr "opslagpad is"
|
||||
|
||||
#: documents/models.py:422
|
||||
msgid "rule type"
|
||||
@@ -396,99 +396,99 @@ msgstr "filterregels"
|
||||
|
||||
#: documents/models.py:536
|
||||
msgid "Task ID"
|
||||
msgstr ""
|
||||
msgstr "Taak ID"
|
||||
|
||||
#: documents/models.py:537
|
||||
msgid "Celery ID for the Task that was run"
|
||||
msgstr ""
|
||||
msgstr "Celery ID voor de taak die werd uitgevoerd"
|
||||
|
||||
#: documents/models.py:542
|
||||
msgid "Acknowledged"
|
||||
msgstr ""
|
||||
msgstr "Bevestigd"
|
||||
|
||||
#: documents/models.py:543
|
||||
msgid "If the task is acknowledged via the frontend or API"
|
||||
msgstr ""
|
||||
msgstr "Of de taak is bevestigd via de frontend of de API"
|
||||
|
||||
#: documents/models.py:549 documents/models.py:556
|
||||
msgid "Task Name"
|
||||
msgstr ""
|
||||
msgstr "Taaknaam"
|
||||
|
||||
#: documents/models.py:550
|
||||
msgid "Name of the file which the Task was run for"
|
||||
msgstr ""
|
||||
msgstr "Naam van het bestand waarvoor de taak werd uitgevoerd"
|
||||
|
||||
#: documents/models.py:557
|
||||
msgid "Name of the Task which was run"
|
||||
msgstr ""
|
||||
msgstr "Naam van de uitgevoerde taak"
|
||||
|
||||
#: documents/models.py:562
|
||||
msgid "Task Positional Arguments"
|
||||
msgstr ""
|
||||
msgstr "Positionele argumenten voor taak"
|
||||
|
||||
#: documents/models.py:564
|
||||
msgid "JSON representation of the positional arguments used with the task"
|
||||
msgstr ""
|
||||
msgstr "JSON weergave van de positionele argumenten die gebruikt worden voor de taak"
|
||||
|
||||
#: documents/models.py:569
|
||||
msgid "Task Named Arguments"
|
||||
msgstr ""
|
||||
msgstr "Argumenten met naam voor taak"
|
||||
|
||||
#: documents/models.py:571
|
||||
msgid "JSON representation of the named arguments used with the task"
|
||||
msgstr ""
|
||||
msgstr "JSON weergave van de argumenten met naam die gebruikt worden voor de taak"
|
||||
|
||||
#: documents/models.py:578
|
||||
msgid "Task State"
|
||||
msgstr ""
|
||||
msgstr "Taakstatus"
|
||||
|
||||
#: documents/models.py:579
|
||||
msgid "Current state of the task being run"
|
||||
msgstr ""
|
||||
msgstr "Huidige status van de taak die wordt uitgevoerd"
|
||||
|
||||
#: documents/models.py:584
|
||||
msgid "Created DateTime"
|
||||
msgstr ""
|
||||
msgstr "Aangemaakt DateTime"
|
||||
|
||||
#: documents/models.py:585
|
||||
msgid "Datetime field when the task result was created in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer het resultaat van de taak werd aangemaakt in UTC"
|
||||
|
||||
#: documents/models.py:590
|
||||
msgid "Started DateTime"
|
||||
msgstr ""
|
||||
msgstr "Gestart DateTime"
|
||||
|
||||
#: documents/models.py:591
|
||||
msgid "Datetime field when the task was started in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer de taak werd gestart in UTC"
|
||||
|
||||
#: documents/models.py:596
|
||||
msgid "Completed DateTime"
|
||||
msgstr ""
|
||||
msgstr "Voltooid DateTime"
|
||||
|
||||
#: documents/models.py:597
|
||||
msgid "Datetime field when the task was completed in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer de taak werd voltooid in UTC"
|
||||
|
||||
#: documents/models.py:602
|
||||
msgid "Result Data"
|
||||
msgstr ""
|
||||
msgstr "Resultaatgegevens"
|
||||
|
||||
#: documents/models.py:604
|
||||
msgid "The data returned by the task"
|
||||
msgstr ""
|
||||
msgstr "Gegevens geretourneerd door de taak"
|
||||
|
||||
#: documents/models.py:613
|
||||
msgid "Comment for the document"
|
||||
msgstr ""
|
||||
msgstr "Commentaar op het document"
|
||||
|
||||
#: documents/models.py:642
|
||||
msgid "comment"
|
||||
msgstr ""
|
||||
msgstr "opmerking"
|
||||
|
||||
#: documents/models.py:643
|
||||
msgid "comments"
|
||||
msgstr ""
|
||||
msgstr "opmerkingen"
|
||||
|
||||
#: documents/serialisers.py:72
|
||||
#, python-format
|
||||
|

@@ -109,6 +109,16 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:


def _parse_beat_schedule() -> Dict:
    """
    Configures the scheduled tasks, according to default or
    environment variables. Task expiration is configured so the task will
    expire (and not run), shortly before the default frequency will put another
    of the same task into the queue


    https://docs.celeryq.dev/en/stable/userguide/periodic-tasks.html#beat-entries
    https://docs.celeryq.dev/en/latest/userguide/calling.html#expiration
    """
    schedule = {}
    tasks = [
        {
@@ -117,6 +127,11 @@ def _parse_beat_schedule() -> Dict:
            # Default every ten minutes
            "env_default": "*/10 * * * *",
            "task": "paperless_mail.tasks.process_mail_accounts",
            "options": {
                # 1 minute before default schedule sends again
                "expires": 9.0
                * 60.0,
            },
        },
        {
            "name": "Train the classifier",
@@ -124,6 +139,11 @@ def _parse_beat_schedule() -> Dict:
            # Default hourly at 5 minutes past the hour
            "env_default": "5 */1 * * *",
            "task": "documents.tasks.train_classifier",
            "options": {
                # 1 minute before default schedule sends again
                "expires": 59.0
                * 60.0,
            },
        },
        {
            "name": "Optimize the index",
@@ -131,6 +151,12 @@ def _parse_beat_schedule() -> Dict:
            # Default daily at midnight
            "env_default": "0 0 * * *",
            "task": "documents.tasks.index_optimize",
            "options": {
                # 1 hour before default schedule sends again
                "expires": 23.0
                * 60.0
                * 60.0,
            },
        },
        {
            "name": "Perform sanity check",
@@ -138,6 +164,12 @@ def _parse_beat_schedule() -> Dict:
            # Default Sunday at 00:30
            "env_default": "30 0 * * sun",
            "task": "documents.tasks.sanity_check",
            "options": {
                # 1 hour before default schedule sends again
                "expires": ((7.0 * 24.0) - 1.0)
                * 60.0
                * 60.0,
            },
        },
    ]
    for task in tasks:
@@ -151,9 +183,11 @@ def _parse_beat_schedule() -> Dict:
        # - five time-and-date fields
        # - separated by at least one blank
        minute, hour, day_month, month, day_week = value.split(" ")

        schedule[task["name"]] = {
            "task": task["task"],
            "schedule": crontab(minute, hour, day_week, day_month, month),
            "options": task["options"],
        }

    return schedule
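
The hunk above builds the Celery beat schedule from cron-style strings: each default (or environment override) is split into the five standard cron fields, handed to celery.schedules.crontab, and given an "expires" just under the task's default period so a run that never started is dropped rather than executed late. A minimal sketch of one entry, assuming the same field order as above; the variable names are illustrative only:

from celery.schedules import crontab

env_default = "*/10 * * * *"  # mail check: every ten minutes by default
minute, hour, day_month, month, day_week = env_default.split(" ")

entry = {
    "task": "paperless_mail.tasks.process_mail_accounts",
    # crontab() positional order: minute, hour, day_of_week, day_of_month, month_of_year
    "schedule": crontab(minute, hour, day_week, day_month, month),
    # expire one minute before the next run is queued: 9.0 * 60.0 = 540 seconds
    "options": {"expires": 9.0 * 60.0},
}

With the ten-minute default, a queued mail check that has not started within nine minutes simply expires, so overlapping runs are avoided.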

@@ -263,6 +297,10 @@ MIDDLEWARE = [
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

# Optional to enable compression
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"):  # pragma: nocover
    MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")

ROOT_URLCONF = "paperless.urls"

FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")

@@ -280,7 +318,6 @@ _CELERY_REDIS_URL, _CHANNELS_REDIS_URL = _parse_redis_url(
    os.getenv("PAPERLESS_REDIS", None),
)

# TODO: what is this used for?
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",

@@ -561,22 +598,21 @@ LOGGING = {
# Task queue #
###############################################################################

TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)

WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
# https://docs.celeryq.dev/en/stable/userguide/configuration.html

CELERY_BROKER_URL = _CELERY_REDIS_URL
CELERY_TIMEZONE = TIME_ZONE

CELERY_WORKER_HIJACK_ROOT_LOGGER = False
CELERY_WORKER_CONCURRENCY = TASK_WORKERS
CELERY_WORKER_CONCURRENCY: Final[int] = __get_int("PAPERLESS_TASK_WORKERS", 1)
TASK_WORKERS = CELERY_WORKER_CONCURRENCY
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
CELERY_WORKER_SEND_TASK_EVENTS = True

CELERY_TASK_SEND_SENT_EVENT = True
CELERY_SEND_TASK_SENT_EVENT = True

CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)

CELERY_RESULT_EXTENDED = True
CELERY_RESULT_BACKEND = "django-db"

@@ -608,7 +644,7 @@ def default_threads_per_worker(task_workers) -> int:

THREADS_PER_WORKER = os.getenv(
    "PAPERLESS_THREADS_PER_WORKER",
    default_threads_per_worker(TASK_WORKERS),
    default_threads_per_worker(CELERY_WORKER_CONCURRENCY),
)

###############################################################################

@@ -637,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
    json.loads(
        os.getenv(
            "PAPERLESS_CONSUMER_IGNORE_PATTERNS",
            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]',  # noqa: E501
            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]',  # noqa: E501
        ),
    ),
)
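
An aside on the hunk above (illustrative, not part of the commit): PAPERLESS_CONSUMER_IGNORE_PATTERNS is parsed as a JSON array of shell-style patterns, and the change only appends "@eaDir/*" to the default list so Synology metadata directories are skipped out of the box. A minimal sketch of how an override would be read, assuming fnmatch-style matching of consumption-directory paths; the matching check here is hypothetical:

import json
import os
from fnmatch import fnmatch

# Same parsing as the setting above: the value must be a JSON list of patterns.
os.environ["PAPERLESS_CONSUMER_IGNORE_PATTERNS"] = '["desktop.ini", "@eaDir/*"]'
ignore_patterns = json.loads(os.environ["PAPERLESS_CONSUMER_IGNORE_PATTERNS"])

# Hypothetical check of an incoming path against the configured patterns.
relative_path = "@eaDir/SYNO_THUMB_M.jpg"
ignored = any(fnmatch(relative_path, pattern) for pattern in ignore_patterns)
print(ignored)  # True: the Synology thumbnail directory is not consumed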

@@ -149,6 +149,11 @@ class TestRedisSocketConversion(TestCase):


class TestCeleryScheduleParsing(TestCase):
    MAIL_EXPIRE_TIME = 9.0 * 60.0
    CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0
    INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0
    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0

    def test_schedule_configuration_default(self):
        """
        GIVEN:
@@ -165,18 +170,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -203,18 +212,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/50", day_of_week="mon"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -238,14 +251,17 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
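
A side note on the tests above (illustrative, not part of the diff): the second expected block corresponds to overriding the mail task's cron expression before the schedule is parsed. A rough sketch of that kind of setup; the environment key name PAPERLESS_EMAIL_TASK_CRON is an assumption here, since it is not visible in this excerpt:

import os
from unittest import mock

from paperless.settings import _parse_beat_schedule

# Hypothetical override of the mail-check schedule to every 50 minutes on Mondays.
with mock.patch.dict(os.environ, {"PAPERLESS_EMAIL_TASK_CRON": "*/50 * * * mon"}):
    schedule = _parse_beat_schedule()

# The "Check all e-mail accounts" entry should then carry
# crontab(minute="*/50", day_of_week="mon"), as asserted in the second test.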

@@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = {
}


@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
class TestWebSockets(TestCase):
    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    async def test_no_auth(self):
        communicator = WebsocketCommunicator(application, "/ws/status/")
        connected, subprotocol = await communicator.connect()
        self.assertFalse(connected)
        await communicator.disconnect()

    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_auth(self, _authenticated):
        _authenticated.return_value = True
@@ -33,7 +32,6 @@ class TestWebSockets(TestCase):

        await communicator.disconnect()

    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_receive(self, _authenticated):
        _authenticated.return_value = True

@@ -12,7 +12,7 @@ class StandardPagination(PageNumberPagination):


class FaviconView(View):
    def get(self, request, *args, **kwargs):
    def get(self, request, *args, **kwargs):  # pragma: nocover
        favicon = os.path.join(
            os.path.dirname(__file__),
            "static",

@@ -2,12 +2,13 @@ from django.contrib.auth.models import User
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
from rest_framework.test import APITestCase


class TestAPIMailAccounts(APITestCase):
class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_accounts/"

    def setUp(self):
@@ -165,7 +166,7 @@ class TestAPIMailAccounts(APITestCase):
        self.assertEqual(returned_account2.password, "123xyz")


class TestAPIMailRules(APITestCase):
class TestAPIMailRules(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_rules/"

    def setUp(self):

@@ -67,11 +67,6 @@ class TestParserLive(TestCase):

        return result

    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
        """
@@ -204,11 +199,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_mail(self):
        """
        GIVEN:
@@ -301,11 +291,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_html(self):
        """
        GIVEN:

@@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser):

        except Exception:
            # TODO catch all for various issues with PDFminer.six.
            # If PDFminer fails, fall back to OCR.
            # If pdftotext fails, fall back to OCR.
            self.log(
                "warning",
                "Error while getting text from PDF document with " "pdfminer.six",

@@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
        self.assertFalse("page 3" in parser.get_text().lower())
        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
@@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
        self.assertFalse("page 2" in parser.get_text().lower())
        self.assertFalse("page 3" in parser.get_text().lower())
        self.assertNotIn("page 2", parser.get_text().lower())
        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
@@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase):
        params = parser.construct_ocrmypdf_parameters("", "", "", "")
        self.assertNotIn("deskew", params)

        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertIn("max_image_mpixels", params)
            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)

        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertNotIn("max_image_mpixels", params)

    def test_rtl_language_detection(self):
        """
        GIVEN:

@@ -90,7 +90,7 @@ class TikaDocumentParser(DocumentParser):
        with open(document_path, "rb") as document_handle:
            files = {
                "files": (
                    file_name or os.path.basename(document_path),
                    "convert" + os.path.splitext(document_path)[-1],
                    document_handle,
                ),
            }
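
An illustrative aside (not part of the diff): the tuple above is the (filename, file object) pair that requests sends as a multipart upload, so after this change the conversion service always receives a neutral name such as "convert.odt", with the original extension kept for format detection. A rough sketch of the surrounding call, assuming Gotenberg's LibreOffice conversion route; the server address and route are assumptions for illustration:

import os

import requests

gotenberg_server = "http://localhost:3000"   # assumed address
document_path = "/tmp/Jahresabrechnung 2022.odt"

with open(document_path, "rb") as document_handle:
    files = {
        "files": (
            # neutral filename, original extension preserved for conversion
            "convert" + os.path.splitext(document_path)[-1],
            document_handle,
        ),
    }
    # assumed conversion endpoint on a Gotenberg 7 server
    response = requests.post(gotenberg_server + "/forms/libreoffice/convert", files=files)
    response.raise_for_status()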

@@ -3,7 +3,9 @@ import os
from pathlib import Path
from unittest import mock

from django.test import override_settings
from django.test import TestCase
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from requests import Response

@@ -54,3 +56,63 @@ class TestTikaParser(TestCase):

        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
        self.assertTrue("Some-key" in [m["key"] for m in metadata])

    @mock.patch("paperless_tika.parsers.parser.from_file")
    @mock.patch("paperless_tika.parsers.requests.post")
    def test_convert_failure(self, post, from_file):
        """
        GIVEN:
            - Document needs to be converted to PDF
        WHEN:
            - Gotenberg server returns an error
        THEN:
            - Parse error is raised
        """
        from_file.return_value = {
            "content": "the content",
            "metadata": {"Creation-Date": "2020-11-21"},
        }
        response = Response()
        response._content = b"PDF document"
        response.status_code = 500
        post.return_value = response

        file = os.path.join(self.parser.tempdir, "input.odt")
        Path(file).touch()

        with self.assertRaises(ParseError):
            self.parser.convert_to_pdf(file, None)

    @mock.patch("paperless_tika.parsers.requests.post")
    def test_request_pdf_a_format(self, post: mock.Mock):
        """
        GIVEN:
            - Document needs to be converted to PDF
        WHEN:
            - Specific PDF/A format requested
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
        file = os.path.join(self.parser.tempdir, "input.odt")
        Path(file).touch()

        response = Response()
        response._content = b"PDF document"
        response.status_code = 200
        post.return_value = response

        for setting, expected_key in [
            ("pdfa", "PDF/A-2b"),
            ("pdfa-2", "PDF/A-2b"),
            ("pdfa-1", "PDF/A-1a"),
            ("pdfa-3", "PDF/A-3b"),
        ]:
            with override_settings(OCR_OUTPUT_TYPE=setting):
                self.parser.convert_to_pdf(file, None)

                post.assert_called_once()
                _, kwargs = post.call_args

                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)

                post.reset_mock()

@@ -7,7 +7,7 @@ max-line-length = 88

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --numprocesses auto --quiet
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet
env =
    PAPERLESS_DISABLE_DBHANDLER=true