Merge branch 'dev' into feature-permissions

shamoon
2023-02-03 14:23:50 -08:00
36 changed files with 2538 additions and 14911 deletions

View File

@@ -4,18 +4,17 @@ import shutil
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from math import ceil
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -154,52 +153,15 @@ def scan_file_for_barcodes(
(page_number, barcode_text) tuples
"""
def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
# Scale the image down
# See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
# TLDR: zbar has issues with larger images
width, height = pillow_img.size
if width > 1024:
scaler = ceil(width / 1024)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
width, height = pillow_img.size
if height > 2048:
scaler = ceil(height / 2048)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
for barcode_value in barcode_reader(pillow_img):
detected_barcodes.append(Barcode(page_num, barcode_value))
return detected_barcodes
def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
pages_from_path = convert_from_path(
pdf_filepath,
dpi=300,
output_folder=path,
)
for current_page_number, page in enumerate(pages_from_path):
for barcode_value in barcode_reader(page):
detected_barcodes.append(
@@ -219,27 +181,19 @@ def scan_file_for_barcodes(
# Always try pikepdf first, it's usually fine, faster and
# uses less memory
try:
barcodes = _pikepdf_barcode_scan(pdf_filepath)
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# Password protected files can't be checked
except PasswordError as e:
# This is the exception raised for those
except PDFPageCountError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
# Handle pikepdf related image decoding issues with a fallback to page
# by page conversion to images in a temporary directory
except Exception as e:
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Falling back to pdf2image because: {e}",
f"Exception during barcode scanning: {e}",
)
try:
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
@@ -248,16 +202,25 @@ def scan_file_for_barcodes(
return DocumentBarcodeInfo(pdf_filepath, barcodes)
def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
"""
Searches the parsed barcodes for separators
and returns a list of page numbers, which
separate the file into new files.
and returns a dict of page numbers, which
separate the file into new files, together
with a flag indicating whether to keep each page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages
return list({bc.page for bc in barcodes if bc.is_separator})
# add the page numbers of the ASN barcodes
# (except for the first page, which might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
}
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@@ -289,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
return asn
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
The pages defined by the keys of pages_to_split_on
will be removed if the corresponding value is False.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
@@ -308,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
# Start with an empty document
current_document: List[Page] = []
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
documents: List[List[Page]] = [current_document]
for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
current_document.append(page)
continue
# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)
documents = [x for x in documents if len(x)]
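A minimal standalone sketch of the new split contract, using plain page-number lists instead of Barcode objects (split_plan is a hypothetical helper, not part of the commit): separator sheets map to False and are dropped, ASN pages map to True and are kept, and page 0 never starts a split.
# Hedged sketch of get_separating_barcodes' dict contract.
from typing import Dict, List
def split_plan(separator_pages: List[int], asn_pages: List[int]) -> Dict[int, bool]:
    plan = {page: False for page in separator_pages}  # separator sheets: drop
    # ASN pages start a new document but are kept; page 0 is excluded,
    # mirroring the "infinite loops" guard above.
    plan.update({page: True for page in asn_pages if page != 0})
    return plan
assert split_plan([2, 5], [0, 5, 7]) == {2: False, 5: True, 7: True}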

View File

@@ -1,7 +1,10 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import Optional
@@ -95,7 +98,8 @@ class Consumer(LoggingMixin):
def __init__(self):
super().__init__()
self.path = None
self.path: Optional[Path] = None
self.original_path: Optional[Path] = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
@@ -144,11 +148,16 @@ class Consumer(LoggingMixin):
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
@@ -169,16 +178,18 @@ class Consumer(LoggingMixin):
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
filepath_arg = os.path.normpath(self.path)
working_file_path = str(self.path)
original_file_path = str(self.original_path)
script_env = os.environ.copy()
script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
script_env["DOCUMENT_WORKING_PATH"] = working_file_path
try:
completed_proc = run(
args=[
settings.PRE_CONSUME_SCRIPT,
filepath_arg,
original_file_path,
],
env=script_env,
capture_output=True,
@@ -197,7 +208,7 @@ class Consumer(LoggingMixin):
exception=e,
)
def run_post_consume_script(self, document):
def run_post_consume_script(self, document: Document):
if not settings.POST_CONSUME_SCRIPT:
return
@@ -288,8 +299,8 @@ class Consumer(LoggingMixin):
Return the document object if it was successfully created.
"""
self.path = path
self.filename = override_filename or os.path.basename(path)
self.path = Path(path).resolve()
self.filename = override_filename or self.path.name
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
@@ -315,6 +326,15 @@ class Consumer(LoggingMixin):
self.log("info", f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir
self.original_path = self.path
tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy(self.original_path, self.path)
# Determine the parser class.
mime_type = magic.from_file(self.path, mime=True)
@@ -457,11 +477,12 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}")
os.unlink(self.path)
self.original_path.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
os.path.dirname(self.path),
"._" + os.path.basename(self.path),
os.path.dirname(self.original_path),
"._" + os.path.basename(self.original_path),
)
if os.path.isfile(shadow_file):
@@ -478,6 +499,7 @@ class Consumer(LoggingMixin):
)
finally:
document_parser.cleanup()
tempdir.cleanup()
self.run_post_consume_script(document)
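The copy-to-scratch flow introduced above, reduced to a hedged sketch (paths are hypothetical; SCRATCH_DIR stands in for settings.SCRATCH_DIR): the consumer now works on a copy while the original path stays available for the pre-consume script and final deletion.
import shutil
import tempfile
from pathlib import Path
SCRATCH_DIR = Path("/tmp/paperless-scratch")  # hypothetical stand-in
original_path = Path("/consume/input.pdf").resolve()  # hypothetical input
tempdir = tempfile.TemporaryDirectory(prefix="paperless-ngx", dir=SCRATCH_DIR)
working_path = Path(tempdir.name) / original_path.name
shutil.copy(original_path, working_path)
# ... parse working_path; scripts see both paths via
# DOCUMENT_WORKING_PATH and DOCUMENT_SOURCE_PATH ...
tempdir.cleanup()  # mirrors the finally block above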

View File

@@ -5,6 +5,7 @@ from contextlib import contextmanager
from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from documents.models import Comment
from documents.models import Document
from guardian.shortcuts import get_users_with_perms
@@ -94,10 +95,22 @@ def open_index_searcher():
searcher.close()
def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
users_with_perms = get_users_with_perms(
doc,
only_with_perms_in=["view_document"],
@@ -118,7 +131,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,
@@ -283,7 +296,7 @@ class DelayedFullTextQuery(DelayedQuery):
["content", "title", "correspondent", "tag", "type", "comments"],
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())
qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
q = qp.parse(q_str)
corrected = self.searcher.correct_query(q, q_str)
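Why basedate matters, as a hedged sketch: relative terms such as added:[-1 week to now] are resolved against the plugin's base date, so anchoring it to Django's timezone-aware now() keeps results consistent across server timezones.
from django.utils import timezone
from whoosh.qparser.dateparse import DateParserPlugin
# Without basedate the plugin picks its own notion of "now"; passing
# timezone.now() anchors "-1 week" / "now" to aware time.
plugin = DateParserPlugin(basedate=timezone.now())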

View File

@@ -311,8 +311,8 @@ class Command(BaseCommand):
archive_target = None
# 3.4. write files to target folder
t = int(time.mktime(document.created.timetuple()))
if document.storage_type == Document.STORAGE_TYPE_GPG:
t = int(time.mktime(document.created.timetuple()))
original_target.parent.mkdir(parents=True, exist_ok=True)
with document.source_file as out_file:

View File

@@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-02-03 21:53
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("documents", "1029_alter_document_archive_serial_number"),
]
operations = [
migrations.AlterField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Filename",
),
),
]

View File

@@ -3,6 +3,7 @@ import logging
import os
import re
from collections import OrderedDict
from typing import Final
from typing import Optional
import dateutil.parser
@@ -242,6 +243,9 @@ class Document(ModelWithOwner):
help_text=_("The original name of the file when it was uploaded"),
)
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
archive_serial_number = models.PositiveIntegerField(
_("archive serial number"),
blank=True,
@@ -249,8 +253,8 @@ class Document(ModelWithOwner):
unique=True,
db_index=True,
validators=[
MaxValueValidator(0xFF_FF_FF_FF),
MinValueValidator(0),
MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
],
help_text=_(
"The position of this document in your physical document " "archive.",
@@ -567,7 +571,7 @@ class PaperlessTask(models.Model):
task_file_name = models.CharField(
null=True,
max_length=255,
verbose_name=_("Task Name"),
verbose_name=_("Task Filename"),
help_text=_("Name of the file which the Task was run for"),
)
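A hedged sketch of why the index-side clamp in index.py is still needed: the Min/Max validators above only run through full_clean() (or DRF serializers), not on a bare save().
from django.core.exceptions import ValidationError
doc = Document(
    title="doc1",
    checksum="A",
    archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
)
doc.save()  # succeeds: validators are not consulted on save()
try:
    doc.full_clean()  # raises: MaxValueValidator rejects the ASN
except ValidationError as e:
    assert "archive_serial_number" in e.message_dict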

View File

@@ -166,7 +166,7 @@ def consume_file(
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": override_filename,
"filename": override_filename or path.name,
"task_id": task_id,
"current_progress": 100,
"max_progress": 100,

View File

@@ -7,6 +7,7 @@ import tempfile
import urllib.request
import uuid
import zipfile
from datetime import timedelta
from pathlib import Path
from unittest import mock
from unittest.mock import MagicMock
@@ -25,6 +26,7 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from dateutil.relativedelta import relativedelta
from documents import bulk_edit
from documents import index
from documents.models import Correspondent
@@ -509,6 +511,270 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
self.assertEqual(response.status_code, 404)
@override_settings(
TIME_ZONE="UTC",
)
def test_search_added_in_last_week(self):
"""
GIVEN:
- Three documents added right now
- The timezone is UTC time
WHEN:
- Query for documents added in the last 7 days
THEN:
- All three recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 3 documents returned
self.assertEqual(len(results), 3)
for idx, subset in enumerate(
[
{"id": 1, "title": "invoice"},
{"id": 2, "title": "bank statement 1"},
{"id": 3, "title": "bank statement 3"},
],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="America/Chicago",
)
def test_search_added_in_last_week_with_timezone_behind(self):
"""
GIVEN:
- Two documents added right now
- One document added over a week ago
- The timezone is behind UTC time (-6)
WHEN:
- Query for documents added in the last 7 days
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="Europe/Sofia",
)
def test_search_added_in_last_week_with_timezone_ahead(self):
"""
GIVEN:
- Two documents added right now
- One document added over a week ago
- The timezone is ahead of UTC time (+2)
WHEN:
- Query for documents added in the last 7 days
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
def test_search_added_in_last_month(self):
"""
GIVEN:
- One document added right now
- One document added about a week ago
- One document added over 1 month ago
WHEN:
- Query for documents added in the last month
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
# 1 month, 1 day ago
added=timezone.now() - relativedelta(months=1, days=1),
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="America/Denver",
)
def test_search_added_in_last_month_timezone_behind(self):
"""
GIVEN:
- One document added right now
- One document added about a week ago
- One document added over 1 month ago
- The timezone is behind UTC time (-6 or -7)
WHEN:
- Query for documents added in the last month
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
# 1 month, 1 day ago
added=timezone.now() - relativedelta(months=1, days=1),
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
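A hedged note on the two delta types used in these fixtures: datetime.timedelta has no months argument, so the month-old document uses dateutil's relativedelta while the week-old one uses a plain timedelta.
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from django.utils import timezone
month_and_a_day_ago = timezone.now() - relativedelta(months=1, days=1)
just_over_a_week_ago = timezone.now() - timedelta(days=7, hours=1, minutes=1)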

File diff suppressed because it is too large

View File

@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
with tempfile.NamedTemporaryFile() as script:
with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer()
c.path = "path-to-file"
c.original_path = "path-to-file"
c.path = "/tmp/somewhere/path-to-file"
c.run_pre_consume_script()
m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
args, kwargs = m.call_args
command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], "path-to-file")
self.assertDictContainsSubset(
{
"DOCUMENT_SOURCE_PATH": c.original_path,
"DOCUMENT_WORKING_PATH": c.path,
},
environment,
)
@mock.patch("documents.consumer.Consumer.log")
def test_script_with_output(self, mocked_log):
"""
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once()
args, kwargs = m.call_args
_, kwargs = m.call_args
command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])
self.assertDictContainsSubset(
{
"DOCUMENT_ID": str(doc.pk),
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
"DOCUMENT_CORRESPONDENT": "my_bank",
"DOCUMENT_TAGS": "a,b",
},
environment,
)
def test_script_exit_non_zero(self):
"""
GIVEN:

View File

@@ -1,3 +1,5 @@
from unittest import mock
from django.test import TestCase
from documents import index
from documents.models import Document
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
)
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
def test_archive_serial_number_ranging(self):
"""
GIVEN:
- Document with an archive serial number above schema allowed size
WHEN:
- Document is provided to the index
THEN:
- Error is logged
- Document ASN is reset to 0 for the index
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
# yes, this is allowed unless full_clean is run;
# DRF does call the validators, but this test won't
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
)
with self.assertLogs("paperless.index", level="ERROR") as cm:
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertEqual(kwargs["asn"], 0)
error_str = cm.output[0]
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
self.assertIn(expected_str, error_str)
def test_archive_serial_number_is_none(self):
"""
GIVEN:
- Document with no archive serial number
WHEN:
- Document is provided to the index
THEN:
- ASN isn't touched
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
)
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertIsNone(kwargs["asn"])

View File

@@ -3,6 +3,7 @@ import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager
from unittest import mock
from django.apps import apps
from django.db import connection
@@ -86,6 +87,30 @@ class DirectoriesMixin:
remove_dirs(self.dirs)
class ConsumerProgressMixin:
def setUp(self) -> None:
self.send_progress_patcher = mock.patch(
"documents.consumer.Consumer._send_progress",
)
self.send_progress_mock = self.send_progress_patcher.start()
super().setUp()
def tearDown(self) -> None:
super().tearDown()
self.send_progress_patcher.stop()
class DocumentConsumeDelayMixin:
def setUp(self) -> None:
self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
self.consume_file_mock = self.consume_file_patcher.start()
super().setUp()
def tearDown(self) -> None:
super().tearDown()
self.consume_file_patcher.stop()
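A hedged usage sketch of the new mixins: a test case inherits them alongside TestCase and gets the mocks started in setUp and stopped in tearDown for free (the test class and body are hypothetical).
from django.test import TestCase
class ExampleConsumeTest(ConsumerProgressMixin, DocumentConsumeDelayMixin, TestCase):
    def test_no_consumption_triggered(self):
        # hypothetical test body; both mocks exist without manual patching
        self.consume_file_mock.assert_not_called()
        self.send_progress_mock.assert_not_called()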
class TestMigrations(TransactionTestCase):
@property
def app(self):

View File

@@ -1,7 +1,7 @@
from typing import Final
from typing import Tuple
__version__: Final[Tuple[int, int, int]] = (1, 12, 1)
__version__: Final[Tuple[int, int, int]] = (1, 12, 2)
# Version string like X.Y.Z
__full_version_str__: Final[str] = ".".join(map(str, __version__))
# Version string like X.Y

View File

@@ -67,11 +67,6 @@ class TestParserLive(TestCase):
return result
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
"""
@@ -204,11 +199,6 @@ class TestParserLive(TestCase):
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_mail(self):
"""
GIVEN:
@@ -301,11 +291,6 @@ class TestParserLive(TestCase):
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_html(self):
"""
GIVEN:

View File

@@ -90,7 +90,7 @@ class TikaDocumentParser(DocumentParser):
with open(document_path, "rb") as document_handle:
files = {
"files": (
file_name or os.path.basename(document_path),
"convert" + os.path.splitext(document_path)[-1],
document_handle,
),
}
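The renaming step as a hedged sketch: the upload name sent to the conversion service is normalized to "convert" plus the original extension, presumably to sidestep problems with unusual original filenames (the example path is hypothetical).
import os
document_path = "/tmp/Übersicht März.odt"  # hypothetical original filename
upload_name = "convert" + os.path.splitext(document_path)[-1]
assert upload_name == "convert.odt"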

View File

@@ -7,7 +7,7 @@ max-line-length = 88
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --numprocesses auto --quiet
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet
env =
PAPERLESS_DISABLE_DBHANDLER=true