Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-07-28 18:24:38 -05:00
Merge branch 'dev' into feature-permissions
@@ -4,18 +4,17 @@ import shutil
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from math import ceil
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional

import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -154,52 +153,15 @@ def scan_file_for_barcodes(
    (page_number, barcode_text) tuples
    """

    def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
        detected_barcodes = []
        with Pdf.open(pdf_filepath) as pdf:
            for page_num, page in enumerate(pdf.pages):
                for image_key in page.images:
                    pdfimage = PdfImage(page.images[image_key])

                    # This type is known to have issues:
                    # https://github.com/pikepdf/pikepdf/issues/401
                    if "/CCITTFaxDecode" in pdfimage.filters:
                        raise BarcodeImageFormatError(
                            "Unable to decode CCITTFaxDecode images",
                        )

                    # Not all images can be transcoded to a PIL image, which
                    # is what pyzbar expects to receive, so this may
                    # raise an exception, triggering fallback
                    pillow_img = pdfimage.as_pil_image()

                    # Scale the image down
                    # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
                    # TLDR: zbar has issues with larger images
                    width, height = pillow_img.size
                    if width > 1024:
                        scaler = ceil(width / 1024)
                        new_width = int(width / scaler)
                        new_height = int(height / scaler)
                        pillow_img = pillow_img.resize((new_width, new_height))

                    width, height = pillow_img.size
                    if height > 2048:
                        scaler = ceil(height / 2048)
                        new_width = int(width / scaler)
                        new_height = int(height / scaler)
                        pillow_img = pillow_img.resize((new_width, new_height))

                    for barcode_value in barcode_reader(pillow_img):
                        detected_barcodes.append(Barcode(page_num, barcode_value))

        return detected_barcodes

    def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
        detected_barcodes = []
        # use a temporary directory in case the file is too big to handle in memory
        with tempfile.TemporaryDirectory() as path:
            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
            pages_from_path = convert_from_path(
                pdf_filepath,
                dpi=300,
                output_folder=path,
            )
            for current_page_number, page in enumerate(pages_from_path):
                for barcode_value in barcode_reader(page):
                    detected_barcodes.append(
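The width/height clamping above can be read as a small standalone helper. A simplified sketch of that logic, assuming a PIL image as input (the 1024/2048 limits and the integer scaler come from the hunk above; the helper name is illustrative, not part of the codebase):

from math import ceil

from PIL import Image


def shrink_for_zbar(img: Image.Image, max_width: int = 1024, max_height: int = 2048) -> Image.Image:
    # zbar struggles with very large images, so clamp the width first, then the
    # height, using an integer scaler so the aspect ratio is roughly preserved.
    width, height = img.size
    if width > max_width:
        scaler = ceil(width / max_width)
        img = img.resize((int(width / scaler), int(height / scaler)))

    width, height = img.size
    if height > max_height:
        scaler = ceil(height / max_height)
        img = img.resize((int(width / scaler), int(height / scaler)))

    return img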
@@ -219,27 +181,19 @@ def scan_file_for_barcodes(
        # Always try pikepdf first, it's usually fine, faster and
        # uses less memory
        try:
            barcodes = _pikepdf_barcode_scan(pdf_filepath)
            barcodes = _pdf2image_barcode_scan(pdf_filepath)
        # Password protected files can't be checked
        except PasswordError as e:
        # This is the exception raised for those
        except PDFPageCountError as e:
            logger.warning(
                f"File is likely password protected, not checking for barcodes: {e}",
            )
        # Handle pikepdf related image decoding issues with a fallback to page
        # by page conversion to images in a temporary directory
        except Exception as e:
        # This file is really borked, allow the consumption to continue
        # but it may fail further on
        except Exception as e:  # pragma: no cover
            logger.warning(
                f"Falling back to pdf2image because: {e}",
                f"Exception during barcode scanning: {e}",
            )
            try:
                barcodes = _pdf2image_barcode_scan(pdf_filepath)
            # This file is really borked, allow the consumption to continue
            # but it may fail further on
            except Exception as e:  # pragma: no cover
                logger.warning(
                    f"Exception during barcode scanning: {e}",
                )

    else:
        logger.warning(
            f"Unsupported file format for barcode reader: {str(mime_type)}",
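Because removed and added lines are interleaved in this hunk, the resulting control flow is easier to see on its own. A simplified sketch of the merged behaviour, using the helper and exception names from the hunks above (logging abbreviated; not a verbatim excerpt):

try:
    # pikepdf is usually fine, faster, and uses less memory
    barcodes = _pikepdf_barcode_scan(pdf_filepath)
except PasswordError as e:
    # Password protected files can't be checked at all
    logger.warning(f"File is likely password protected, not checking for barcodes: {e}")
except Exception as e:
    # pikepdf image decoding issues: fall back to page-by-page conversion
    logger.warning(f"Falling back to pdf2image because: {e}")
    try:
        barcodes = _pdf2image_barcode_scan(pdf_filepath)
    except Exception as e:  # really broken file, let consumption continue
        logger.warning(f"Exception during barcode scanning: {e}")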
@@ -248,16 +202,25 @@ def scan_file_for_barcodes(
    return DocumentBarcodeInfo(pdf_filepath, barcodes)


def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
    """
    Search the parsed barcodes for separators
    and returns a list of page numbers, which
    separate the file into new files.
    and returns a dict of page numbers, which
    separate the file into new files, together
    with the information whether to keep the page.
    """
    # filter all barcodes for the separator string
    # get the page numbers of the separating barcodes
    separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
    if not settings.CONSUMER_ENABLE_ASN_BARCODE:
        return separator_pages

    return list({bc.page for bc in barcodes if bc.is_separator})
    # add the page numbers of the ASN barcodes
    # (except for first page, that might lead to infinite loops).
    return {
        **separator_pages,
        **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
    }


def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
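To illustrate the new return type: a hypothetical barcode list now maps page numbers to a keep flag instead of producing a bare list of pages. The barcode values below are made up and only stand in for whatever CONSUMER_BARCODE_STRING and the ASN prefix are configured to match:

barcodes = [
    Barcode(1, "PATCHT"),     # separator sheet on page index 1 -> split, drop the page
    Barcode(3, "ASN00042"),   # ASN barcode on page index 3 -> split, keep the page
]
get_separating_barcodes(barcodes)
# -> {1: False, 3: True} when CONSUMER_ENABLE_ASN_BARCODE is enabled
# -> {1: False} when the setting is disabled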
@@ -289,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
    return asn


def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
    """
    Separate the provided pdf file on the pages_to_split_on.
    The pages which are defined by page_numbers will be removed.
    The pages which are defined by the keys in page_numbers
    will be removed if the corresponding value is false.
    Returns a list of (temporary) filepaths to consume.
    These will need to be deleted later.
    """
@@ -308,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
    fname = os.path.splitext(os.path.basename(filepath))[0]
    pdf = Pdf.open(filepath)

    # Start with an empty document
    current_document: List[Page] = []
    # A list of documents, ie a list of lists of pages
    documents: List[List[Page]] = []
    # A single document, ie a list of pages
    document: List[Page] = []
    documents: List[List[Page]] = [current_document]

    for idx, page in enumerate(pdf.pages):
        # Keep building the new PDF as long as it is not a
        # separator index
        if idx not in pages_to_split_on:
            document.append(page)
            # Make sure to append the very last document to the documents
            if idx == (len(pdf.pages) - 1):
                documents.append(document)
                document = []
        else:
            # This is a split index, save the current PDF pages, and restart
            # a new destination page listing
            logger.debug(f"Starting new document at idx {idx}")
            documents.append(document)
            document = []
            current_document.append(page)
            continue

        # This is a split index
        # Start a new destination page listing
        logger.debug(f"Starting new document at idx {idx}")
        current_document = []
        documents.append(current_document)
        keep_page = pages_to_split_on[idx]
        if keep_page:
            # Keep the page
            # (new document is started by asn barcode)
            current_document.append(page)

    documents = [x for x in documents if len(x)]
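Filtering out the removed lines, the new splitting loop reads roughly as follows (a reading of the added lines above, not a verbatim excerpt):

    current_document: List[Page] = []
    documents: List[List[Page]] = [current_document]

    for idx, page in enumerate(pdf.pages):
        # Keep building the new PDF as long as it is not a separator index
        if idx not in pages_to_split_on:
            current_document.append(page)
            continue

        # This is a split index: start a new destination page listing
        logger.debug(f"Starting new document at idx {idx}")
        current_document = []
        documents.append(current_document)

        keep_page = pages_to_split_on[idx]
        if keep_page:
            # Keep the page (new document is started by an ASN barcode)
            current_document.append(page)

    documents = [x for x in documents if len(x)]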
@@ -1,7 +1,10 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import Optional
@@ -95,7 +98,8 @@ class Consumer(LoggingMixin):

    def __init__(self):
        super().__init__()
        self.path = None
        self.path: Optional[Path] = None
        self.original_path: Optional[Path] = None
        self.filename = None
        self.override_title = None
        self.override_correspondent_id = None
@@ -144,11 +148,16 @@ class Consumer(LoggingMixin):
            return
        # Validate the range is above zero and less than uint32_t max
        # otherwise, Whoosh can't handle it in the index
        if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
        if (
            self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
            or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
        ):
            self._fail(
                MESSAGE_ASN_RANGE,
                f"Not consuming {self.filename}: "
                f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
                f"Given ASN {self.override_asn} is out of range "
                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
            )
        if Document.objects.filter(archive_serial_number=self.override_asn).exists():
            self._fail(
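The magic numbers are replaced by named constants on the Document model (added further down in this diff with the same values), so the check is equivalent to before:

# Equivalent bounds, now spelled via the model constants
assert Document.ARCHIVE_SERIAL_NUMBER_MIN == 0
assert Document.ARCHIVE_SERIAL_NUMBER_MAX == 0xFF_FF_FF_FF == 4_294_967_295  # uint32 max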
@@ -169,16 +178,18 @@ class Consumer(LoggingMixin):

        self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")

        filepath_arg = os.path.normpath(self.path)
        working_file_path = str(self.path)
        original_file_path = str(self.original_path)

        script_env = os.environ.copy()
        script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
        script_env["DOCUMENT_WORKING_PATH"] = working_file_path

        try:
            completed_proc = run(
                args=[
                    settings.PRE_CONSUME_SCRIPT,
                    filepath_arg,
                    original_file_path,
                ],
                env=script_env,
                capture_output=True,
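With this change a pre-consume script receives the untouched original as DOCUMENT_SOURCE_PATH and the temporary working copy as DOCUMENT_WORKING_PATH; the original path is still passed as the script's single positional argument, and any modifications should be made to the working copy. A minimal illustrative script; the qpdf invocation is only an example of an in-place edit, not something this diff prescribes:

#!/usr/bin/env python3
import os
import subprocess

# Where the file arrived in the consumption directory (treat as read-only)
source = os.environ["DOCUMENT_SOURCE_PATH"]
# Temporary copy that the consumer will actually process; edit this one
working = os.environ["DOCUMENT_WORKING_PATH"]

print(f"Pre-processing {source}")
subprocess.run(["qpdf", "--linearize", working, working + ".tmp"], check=True)
os.replace(working + ".tmp", working)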
@@ -197,7 +208,7 @@ class Consumer(LoggingMixin):
                exception=e,
            )

    def run_post_consume_script(self, document):
    def run_post_consume_script(self, document: Document):
        if not settings.POST_CONSUME_SCRIPT:
            return

@@ -288,8 +299,8 @@ class Consumer(LoggingMixin):
        Return the document object if it was successfully created.
        """

        self.path = path
        self.filename = override_filename or os.path.basename(path)
        self.path = Path(path).resolve()
        self.filename = override_filename or self.path.name
        self.override_title = override_title
        self.override_correspondent_id = override_correspondent_id
        self.override_document_type_id = override_document_type_id
@@ -315,6 +326,15 @@ class Consumer(LoggingMixin):

        self.log("info", f"Consuming {self.filename}")

        # For the actual work, copy the file into a tempdir
        self.original_path = self.path
        tempdir = tempfile.TemporaryDirectory(
            prefix="paperless-ngx",
            dir=settings.SCRATCH_DIR,
        )
        self.path = Path(tempdir.name) / Path(self.filename)
        shutil.copy(self.original_path, self.path)

        # Determine the parser class.

        mime_type = magic.from_file(self.path, mime=True)
@@ -457,11 +477,12 @@ class Consumer(LoggingMixin):
            # Delete the file only if it was successfully consumed
            self.log("debug", f"Deleting file {self.path}")
            os.unlink(self.path)
            self.original_path.unlink()

            # https://github.com/jonaswinkler/paperless-ng/discussions/1037
            shadow_file = os.path.join(
                os.path.dirname(self.path),
                "._" + os.path.basename(self.path),
                os.path.dirname(self.original_path),
                "._" + os.path.basename(self.original_path),
            )

            if os.path.isfile(shadow_file):
@@ -478,6 +499,7 @@ class Consumer(LoggingMixin):
            )
        finally:
            document_parser.cleanup()
            tempdir.cleanup()

        self.run_post_consume_script(document)

@@ -5,6 +5,7 @@ from contextlib import contextmanager

from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from documents.models import Comment
from documents.models import Document
from guardian.shortcuts import get_users_with_perms
@@ -94,10 +95,22 @@ def open_index_searcher():
        searcher.close()


def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
    tags = ",".join([t.name for t in doc.tags.all()])
    tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
    comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
    asn = doc.archive_serial_number
    if asn is not None and (
        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
    ):
        logger.error(
            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
            f"ASN is out of range "
            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
        )
        asn = 0
    users_with_perms = get_users_with_perms(
        doc,
        only_with_perms_in=["view_document"],
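In effect an out-of-range ASN no longer breaks indexing: the value is logged and the document is indexed with asn set to 0, which is exactly what the new test_archive_serial_number_ranging test below asserts. Schematically:

doc.archive_serial_number = Document.ARCHIVE_SERIAL_NUMBER_MAX + 1  # 4_294_967_296
index.add_or_update_document(doc)
# logs: "Not indexing Archive Serial Number 4294967296 of document ..."
# the document is still written to the index, with asn=0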
@@ -118,7 +131,7 @@ def update_document(writer, doc):
        has_type=doc.document_type is not None,
        created=doc.created,
        added=doc.added,
        asn=doc.archive_serial_number,
        asn=asn,
        modified=doc.modified,
        path=doc.storage_path.name if doc.storage_path else None,
        path_id=doc.storage_path.id if doc.storage_path else None,
@@ -283,7 +296,7 @@ class DelayedFullTextQuery(DelayedQuery):
            ["content", "title", "correspondent", "tag", "type", "comments"],
            self.searcher.ixreader.schema,
        )
        qp.add_plugin(DateParserPlugin())
        qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
        q = qp.parse(q_str)

        corrected = self.searcher.correct_query(q, q_str)
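Passing basedate=timezone.now() anchors Whoosh's date parser to the current, timezone-aware moment, so relative expressions are resolved against the configured timezone rather than a naive default; this is what the added:[-1 week to now] timezone tests below exercise. A rough standalone sketch (whoosh.qparser.dateparse.DateParserPlugin and basedate are the real plugin and argument; ix stands for an open Whoosh index and is illustrative):

from django.utils import timezone
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

qp = QueryParser("content", schema=ix.schema)
# Anchor relative dates ("-1 week", "now") to an aware datetime instead of naive local time
qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
q = qp.parse("added:[-1 week to now]")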
@@ -311,8 +311,8 @@ class Command(BaseCommand):
            archive_target = None

        # 3.4. write files to target folder
        t = int(time.mktime(document.created.timetuple()))
        if document.storage_type == Document.STORAGE_TYPE_GPG:
            t = int(time.mktime(document.created.timetuple()))

        original_target.parent.mkdir(parents=True, exist_ok=True)
        with document.source_file as out_file:
@@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-02-03 21:53

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "1029_alter_document_archive_serial_number"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_file_name",
            field=models.CharField(
                help_text="Name of the file which the Task was run for",
                max_length=255,
                null=True,
                verbose_name="Task Filename",
            ),
        ),
    ]
@@ -3,6 +3,7 @@ import logging
import os
import re
from collections import OrderedDict
from typing import Final
from typing import Optional

import dateutil.parser
@@ -242,6 +243,9 @@ class Document(ModelWithOwner):
        help_text=_("The original name of the file when it was uploaded"),
    )

    ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
    ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF

    archive_serial_number = models.PositiveIntegerField(
        _("archive serial number"),
        blank=True,
@@ -249,8 +253,8 @@ class Document(ModelWithOwner):
        unique=True,
        db_index=True,
        validators=[
            MaxValueValidator(0xFF_FF_FF_FF),
            MinValueValidator(0),
            MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
            MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
        ],
        help_text=_(
            "The position of this document in your physical document " "archive.",
@@ -567,7 +571,7 @@ class PaperlessTask(models.Model):
    task_file_name = models.CharField(
        null=True,
        max_length=255,
        verbose_name=_("Task Name"),
        verbose_name=_("Task Filename"),
        help_text=_("Name of the file which the Task was run for"),
    )

@@ -166,7 +166,7 @@ def consume_file(
        # notify the sender, otherwise the progress bar
        # in the UI stays stuck
        payload = {
            "filename": override_filename,
            "filename": override_filename or path.name,
            "task_id": task_id,
            "current_progress": 100,
            "max_progress": 100,
Two image files changed (33 KiB and 39 KiB, unchanged in size before and after).

New binary test samples (binary files not shown):
src/documents/tests/samples/barcodes/split-by-asn-1.pdf
src/documents/tests/samples/barcodes/split-by-asn-2.pdf
@@ -7,6 +7,7 @@ import tempfile
import urllib.request
import uuid
import zipfile
from datetime import timedelta
from pathlib import Path
from unittest import mock
from unittest.mock import MagicMock
@@ -25,6 +26,7 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from dateutil.relativedelta import relativedelta
from documents import bulk_edit
from documents import index
from documents.models import Correspondent
@@ -509,6 +511,270 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
        self.assertEqual(response.status_code, 404)

    @override_settings(
        TIME_ZONE="UTC",
    )
    def test_search_added_in_last_week(self):
        """
        GIVEN:
            - Three documents added right now
            - The timezone is UTC time
        WHEN:
            - Query for documents added in the last 7 days
        THEN:
            - All three recent documents are returned
        """
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1,
        )
        d2 = Document.objects.create(
            title="bank statement 1",
            content="things i paid for in august",
            pk=2,
            checksum="B",
        )
        d3 = Document.objects.create(
            title="bank statement 3",
            content="things i paid for in september",
            pk=3,
            checksum="C",
        )
        with index.open_index_writer() as writer:
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)

        response = self.client.get("/api/documents/?query=added:[-1 week to now]")
        results = response.data["results"]
        # Expect 3 documents returned
        self.assertEqual(len(results), 3)

        for idx, subset in enumerate(
            [
                {"id": 1, "title": "invoice"},
                {"id": 2, "title": "bank statement 1"},
                {"id": 3, "title": "bank statement 3"},
            ],
        ):
            result = results[idx]
            # Assert subset in results
            self.assertDictEqual(result, {**result, **subset})

    @override_settings(
        TIME_ZONE="America/Chicago",
    )
    def test_search_added_in_last_week_with_timezone_behind(self):
        """
        GIVEN:
            - Two documents added right now
            - One document added over a week ago
            - The timezone is behind UTC time (-6)
        WHEN:
            - Query for documents added in the last 7 days
        THEN:
            - The two recent documents are returned
        """
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1,
        )
        d2 = Document.objects.create(
            title="bank statement 1",
            content="things i paid for in august",
            pk=2,
            checksum="B",
        )
        d3 = Document.objects.create(
            title="bank statement 3",
            content="things i paid for in september",
            pk=3,
            checksum="C",
            # 7 days, 1 hour and 1 minute ago
            added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
        )
        with index.open_index_writer() as writer:
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)

        response = self.client.get("/api/documents/?query=added:[-1 week to now]")
        results = response.data["results"]

        # Expect 2 documents returned
        self.assertEqual(len(results), 2)

        for idx, subset in enumerate(
            [{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
        ):
            result = results[idx]
            # Assert subset in results
            self.assertDictEqual(result, {**result, **subset})

    @override_settings(
        TIME_ZONE="Europe/Sofia",
    )
    def test_search_added_in_last_week_with_timezone_ahead(self):
        """
        GIVEN:
            - Two documents added right now
            - One document added over a week ago
            - The timezone is ahead of UTC time (+2)
        WHEN:
            - Query for documents added in the last 7 days
        THEN:
            - The two recent documents are returned
        """
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1,
        )
        d2 = Document.objects.create(
            title="bank statement 1",
            content="things i paid for in august",
            pk=2,
            checksum="B",
        )
        d3 = Document.objects.create(
            title="bank statement 3",
            content="things i paid for in september",
            pk=3,
            checksum="C",
            # 7 days, 1 hour and 1 minute ago
            added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
        )
        with index.open_index_writer() as writer:
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)

        response = self.client.get("/api/documents/?query=added:[-1 week to now]")
        results = response.data["results"]

        # Expect 2 documents returned
        self.assertEqual(len(results), 2)

        for idx, subset in enumerate(
            [{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
        ):
            result = results[idx]
            # Assert subset in results
            self.assertDictEqual(result, {**result, **subset})

    def test_search_added_in_last_month(self):
        """
        GIVEN:
            - One document added right now
            - One document added about a week ago
            - One document added over 1 month ago
        WHEN:
            - Query for documents added in the last month
        THEN:
            - The two recent documents are returned
        """
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1,
        )
        d2 = Document.objects.create(
            title="bank statement 1",
            content="things i paid for in august",
            pk=2,
            checksum="B",
            # 1 month, 1 day ago
            added=timezone.now() - relativedelta(months=1, days=1),
        )
        d3 = Document.objects.create(
            title="bank statement 3",
            content="things i paid for in september",
            pk=3,
            checksum="C",
            # 7 days, 1 hour and 1 minute ago
            added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
        )

        with index.open_index_writer() as writer:
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)

        response = self.client.get("/api/documents/?query=added:[-1 month to now]")
        results = response.data["results"]

        # Expect 2 documents returned
        self.assertEqual(len(results), 2)

        for idx, subset in enumerate(
            [{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
        ):
            result = results[idx]
            # Assert subset in results
            self.assertDictEqual(result, {**result, **subset})

    @override_settings(
        TIME_ZONE="America/Denver",
    )
    def test_search_added_in_last_month_timezone_behind(self):
        """
        GIVEN:
            - One document added right now
            - One document added about a week ago
            - One document added over 1 month ago
            - The timezone is behind UTC time (-6 or -7)
        WHEN:
            - Query for documents added in the last month
        THEN:
            - The two recent documents are returned
        """
        d1 = Document.objects.create(
            title="invoice",
            content="the thing i bought at a shop and paid with bank account",
            checksum="A",
            pk=1,
        )
        d2 = Document.objects.create(
            title="bank statement 1",
            content="things i paid for in august",
            pk=2,
            checksum="B",
            # 1 month, 1 day ago
            added=timezone.now() - relativedelta(months=1, days=1),
        )
        d3 = Document.objects.create(
            title="bank statement 3",
            content="things i paid for in september",
            pk=3,
            checksum="C",
            # 7 days, 1 hour and 1 minute ago
            added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
        )

        with index.open_index_writer() as writer:
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)

        response = self.client.get("/api/documents/?query=added:[-1 month to now]")
        results = response.data["results"]

        # Expect 2 documents returned
        self.assertEqual(len(results), 2)

        for idx, subset in enumerate(
            [{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
        ):
            result = results[idx]
            # Assert subset in results
            self.assertDictEqual(result, {**result, **subset})

    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
File diff suppressed because it is too large.
@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                c = Consumer()
                c.path = "path-to-file"
                c.original_path = "path-to-file"
                c.path = "/tmp/somewhere/path-to-file"
                c.run_pre_consume_script()

                m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
                args, kwargs = m.call_args

                command = kwargs["args"]
                environment = kwargs["env"]

                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], "path-to-file")

                self.assertDictContainsSubset(
                    {
                        "DOCUMENT_SOURCE_PATH": c.original_path,
                        "DOCUMENT_WORKING_PATH": c.path,
                    },
                    environment,
                )

    @mock.patch("documents.consumer.Consumer.log")
    def test_script_with_output(self, mocked_log):
        """
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):

        m.assert_called_once()

        args, kwargs = m.call_args
        _, kwargs = m.call_args

        command = kwargs["args"]
        environment = kwargs["env"]

        self.assertEqual(command[0], script.name)
        self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
        self.assertEqual(command[7], "my_bank")
        self.assertCountEqual(command[8].split(","), ["a", "b"])

        self.assertDictContainsSubset(
            {
                "DOCUMENT_ID": str(doc.pk),
                "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
                "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
                "DOCUMENT_CORRESPONDENT": "my_bank",
                "DOCUMENT_TAGS": "a,b",
            },
            environment,
        )

    def test_script_exit_non_zero(self):
        """
        GIVEN:
@@ -1,3 +1,5 @@
from unittest import mock

from django.test import TestCase
from documents import index
from documents.models import Document
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
        )
        self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
        self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])

    def test_archive_serial_number_ranging(self):
        """
        GIVEN:
            - Document with an archive serial number above schema allowed size
        WHEN:
            - Document is provided to the index
        THEN:
            - Error is logged
            - Document ASN is reset to 0 for the index
        """
        doc1 = Document.objects.create(
            title="doc1",
            checksum="A",
            content="test test2 test3",
            # yes, this is allowed, unless full_clean is run
            # DRF does call the validators, this test won't
            archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
        )
        with self.assertLogs("paperless.index", level="ERROR") as cm:
            with mock.patch(
                "documents.index.AsyncWriter.update_document",
            ) as mocked_update_doc:
                index.add_or_update_document(doc1)

                mocked_update_doc.assert_called_once()
                _, kwargs = mocked_update_doc.call_args

                self.assertEqual(kwargs["asn"], 0)

            error_str = cm.output[0]
            expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
            self.assertIn(expected_str, error_str)

    def test_archive_serial_number_is_none(self):
        """
        GIVEN:
            - Document with no archive serial number
        WHEN:
            - Document is provided to the index
        THEN:
            - ASN isn't touched
        """
        doc1 = Document.objects.create(
            title="doc1",
            checksum="A",
            content="test test2 test3",
        )
        with mock.patch(
            "documents.index.AsyncWriter.update_document",
        ) as mocked_update_doc:
            index.add_or_update_document(doc1)

            mocked_update_doc.assert_called_once()
            _, kwargs = mocked_update_doc.call_args

            self.assertIsNone(kwargs["asn"])
@@ -3,6 +3,7 @@ import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager
from unittest import mock

from django.apps import apps
from django.db import connection
@@ -86,6 +87,30 @@ class DirectoriesMixin:
        remove_dirs(self.dirs)


class ConsumerProgressMixin:
    def setUp(self) -> None:
        self.send_progress_patcher = mock.patch(
            "documents.consumer.Consumer._send_progress",
        )
        self.send_progress_mock = self.send_progress_patcher.start()
        super().setUp()

    def tearDown(self) -> None:
        super().tearDown()
        self.send_progress_patcher.stop()


class DocumentConsumeDelayMixin:
    def setUp(self) -> None:
        self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
        self.consume_file_mock = self.consume_file_patcher.start()
        super().setUp()

    def tearDown(self) -> None:
        super().tearDown()
        self.consume_file_patcher.stop()


class TestMigrations(TransactionTestCase):
    @property
    def app(self):
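These mixins are intended to be mixed into test case classes so that websocket progress updates and the consume task are patched automatically in setUp() and unpatched in tearDown(). A hypothetical usage (the test class and its body are illustrative, not part of this diff):

class TestMyConsumption(DirectoriesMixin, ConsumerProgressMixin, TestCase):
    def test_something(self):
        # self.send_progress_mock is installed by ConsumerProgressMixin.setUp()
        ...
        self.send_progress_mock.assert_called()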
@@ -1,7 +1,7 @@
from typing import Final
from typing import Tuple

__version__: Final[Tuple[int, int, int]] = (1, 12, 1)
__version__: Final[Tuple[int, int, int]] = (1, 12, 2)
# Version string like X.Y.Z
__full_version_str__: Final[str] = ".".join(map(str, __version__))
# Version string like X.Y
@@ -67,11 +67,6 @@ class TestParserLive(TestCase):

        return result

    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
        """
@@ -204,11 +199,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_mail(self):
        """
        GIVEN:
@@ -301,11 +291,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_html(self):
        """
        GIVEN:
@@ -90,7 +90,7 @@ class TikaDocumentParser(DocumentParser):
        with open(document_path, "rb") as document_handle:
            files = {
                "files": (
                    file_name or os.path.basename(document_path),
                    "convert" + os.path.splitext(document_path)[-1],
                    document_handle,
                ),
            }
@@ -7,7 +7,7 @@ max-line-length = 88

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --numprocesses auto --quiet
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet
env =
    PAPERLESS_DISABLE_DBHANDLER=true