Merge branch 'dev' into feature-permissions

shamoon
2023-02-03 14:23:50 -08:00
36 changed files with 2538 additions and 14911 deletions

View File

@@ -4,18 +4,17 @@ import shutil
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from math import ceil
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -154,52 +153,15 @@ def scan_file_for_barcodes(
(page_number, barcode_text) tuples
"""
def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
# This type is known to have issues:
# https://github.com/pikepdf/pikepdf/issues/401
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive, so this may
# raise an exception, triggering fallback
pillow_img = pdfimage.as_pil_image()
# Scale the image down
# See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
# TLDR: zbar has issues with larger images
width, height = pillow_img.size
if width > 1024:
scaler = ceil(width / 1024)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
width, height = pillow_img.size
if height > 2048:
scaler = ceil(height / 2048)
new_width = int(width / scaler)
new_height = int(height / scaler)
pillow_img = pillow_img.resize((new_width, new_height))
for barcode_value in barcode_reader(pillow_img):
detected_barcodes.append(Barcode(page_num, barcode_value))
return detected_barcodes
def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
pages_from_path = convert_from_path(
pdf_filepath,
dpi=300,
output_folder=path,
)
for current_page_number, page in enumerate(pages_from_path):
for barcode_value in barcode_reader(page):
detected_barcodes.append(
@@ -219,27 +181,19 @@ def scan_file_for_barcodes(
# Always try pikepdf first, it's usually fine, faster and
# uses less memory
try:
barcodes = _pikepdf_barcode_scan(pdf_filepath)
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# Password protected files can't be checked
except PasswordError as e:
# This is the exception raised for those
except PDFPageCountError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
# Handle pikepdf related image decoding issues with a fallback to page
# by page conversion to images in a temporary directory
except Exception as e:
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Falling back to pdf2image because: {e}",
f"Exception during barcode scanning: {e}",
)
try:
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
@@ -248,16 +202,25 @@ def scan_file_for_barcodes(
return DocumentBarcodeInfo(pdf_filepath, barcodes)
def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
"""
Searches the parsed barcodes for separators
and returns a list of page numbers, which
separate the file into new files.
and returns a dict of page numbers, which
separate the file into new files, together
with a flag indicating whether to keep each page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages
return list({bc.page for bc in barcodes if bc.is_separator})
# add the page numbers of the ASN barcodes
# (except for the first page, which might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
}
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
@@ -289,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
return asn
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
The pages defined by the keys of pages_to_split_on
will be removed if the corresponding value is False.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
@@ -308,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
# Start with an empty document
current_document: List[Page] = []
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
documents: List[List[Page]] = [current_document]
for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
current_document.append(page)
continue
# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)
documents = [x for x in documents if len(x)]
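A minimal standalone sketch of the new split contract, using plain page-number lists instead of Barcode objects (split_plan is a hypothetical helper, not part of the commit): separator sheets map to False and are dropped, ASN pages map to True and are kept, and page 0 never starts a split.
# Hedged sketch of get_separating_barcodes' dict contract.
from typing import Dict, List
def split_plan(separator_pages: List[int], asn_pages: List[int]) -> Dict[int, bool]:
    plan = {page: False for page in separator_pages}  # separator sheets: drop
    # ASN pages start a new document but are kept; page 0 is excluded,
    # mirroring the "infinite loops" guard above.
    plan.update({page: True for page in asn_pages if page != 0})
    return plan
assert split_plan([2, 5], [0, 5, 7]) == {2: False, 5: True, 7: True}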

View File

@@ -1,7 +1,10 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import Optional
@@ -95,7 +98,8 @@ class Consumer(LoggingMixin):
def __init__(self):
super().__init__()
self.path = None
self.path: Optional[Path] = None
self.original_path: Optional[Path] = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
@@ -144,11 +148,16 @@ class Consumer(LoggingMixin):
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
@@ -169,16 +178,18 @@ class Consumer(LoggingMixin):
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
filepath_arg = os.path.normpath(self.path)
working_file_path = str(self.path)
original_file_path = str(self.original_path)
script_env = os.environ.copy()
script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
script_env["DOCUMENT_WORKING_PATH"] = working_file_path
try:
completed_proc = run(
args=[
settings.PRE_CONSUME_SCRIPT,
filepath_arg,
original_file_path,
],
env=script_env,
capture_output=True,
@@ -197,7 +208,7 @@ class Consumer(LoggingMixin):
exception=e,
)
def run_post_consume_script(self, document):
def run_post_consume_script(self, document: Document):
if not settings.POST_CONSUME_SCRIPT:
return
@@ -288,8 +299,8 @@ class Consumer(LoggingMixin):
Return the document object if it was successfully created.
"""
self.path = path
self.filename = override_filename or os.path.basename(path)
self.path = Path(path).resolve()
self.filename = override_filename or self.path.name
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
@@ -315,6 +326,15 @@ class Consumer(LoggingMixin):
self.log("info", f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir
self.original_path = self.path
tempdir = tempfile.TemporaryDirectory(
prefix="paperless-ngx",
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy(self.original_path, self.path)
# Determine the parser class.
mime_type = magic.from_file(self.path, mime=True)
@@ -457,11 +477,12 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}")
os.unlink(self.path)
self.original_path.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
os.path.dirname(self.path),
"._" + os.path.basename(self.path),
os.path.dirname(self.original_path),
"._" + os.path.basename(self.original_path),
)
if os.path.isfile(shadow_file):
@@ -478,6 +499,7 @@ class Consumer(LoggingMixin):
)
finally:
document_parser.cleanup()
tempdir.cleanup()
self.run_post_consume_script(document)
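The copy-to-scratch flow introduced above, reduced to a hedged sketch (paths are hypothetical; SCRATCH_DIR stands in for settings.SCRATCH_DIR): the consumer now works on a copy while the original path stays available for the pre-consume script and final deletion.
import shutil
import tempfile
from pathlib import Path
SCRATCH_DIR = Path("/tmp/paperless-scratch")  # hypothetical stand-in
original_path = Path("/consume/input.pdf").resolve()  # hypothetical input
tempdir = tempfile.TemporaryDirectory(prefix="paperless-ngx", dir=SCRATCH_DIR)
working_path = Path(tempdir.name) / original_path.name
shutil.copy(original_path, working_path)
# ... parse working_path; scripts see both paths via
# DOCUMENT_WORKING_PATH and DOCUMENT_SOURCE_PATH ...
tempdir.cleanup()  # mirrors the finally block above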

View File

@@ -5,6 +5,7 @@ from contextlib import contextmanager
from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from documents.models import Comment
from documents.models import Document
from guardian.shortcuts import get_users_with_perms
@@ -94,10 +95,22 @@ def open_index_searcher():
searcher.close()
def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
users_with_perms = get_users_with_perms(
doc,
only_with_perms_in=["view_document"],
@@ -118,7 +131,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,
@@ -283,7 +296,7 @@ class DelayedFullTextQuery(DelayedQuery):
["content", "title", "correspondent", "tag", "type", "comments"],
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())
qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
q = qp.parse(q_str)
corrected = self.searcher.correct_query(q, q_str)
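Why basedate matters, as a hedged sketch: relative terms such as added:[-1 week to now] are resolved against the plugin's base date, so anchoring it to Django's timezone-aware now() keeps results consistent across server timezones.
from django.utils import timezone
from whoosh.qparser.dateparse import DateParserPlugin
# Without basedate the plugin picks its own notion of "now"; passing
# timezone.now() anchors "-1 week" / "now" to aware time.
plugin = DateParserPlugin(basedate=timezone.now())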

View File

@@ -311,8 +311,8 @@ class Command(BaseCommand):
archive_target = None
# 3.4. write files to target folder
t = int(time.mktime(document.created.timetuple()))
if document.storage_type == Document.STORAGE_TYPE_GPG:
t = int(time.mktime(document.created.timetuple()))
original_target.parent.mkdir(parents=True, exist_ok=True)
with document.source_file as out_file:

View File

@@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-02-03 21:53
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("documents", "1029_alter_document_archive_serial_number"),
]
operations = [
migrations.AlterField(
model_name="paperlesstask",
name="task_file_name",
field=models.CharField(
help_text="Name of the file which the Task was run for",
max_length=255,
null=True,
verbose_name="Task Filename",
),
),
]

View File

@@ -3,6 +3,7 @@ import logging
import os
import re
from collections import OrderedDict
from typing import Final
from typing import Optional
import dateutil.parser
@@ -242,6 +243,9 @@ class Document(ModelWithOwner):
help_text=_("The original name of the file when it was uploaded"),
)
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
archive_serial_number = models.PositiveIntegerField(
_("archive serial number"),
blank=True,
@@ -249,8 +253,8 @@ class Document(ModelWithOwner):
unique=True,
db_index=True,
validators=[
MaxValueValidator(0xFF_FF_FF_FF),
MinValueValidator(0),
MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
],
help_text=_(
"The position of this document in your physical document " "archive.",
@@ -567,7 +571,7 @@ class PaperlessTask(models.Model):
task_file_name = models.CharField(
null=True,
max_length=255,
verbose_name=_("Task Name"),
verbose_name=_("Task Filename"),
help_text=_("Name of the file which the Task was run for"),
)
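A hedged sketch of why the index-side clamp in index.py is still needed: the Min/Max validators above only run through full_clean() (or DRF serializers), not on a bare save().
from django.core.exceptions import ValidationError
doc = Document(
    title="doc1",
    checksum="A",
    archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
)
doc.save()  # succeeds: validators are not consulted on save()
try:
    doc.full_clean()  # raises: MaxValueValidator rejects the ASN
except ValidationError as e:
    assert "archive_serial_number" in e.message_dict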

View File

@@ -166,7 +166,7 @@ def consume_file(
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": override_filename,
"filename": override_filename or path.name,
"task_id": task_id,
"current_progress": 100,
"max_progress": 100,

View File

@@ -7,6 +7,7 @@ import tempfile
import urllib.request
import uuid
import zipfile
from datetime import timedelta
from pathlib import Path
from unittest import mock
from unittest.mock import MagicMock
@@ -25,6 +26,7 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from dateutil.relativedelta import relativedelta
from documents import bulk_edit
from documents import index
from documents.models import Correspondent
@@ -509,6 +511,270 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
self.assertEqual(response.status_code, 404)
@override_settings(
TIME_ZONE="UTC",
)
def test_search_added_in_last_week(self):
"""
GIVEN:
- Three documents added right now
- The timezone is UTC time
WHEN:
- Query for documents added in the last 7 days
THEN:
- All three recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 3 documents returned
self.assertEqual(len(results), 3)
for idx, subset in enumerate(
[
{"id": 1, "title": "invoice"},
{"id": 2, "title": "bank statement 1"},
{"id": 3, "title": "bank statement 3"},
],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="America/Chicago",
)
def test_search_added_in_last_week_with_timezone_behind(self):
"""
GIVEN:
- Two documents added right now
- One document added over a week ago
- The timezone is behind UTC time (-6)
WHEN:
- Query for documents added in the last 7 days
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="Europe/Sofia",
)
def test_search_added_in_last_week_with_timezone_ahead(self):
"""
GIVEN:
- Two documents added right now
- One document added over a week ago
- The timezone is ahead of UTC time (+2)
WHEN:
- Query for documents added in the last 7 days
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
def test_search_added_in_last_month(self):
"""
GIVEN:
- One document added right now
- One document added about a week ago
- One document added over 1 month ago
WHEN:
- Query for documents added in the last month
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
# 1 month, 1 day ago
added=timezone.now() - relativedelta(months=1, days=1),
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@override_settings(
TIME_ZONE="America/Denver",
)
def test_search_added_in_last_month_timezone_behind(self):
"""
GIVEN:
- One document added right now
- One document added about a week ago
- One document added over 1 month ago
- The timezone is behind UTC time (-6 or -7)
WHEN:
- Query for documents added in the last month
THEN:
- The two recent documents are returned
"""
d1 = Document.objects.create(
title="invoice",
content="the thing i bought at a shop and paid with bank account",
checksum="A",
pk=1,
)
d2 = Document.objects.create(
title="bank statement 1",
content="things i paid for in august",
pk=2,
checksum="B",
# 1 month, 1 day ago
added=timezone.now() - relativedelta(months=1, days=1),
)
d3 = Document.objects.create(
title="bank statement 3",
content="things i paid for in september",
pk=3,
checksum="C",
# 7 days, 1 hour and 1 minute ago
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
)
with index.open_index_writer() as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
results = response.data["results"]
# Expect 2 documents returned
self.assertEqual(len(results), 2)
for idx, subset in enumerate(
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
):
result = results[idx]
# Assert subset in results
self.assertDictEqual(result, {**result, **subset})
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
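A hedged note on the two delta types used in these fixtures: datetime.timedelta has no months argument, so the month-old document uses dateutil's relativedelta while the week-old one uses a plain timedelta.
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from django.utils import timezone
month_and_a_day_ago = timezone.now() - relativedelta(months=1, days=1)
just_over_a_week_ago = timezone.now() - timedelta(days=7, hours=1, minutes=1)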

File diff suppressed because it is too large

View File

@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
with tempfile.NamedTemporaryFile() as script:
with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer()
c.path = "path-to-file"
c.original_path = "path-to-file"
c.path = "/tmp/somewhere/path-to-file"
c.run_pre_consume_script()
m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
args, kwargs = m.call_args
command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], "path-to-file")
self.assertDictContainsSubset(
{
"DOCUMENT_SOURCE_PATH": c.original_path,
"DOCUMENT_WORKING_PATH": c.path,
},
environment,
)
@mock.patch("documents.consumer.Consumer.log")
def test_script_with_output(self, mocked_log):
"""
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once()
args, kwargs = m.call_args
_, kwargs = m.call_args
command = kwargs["args"]
environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])
self.assertDictContainsSubset(
{
"DOCUMENT_ID": str(doc.pk),
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
"DOCUMENT_CORRESPONDENT": "my_bank",
"DOCUMENT_TAGS": "a,b",
},
environment,
)
def test_script_exit_non_zero(self):
"""
GIVEN:

View File

@@ -1,3 +1,5 @@
from unittest import mock
from django.test import TestCase
from documents import index
from documents.models import Document
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
)
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
def test_archive_serial_number_ranging(self):
"""
GIVEN:
- Document with an archive serial number above schema allowed size
WHEN:
- Document is provided to the index
THEN:
- Error is logged
- Document ASN is reset to 0 for the index
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
# yes, this is allowed unless full_clean is run;
# DRF does call the validators, but this test won't
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
)
with self.assertLogs("paperless.index", level="ERROR") as cm:
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertEqual(kwargs["asn"], 0)
error_str = cm.output[0]
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
self.assertIn(expected_str, error_str)
def test_archive_serial_number_is_none(self):
"""
GIVEN:
- Document with no archive serial number
WHEN:
- Document is provided to the index
THEN:
- ASN isn't touched
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
)
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertIsNone(kwargs["asn"])

View File

@@ -3,6 +3,7 @@ import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager
from unittest import mock
from django.apps import apps
from django.db import connection
@@ -86,6 +87,30 @@ class DirectoriesMixin:
remove_dirs(self.dirs)
class ConsumerProgressMixin:
def setUp(self) -> None:
self.send_progress_patcher = mock.patch(
"documents.consumer.Consumer._send_progress",
)
self.send_progress_mock = self.send_progress_patcher.start()
super().setUp()
def tearDown(self) -> None:
super().tearDown()
self.send_progress_patcher.stop()
class DocumentConsumeDelayMixin:
def setUp(self) -> None:
self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
self.consume_file_mock = self.consume_file_patcher.start()
super().setUp()
def tearDown(self) -> None:
super().tearDown()
self.consume_file_patcher.stop()
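A hedged usage sketch of the new mixins: a test case inherits them alongside TestCase and gets the mocks started in setUp and stopped in tearDown for free (the test class and body are hypothetical).
from django.test import TestCase
class ExampleConsumeTest(ConsumerProgressMixin, DocumentConsumeDelayMixin, TestCase):
    def test_no_consumption_triggered(self):
        # hypothetical test body; both mocks exist without manual patching
        self.consume_file_mock.assert_not_called()
        self.send_progress_mock.assert_not_called()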
class TestMigrations(TransactionTestCase):
@property
def app(self):

View File

@@ -1,7 +1,7 @@
from typing import Final
from typing import Tuple
__version__: Final[Tuple[int, int, int]] = (1, 12, 1)
__version__: Final[Tuple[int, int, int]] = (1, 12, 2)
# Version string like X.Y.Z
__full_version_str__: Final[str] = ".".join(map(str, __version__))
# Version string like X.Y

View File

@@ -67,11 +67,6 @@ class TestParserLive(TestCase):
return result
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
"""
@@ -204,11 +199,6 @@ class TestParserLive(TestCase):
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_mail(self):
"""
GIVEN:
@@ -301,11 +291,6 @@ class TestParserLive(TestCase):
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
# Only run if convert is available
@pytest.mark.skipif(
"PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_html(self):
"""
GIVEN:

View File

@@ -90,7 +90,7 @@ class TikaDocumentParser(DocumentParser):
with open(document_path, "rb") as document_handle:
files = {
"files": (
file_name or os.path.basename(document_path),
"convert" + os.path.splitext(document_path)[-1],
document_handle,
),
}
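The renaming step as a hedged sketch: the upload name sent to the conversion service is normalized to "convert" plus the original extension, presumably to sidestep problems with unusual original filenames (the example path is hypothetical).
import os
document_path = "/tmp/Übersicht März.odt"  # hypothetical original filename
upload_name = "convert" + os.path.splitext(document_path)[-1]
assert upload_name == "convert.odt"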

View File

@@ -7,7 +7,7 @@ max-line-length = 88
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --numprocesses auto --quiet
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet
env =
PAPERLESS_DISABLE_DBHANDLER=true