Merge remote-tracking branch 'origin/dev'

2025-07-06 16:34:50 -05:00 · 2023-01-29 08:40:13 -08:00 · 2023-01-29 08:40:13 -08:00 · 9aea8a7d7c
commit 9aea8a7d7c
parent b07b8d65a6 7b9c0d65b9
30 changed files with 1680 additions and 1053 deletions
--- a/.github/scripts/cleanup-tags.py
+++ b/.github/scripts/cleanup-tags.py
@ -15,6 +15,8 @@ from github import ContainerPackage
 from github import GithubBranchApi
 from github import GithubContainerRegistryApi
 import docker
 logger = logging.getLogger("cleanup-tags")
@ -151,12 +153,16 @@ class RegistryTagsCleaner:
            for tag in sorted(self.tags_to_keep):
                full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
                logger.info(f"Checking manifest for {full_name}")
                # TODO: It would be nice to use RegistryData from docker
                # except the ID doesn't map to anything in the manifest
                try:
                    proc = subprocess.run(
                        [
                            shutil.which("docker"),
-                            "manifest",
+                            "buildx",
                            "imagetools",
                            "inspect",
                            "--raw",
                            full_name,
                        ],
                        capture_output=True,
@ -241,6 +247,65 @@ class RegistryTagsCleaner:
        # By default, keep anything which is tagged
        self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))
    def check_tags_pull(self):
        """
        Confirm every tag which was kept can still be pulled, for all platforms.

        Uses the Docker Python SDK to pull each tag in ``self.tags_to_keep``
        for linux/amd64, linux/arm64 and linux/arm/v7.  A failed pull is
        logged as an error and the check continues with the next
        tag/platform.  Pulled images are removed again in batches so the
        machine running the check does not run out of disk space.

        TODO: This is much slower (although more comprehensive).  Maybe a Pool?
        """
        logger.info("Beginning confirmation step")
        client = docker.from_env()

        # Neither of these depends on the tag, so build them only once
        repository = f"ghcr.io/{self.repo_owner}/{self.package_name}"
        platforms = [("amd64", None), ("arm64", None), ("arm", "v7")]

        imgs = []
        for tag in sorted(self.tags_to_keep):
            for arch, variant in platforms:
                # qpdf 11.2.0 is cross compiled, so there is a single arch,
                # amd64.  Skip the others in this case
                # NOTE(review): the original comment said "from 11.2.0
                # onwards", but only this exact tag is matched - confirm
                # whether later qpdf tags need the same treatment
                if (
                    "qpdf" in self.package_name
                    and arch != "amd64"
                    and tag == "11.2.0"
                ):
                    continue
                # Skip any tag containing "beta"
                if "beta" in tag:
                    continue

                # Build the platform name
                platform = (
                    f"linux/{arch}" if variant is None else f"linux/{arch}/{variant}"
                )

                try:
                    logger.info(f"Pulling {repository}:{tag} for {platform}")
                    image = client.images.pull(
                        repository=repository,
                        tag=tag,
                        platform=platform,
                    )
                    imgs.append(image)
                except docker.errors.APIError as e:
                    logger.error(
                        f"Failed to pull {repository}:{tag}: {e}",
                    )

            # Prevent out of space errors by removing after a few
            # pulls
            if len(imgs) > 50:
                for image in imgs:
                    try:
                        client.images.remove(image.id)
                    except docker.errors.APIError as e:
                        err_str = str(e)
                        # Ignore attempts to remove images that are partly shared
                        # Ignore images which are somehow gone already
                        if (
                            "must be forced" not in err_str
                            and "No such image" not in err_str
                        ):
                            # The batch holds images from many tags, so name
                            # the image itself, not the tag currently being
                            # processed by the outer loop
                            image_name = ", ".join(image.tags) or image.id
                            logger.error(
                                f"Remove image {image_name} failed: {e}",
                            )
                imgs = []
 class MainImageTagsCleaner(RegistryTagsCleaner):
    def decide_what_tags_to_keep(self):
@ -397,6 +462,10 @@ def _main():
            # Clean images which are untagged
            cleaner.clean_untagged(args.is_manifest)
            # Verify remaining tags still pull
            if args.is_manifest:
                cleaner.check_tags_pull()
 if __name__ == "__main__":
    _main()
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -212,12 +212,6 @@ jobs:
    name: Prepare Docker Pipeline Data
    if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v'))
    runs-on: ubuntu-22.04
    # If the push triggered the installer library workflow, wait for it to
    # complete here.  This ensures the required versions for the final
    # image have been built, while not waiting at all if the versions haven't changed
    concurrency:
      group: build-installer-library
      cancel-in-progress: false
    needs:
      - documentation
      - tests-backend
--- a/.github/workflows/cleanup-tags.yml
+++ b/.github/workflows/cleanup-tags.yml
@ -62,9 +62,9 @@ jobs:
        with:
          python-version: "3.10"
      -
-        name: Install httpx
+        name: Install Python libraries
        run: |
-          python -m pip install httpx
+          python -m pip install httpx docker
      #
      # Clean up primary package
      #
@ -81,13 +81,3 @@ jobs:
        if: "${{ env.TOKEN != '' }}"
        run: |
          python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}"
      #
      # Verify tags which are left still pull
      #
      -
        name: Check all tags still pull
        run: |
          ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }')
          echo "Pulling all tags of ${ghcr_name}"
          docker pull --quiet --all-tags ${ghcr_name}
          docker image list
--- a/.github/workflows/installer-library.yml
+++ b/.github/workflows/installer-library.yml
@ -169,3 +169,142 @@ jobs:
        PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
        PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }}
        LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }}
  #
  # Runs after the installer build jobs listed in `needs`.  Pulls each
  # installer image (versions and image tags come from the
  # prepare-docker-build outputs), copies the built files out of a created
  # (never started) container into <package>/<version>/<arch>/ folders and
  # commits the result to the binary-library branch.
  #
  commit-binary-files:
    name: Store installers
    needs:
      - prepare-docker-build
      - build-qpdf-debs
      - build-jbig2enc
      - build-psycopg2-wheel
      - build-pikepdf-wheel
    runs-on: ubuntu-22.04
    steps:
      # The working tree is the binary-library branch, which is where the
      # extracted files get committed in the final step
      -
        name: Checkout
        uses: actions/checkout@v3
        with:
          ref: binary-library
      -
        name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      # tree is only needed for the "Show file structure" step below
      -
        name: Install system dependencies
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq --no-install-recommends tree
      # Unlike the other packages, qpdf is pulled once without --platform and
      # all three architecture folders are copied from the same container.
      # Note the qpdf-extract container is not removed afterwards.
      -
        name: Extract qpdf files
        run: |
          version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }}
          tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }}
          docker pull --quiet ${tag}
          docker create --name qpdf-extract ${tag}
          mkdir --parents qpdf/${version}/amd64
          docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version}
          mkdir --parents qpdf/${version}/arm64
          docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version}
          mkdir --parents qpdf/${version}/armv7
          docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version}
      # Repeated per platform (amd64, arm64, arm/v7): pull, create a container,
      # copy out the wheels/ directory, move its contents up one level and
      # delete the emptied wheels/ directory.  The container is removed after
      # each platform so the same container name can be used again.
      -
        name: Extract psycopg2 files
        run: |
          version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }}
          tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }}
          docker pull --quiet --platform linux/amd64 ${tag}
          docker create --platform linux/amd64 --name psycopg2-extract ${tag}
          mkdir --parents psycopg2/${version}/amd64
          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64
          mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64
          rm -r psycopg2/${version}/amd64/wheels/
          docker rm psycopg2-extract
          docker pull --quiet --platform linux/arm64 ${tag}
          docker create --platform linux/arm64 --name psycopg2-extract ${tag}
          mkdir --parents psycopg2/${version}/arm64
          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64
          mv psycopg2/${version}/arm64/wheels/* psycopg2/${version}/arm64
          rm -r psycopg2/${version}/arm64/wheels/
          docker rm psycopg2-extract
          docker pull --quiet --platform linux/arm/v7 ${tag}
          docker create --platform linux/arm/v7 --name psycopg2-extract ${tag}
          mkdir --parents psycopg2/${version}/armv7
          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7
          mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7
          rm -r psycopg2/${version}/armv7/wheels/
          docker rm psycopg2-extract
      # Same per-platform sequence as the psycopg2 step, for the pikepdf image
      -
        name: Extract pikepdf files
        run: |
          version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
          tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }}
          docker pull --quiet --platform linux/amd64 ${tag}
          docker create --platform linux/amd64 --name pikepdf-extract ${tag}
          mkdir --parents pikepdf/${version}/amd64
          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64
          mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64
          rm -r pikepdf/${version}/amd64/wheels/
          docker rm pikepdf-extract
          docker pull --quiet --platform linux/arm64 ${tag}
          docker create --platform linux/arm64 --name pikepdf-extract ${tag}
          mkdir --parents pikepdf/${version}/arm64
          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64
          mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64
          rm -r pikepdf/${version}/arm64/wheels/
          docker rm pikepdf-extract
          docker pull --quiet --platform linux/arm/v7 ${tag}
          docker create --platform linux/arm/v7 --name pikepdf-extract ${tag}
          mkdir --parents pikepdf/${version}/armv7
          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7
          mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7
          rm -r pikepdf/${version}/armv7/wheels/
          docker rm pikepdf-extract
      # Per platform: copy the image's build/ directory out and move its
      # contents up one level.  Unlike the wheel steps, the emptied build/
      # directory is not deleted here.
      -
        name: Extract jbig2enc files
        run: |
          version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }}
          tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag }}
          docker pull --quiet --platform linux/amd64 ${tag}
          docker create --platform linux/amd64 --name jbig2enc-extract ${tag}
          mkdir --parents jbig2enc/${version}/amd64
          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/
          mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/
          docker rm jbig2enc-extract
          docker pull --quiet --platform linux/arm64 ${tag}
          docker create --platform linux/arm64 --name jbig2enc-extract ${tag}
          mkdir --parents jbig2enc/${version}/arm64
          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64
          mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/
          docker rm jbig2enc-extract
          docker pull --quiet --platform linux/arm/v7 ${tag}
          docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag}
          mkdir --parents jbig2enc/${version}/armv7
          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7
          mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/
          docker rm jbig2enc-extract
      -
        name: Show file structure
        run: |
          tree .
      # `|| true` keeps this step from failing when the commit or the push
      # returns non-zero (e.g. there was nothing new to commit).  This also
      # means a genuinely failed push does not fail the job.
      -
        name: Commit files
        run: |
          git config --global user.name "github-actions"
          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add pikepdf/ qpdf/ psycopg2/ jbig2enc/
          git commit -m "Updating installer packages" || true
          git push origin || true
--- a/53
+++ b/53
@ -1,19 +1,5 @@
 # syntax=docker/dockerfile:1.4
 # Pull the installer images from the library
 # These are all built previously
 # They provide either a .deb or .whl
 ARG JBIG2ENC_VERSION
 ARG QPDF_VERSION
 ARG PIKEPDF_VERSION
 ARG PSYCOPG2_VERSION
 FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder
 FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder
 FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder
 FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder
 FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend
 # This stage compiles the frontend
@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n
 LABEL org.opencontainers.image.licenses="GPL-3.0-only"
 ARG DEBIAN_FRONTEND=noninteractive
-# Buildx provided
+# Buildx provided, must be defined to use though
 ARG TARGETARCH
 ARG TARGETVARIANT
 # Workflow provided
 ARG JBIG2ENC_VERSION
 ARG QPDF_VERSION
 ARG PIKEPDF_VERSION
 ARG PSYCOPG2_VERSION
 #
 # Begin installation and configuration
 # Order the steps below from least often changed to most
 #
 # copy jbig2enc
 # Basically will never change again
 COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/
 COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/
 COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/
 # Packages need for running
 ARG RUNTIME_PACKAGES="\
  # Python
@ -198,19 +181,29 @@ RUN set -eux \
 # Install the built packages from the installer library images
 # Use mounts to avoid copying installer files into the image
 # These change sometimes
-RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \
+RUN set -eux \
-    --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \
+  && echo "Getting binaries" \
-    --mount=type=bind,from=pikepdf-builder,target=/pikepdf \
+    && mkdir paperless-ngx \
-  set -eux \
+    && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \
    && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \
    && cd paperless-ngx \
    # Setting a specific revision ensures we know what this installed
    # and ensures cache breaking on changes
  && echo "Installing jbig2enc" \
    && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \
    && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \
  && echo "Installing qpdf" \
-    && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
+    && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
-    && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
+    && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
  && echo "Installing pikepdf and dependencies" \
-    && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \
+    && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \
    && python3 -m pip list \
  && echo "Installing psycopg2" \
-    && python3 -m pip install --no-cache-dir /psycopg2/usr/src/wheels/psycopg2*.whl \
+    && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \
-    && python3 -m pip list
+    && python3 -m pip list \
  && echo "Cleaning up image layer" \
    && cd ../ \
    && rm -rf paperless-ngx
 WORKDIR /usr/src/paperless/src/
--- a/docker-builders/Dockerfile.jbig2enc
+++ b/docker-builders/Dockerfile.jbig2enc
@ -29,7 +29,20 @@ RUN set -eux \
    && ./autogen.sh \
    && ./configure \
    && make \
  && echo "Gathering package data" \
    && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \
  && echo "Cleaning up image" \
    && apt-get -y purge ${BUILD_PACKAGES} \
    && apt-get -y autoremove --purge \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* \
  && echo "Moving files around" \
    && mkdir build \
    # Unlink a symlink that causes problems
    && unlink ./src/.libs/libjbig2enc.la \
    # Move what the link pointed to
    && mv ./src/libjbig2enc.la ./build/ \
    # Move the shared library .so files
    && mv ./src/.libs/libjbig2enc* ./build/ \
    # And move the cli binary
    && mv ./src/jbig2 ./build/ \
    && mv ./pkg-list.txt ./build/
--- a/docker-builders/Dockerfile.pikepdf
+++ b/docker-builders/Dockerfile.pikepdf
@ -7,12 +7,17 @@
 # Default to pulling from the main repo registry when manually building
 ARG REPO="paperless-ngx/paperless-ngx"
 # This does nothing, except provide a name for a copy below
 ARG QPDF_VERSION
 FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder
-# This does nothing, except provide a name for a copy below
+#
-
+# Stage: builder
-FROM python:3.9-slim-bullseye as main
+# Purpose:
 #  - Build the pikepdf wheel
 #  - Build any dependent wheels which can't be found
 #
 FROM python:3.9-slim-bullseye as builder
 LABEL org.opencontainers.image.description="A intermediate image with pikepdf wheel built"
@ -100,3 +105,14 @@ RUN set -eux \
    && apt-get -y purge ${BUILD_PACKAGES} \
    && apt-get -y autoremove --purge \
    && rm -rf /var/lib/apt/lists/*
 #
 # Stage: package
 # Purpose: Holds the compiled .whl files in a tiny image to pull
 #
 FROM alpine:3.17 as package
 WORKDIR /usr/src/wheels/
 COPY --from=builder /usr/src/wheels/*.whl ./
 COPY --from=builder /usr/src/wheels/pkg-list.txt ./
--- a/docker-builders/Dockerfile.psycopg2
+++ b/docker-builders/Dockerfile.psycopg2
@ -2,7 +2,12 @@
 # Inputs:
 #    - PSYCOPG2_VERSION - Version to build
-FROM python:3.9-slim-bullseye as main
+#
 # Stage: builder
 # Purpose:
 #  - Build the psycopg2 wheel
 #
 FROM python:3.9-slim-bullseye as builder
 LABEL org.opencontainers.image.description="A intermediate image with psycopg2 wheel built"
@ -48,3 +53,14 @@ RUN set -eux \
    && apt-get -y purge ${BUILD_PACKAGES} \
    && apt-get -y autoremove --purge \
    && rm -rf /var/lib/apt/lists/*
 #
 # Stage: package
 # Purpose: Holds the compiled .whl files in a tiny image to pull
 #
 FROM alpine:3.17 as package
 WORKDIR /usr/src/wheels/
 COPY --from=builder /usr/src/wheels/*.whl ./
 COPY --from=builder /usr/src/wheels/pkg-list.txt ./
--- a/docker-builders/README.md
+++ b/docker-builders/README.md
@ -0,0 +1,57 @@
 # Installer Library
 This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image.
 ## [jbig2enc](https://github.com/agl/jbig2enc)
 ### Why
 JBIG is an image coding which can achieve better compression of images for PDFs.
 ### What
 The Docker image builds a shared library file and utility, which is copied into the correct location in the final image.
 ### Updating
 1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf)
 2. Update `.build-config.json` to the given version
 3. If the Debian specific version has incremented, update `Dockerfile.qpdf`
 See Also:
 - [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html)
 ## [psycopg2](https://www.psycopg.org/)
 ### Why
 The pre-built wheels of psycopg2 are built on Debian 9, which provides a quite old version of libpq-dev. This causes issues with authentication methods.
 ### What
 The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`.
 See Also:
 - [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266)
 ## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html)
 ### Why
 qpdf and its library provide tools to read, manipulate and fix up PDFs. Version 11 is also required by `pikepdf` 6+, and Debian 9 does not provide any version above 10.
 ### What
 The Docker image cross compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/`
 ## [pikepdf](https://pikepdf.readthedocs.io/en/latest/)
 ### Why
 Required by OCRMyPDF, this is a general purpose library for PDF manipulation in Python via the qpdf libraries.
 ### What
 The built wheels are placed into `/usr/src/wheels/`
--- a/docker/docker-prepare.sh
+++ b/docker/docker-prepare.sh
@ -80,7 +80,7 @@ django_checks() {
 search_index() {
-	local -r index_version=1
+	local -r index_version=2
 	local -r index_version_file=${DATA_DIR}/.index_version
 	if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption
 folder, but before any processing of the document is performed. This
 script can access the following relevant environment variables set:
- `DOCUMENT_SOURCE_PATH`
+| Environment Variable    | Description                                                  |
 | ----------------------- | ------------------------------------------------------------ |
 | `DOCUMENT_SOURCE_PATH`  | Original path of the consumed document                       |
 | `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on |
 !!! note
    Pre-consume scripts which modify the document should only change
    the `DOCUMENT_WORKING_PATH` file or a second consume task may
    be triggered, leading to failures as two tasks work on the
    same document path
 A simple but common example for this would be creating a simple script
 like this:
@ -130,7 +140,7 @@ like this:
 ```bash
 #!/usr/bin/env bash
-pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH}
+pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH}
 ```
 `/etc/paperless.conf`
@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and
 has moved it into paperless. It receives the following environment
 variables:
- `DOCUMENT_ID`
+| Environment Variable         | Description                                   |
- `DOCUMENT_FILE_NAME`
+| ---------------------------- | --------------------------------------------- |
- `DOCUMENT_CREATED`
+| `DOCUMENT_ID`                | Database primary key of the document          |
- `DOCUMENT_MODIFIED`
+| `DOCUMENT_FILE_NAME`         | Formatted filename, not including paths       |
- `DOCUMENT_ADDED`
+| `DOCUMENT_CREATED`           | Date & time when document was created         |
- `DOCUMENT_SOURCE_PATH`
+| `DOCUMENT_MODIFIED`          | Date & time when document was last modified   |
- `DOCUMENT_ARCHIVE_PATH`
+| `DOCUMENT_ADDED`             | Date & time when document was added           |
- `DOCUMENT_THUMBNAIL_PATH`
+| `DOCUMENT_SOURCE_PATH`       | Path to the original document file            |
- `DOCUMENT_DOWNLOAD_URL`
+| `DOCUMENT_ARCHIVE_PATH`      | Path to the generated archive file (if any)   |
- `DOCUMENT_THUMBNAIL_URL`
+| `DOCUMENT_THUMBNAIL_PATH`    | Path to the generated thumbnail               |
- `DOCUMENT_CORRESPONDENT`
+| `DOCUMENT_DOWNLOAD_URL`      | URL for document download                     |
- `DOCUMENT_TAGS`
+| `DOCUMENT_THUMBNAIL_URL`     | URL for the document thumbnail                |
- `DOCUMENT_ORIGINAL_FILENAME`
+| `DOCUMENT_CORRESPONDENT`     | Assigned correspondent (if any)               |
 | `DOCUMENT_TAGS`              | Comma separated list of tags applied (if any) |
 | `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document                 |
-The script can be in any language, but for a simple shell script
+The script can be in any language, A simple shell script example:
 example, you can take a look at
 [post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh)
 in this project.
-The post consumption script cannot cancel the consumption process.
+```bash title="post-consumption-example"
 --8<-- "./scripts/post-consumption-example.sh"
 ```
 !!! note
    The post consumption script cannot cancel the consumption process.
 !!! warning
    The post consumption script should not modify the document files
    directly
 The script's stdout and stderr will be logged line by line to the
 webserver log, along with the exit code of the script.
--- a/docs/changelog.md
+++ b/docs/changelog.md
@ -2,6 +2,9 @@
 ## paperless-ngx 1.12.1
 _Note: Version 1.12.x introduced searching of comments which will work for comments added after the upgrade but a reindex of the search index is required in order to be able to search
 older comments. The Docker image will automatically perform this reindex, bare metal installations will have to perform this manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._
 ### Bug Fixes
 - Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513))
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -41,6 +41,7 @@ markdown_extensions:
      anchor_linenums: true
  - pymdownx.superfences
  - pymdownx.inlinehilite
  - pymdownx.snippets
 strict: true
 nav:
    - index.md
--- a/src-ui/messages.xlf
+++ b/src-ui/messages.xlf
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@ -204,6 +204,10 @@ export class DocumentDetailComponent
            )
            .subscribe({
              next: (titleValue) => {
                // In the rare case when the field changed just after debounced event was fired.
                // We dont want to overwrite whats actually in the text field, so just return
                if (titleValue !== this.titleInput.value) return
                this.title = titleValue
                this.documentForm.patchValue({ title: titleValue })
              },
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@ -26,11 +26,11 @@
        </div>
        <p class="card-text">
          <span *ngIf="document.__search_hit__ && document.__search_hit__.highlights" [innerHtml]="document.__search_hit__.highlights"></span>
-          <span *ngIf="document.__search_hit__ && document.__search_hit__.comment_highlights" class="d-block">
+          <span *ngFor="let highlight of searchCommentHighlights" class="d-block">
            <svg width="1em" height="1em" fill="currentColor" class="me-2">
              <use xlink:href="assets/bootstrap-icons.svg#chat-left-text"/>
            </svg>
-            <span [innerHtml]="document.__search_hit__.comment_highlights"></span>
+            <span [innerHtml]="highlight"></span>
          </span>
          <span *ngIf="!document.__search_hit__" class="result-content">{{contentTrimmed}}</span>
        </p>
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
@ -70,6 +70,22 @@ export class DocumentCardLargeComponent {
    }
  }
  /**
   * Comment highlight fragments of the current search hit.
   *
   * The hit's `comment_highlights` string is split on commas and only the
   * pieces containing a `<span` (i.e. an actual match) are kept.  Returns an
   * empty array when the document has no search hit or no comment highlights.
   */
  get searchCommentHighlights() {
    const searchHit = this.document['__search_hit__']
    if (!searchHit || !searchHit.comment_highlights) {
      return []
    }

    // only show comments with a match
    const fragments = (searchHit.comment_highlights as string).split(',')
    return fragments.filter((fragment) => fragment.includes('<span'))
  }
  getIsThumbInverted() {
    return this.settingsService.get(SETTINGS_KEYS.DARK_MODE_THUMB_INVERTED)
  }
--- a/src-ui/src/app/components/manage/settings/settings.component.html
+++ b/src-ui/src/app/components/manage/settings/settings.component.html
@ -143,7 +143,7 @@
            <p i18n>
              <em>No tracking data is collected by the app in any way.</em>
            </p>
-            <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check>
+            <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check>
          </div>
        </div>
--- a/src-ui/src/environments/environment.prod.ts
+++ b/src-ui/src/environments/environment.prod.ts
@ -5,7 +5,7 @@ export const environment = {
  apiBaseUrl: document.baseURI + 'api/',
  apiVersion: '2',
  appTitle: 'Paperless-ngx',
-  version: '1.12.1',
+  version: '1.12.1-dev',
  webSocketHost: window.location.host,
  webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
  webSocketBaseUrl: base_url.pathname + 'ws/',
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@ -4,7 +4,6 @@ import shutil
 import tempfile
 from dataclasses import dataclass
 from functools import lru_cache
 from math import ceil
 from pathlib import Path
 from typing import List
 from typing import Optional
@ -12,10 +11,9 @@ from typing import Optional
 import magic
 from django.conf import settings
 from pdf2image import convert_from_path
 from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
 from pikepdf import PasswordError
 from pikepdf import Pdf
 from pikepdf import PdfImage
 from PIL import Image
 from PIL import ImageSequence
 from pyzbar import pyzbar
@ -154,52 +152,15 @@ def scan_file_for_barcodes(
    (page_number, barcode_text) tuples
    """
    def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
        """
        Scan the images embedded in each page of a PDF for barcodes.

        The PDF is opened with pikepdf; every image listed on a page is
        converted to a Pillow image, shrunk if it is large, and handed to
        barcode_reader.

        Returns a list of Barcode(page_num, barcode_value) entries, where
        page_num is the zero-based index produced by enumerate(pdf.pages).

        Raises BarcodeImageFormatError if an image uses the /CCITTFaxDecode
        filter.  as_pil_image() may also raise for images it cannot
        transcode; nothing is caught here, so the caller has to handle it.
        """
        detected_barcodes = []
        with Pdf.open(pdf_filepath) as pdf:
            for page_num, page in enumerate(pdf.pages):
                for image_key in page.images:
                    pdfimage = PdfImage(page.images[image_key])
                    # This type is known to have issues:
                    # https://github.com/pikepdf/pikepdf/issues/401
                    if "/CCITTFaxDecode" in pdfimage.filters:
                        raise BarcodeImageFormatError(
                            "Unable to decode CCITTFaxDecode images",
                        )
                    # Not all images can be transcoded to a PIL image, which
                    # is what pyzbar expects to receive, so this may
                    # raise an exception, triggering fallback
                    pillow_img = pdfimage.as_pil_image()
                    # Scale the image down
                    # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
                    # TLDR: zbar has issues with larger images
                    width, height = pillow_img.size
                    # First pass: bring the width to 1024 or less.  Dividing by
                    # an integer factor (ceil) shrinks both sides by the same
                    # amount; int() truncates the resulting sizes.
                    if width > 1024:
                        scaler = ceil(width / 1024)
                        new_width = int(width / scaler)
                        new_height = int(height / scaler)
                        pillow_img = pillow_img.resize((new_width, new_height))
                    # Re-read the size, the width pass may have changed it
                    width, height = pillow_img.size
                    # Second pass: same approach, limiting the height to 2048
                    if height > 2048:
                        scaler = ceil(height / 2048)
                        new_width = int(width / scaler)
                        new_height = int(height / scaler)
                        pillow_img = pillow_img.resize((new_width, new_height))
                    # One image may yield several barcode values; each is
                    # recorded against the page the image was found on
                    for barcode_value in barcode_reader(pillow_img):
                        detected_barcodes.append(Barcode(page_num, barcode_value))
        return detected_barcodes
    def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
        detected_barcodes = []
        # use a temporary directory in case the file is too big to handle in memory
        with tempfile.TemporaryDirectory() as path:
-            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
+            pages_from_path = convert_from_path(
                pdf_filepath,
                dpi=300,
                output_folder=path,
            )
            for current_page_number, page in enumerate(pages_from_path):
                for barcode_value in barcode_reader(page):
                    detected_barcodes.append(
@ -219,27 +180,19 @@ def scan_file_for_barcodes(
        # Always try pikepdf first, it's usually fine, faster and
        # uses less memory
        try:
-            barcodes = _pikepdf_barcode_scan(pdf_filepath)
+            barcodes = _pdf2image_barcode_scan(pdf_filepath)
        # Password protected files can't be checked
-        except PasswordError as e:
+        # This is the exception raised for those
        except PDFPageCountError as e:
            logger.warning(
                f"File is likely password protected, not checking for barcodes: {e}",
            )
-        # Handle pikepdf related image decoding issues with a fallback to page
+        # This file is really borked, allow the consumption to continue
-        # by page conversion to images in a temporary directory
+        # but it may fail further on
-        except Exception as e:
+        except Exception as e:  # pragma: no cover
            logger.warning(
-                f"Falling back to pdf2image because: {e}",
+                f"Exception during barcode scanning: {e}",
            )
            try:
                barcodes = _pdf2image_barcode_scan(pdf_filepath)
            # This file is really borked, allow the consumption to continue
            # but it may fail further on
            except Exception as e:  # pragma: no cover
                logger.warning(
                    f"Exception during barcode scanning: {e}",
                )
    else:
        logger.warning(
            f"Unsupported file format for barcode reader: {str(mime_type)}",
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -1,7 +1,10 @@
 import datetime
 import hashlib
 import os
 import shutil
 import tempfile
 import uuid
 from pathlib import Path
 from subprocess import CompletedProcess
 from subprocess import run
 from typing import Optional
@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
    def __init__(self):
        super().__init__()
-        self.path = None
+        self.path: Optional[Path] = None
        self.original_path: Optional[Path] = None
        self.filename = None
        self.override_title = None
        self.override_correspondent_id = None
@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
        self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
-        filepath_arg = os.path.normpath(self.path)
+        working_file_path = str(self.path)
        original_file_path = str(self.original_path)
        script_env = os.environ.copy()
-        script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
+        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
        script_env["DOCUMENT_WORKING_PATH"] = working_file_path
        try:
            completed_proc = run(
                args=[
                    settings.PRE_CONSUME_SCRIPT,
-                    filepath_arg,
+                    original_file_path,
                ],
                env=script_env,
                capture_output=True,
@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
                exception=e,
            )
-    def run_post_consume_script(self, document):
+    def run_post_consume_script(self, document: Document):
        if not settings.POST_CONSUME_SCRIPT:
            return
@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
        Return the document object if it was successfully created.
        """
-        self.path = path
+        self.path = Path(path).resolve()
-        self.filename = override_filename or os.path.basename(path)
+        self.filename = override_filename or self.path.name
        self.override_title = override_title
        self.override_correspondent_id = override_correspondent_id
        self.override_document_type_id = override_document_type_id
@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
        self.log("info", f"Consuming {self.filename}")
        # For the actual work, copy the file into a tempdir
        self.original_path = self.path
        tempdir = tempfile.TemporaryDirectory(
            prefix="paperless-ngx",
            dir=settings.SCRATCH_DIR,
        )
        self.path = Path(tempdir.name) / Path(self.filename)
        shutil.copy(self.original_path, self.path)
        # Determine the parser class.
        mime_type = magic.from_file(self.path, mime=True)
@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
                # Delete the file only if it was successfully consumed
                self.log("debug", f"Deleting file {self.path}")
                os.unlink(self.path)
                self.original_path.unlink()
                # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                shadow_file = os.path.join(
-                    os.path.dirname(self.path),
+                    os.path.dirname(self.original_path),
-                    "._" + os.path.basename(self.path),
+                    "._" + os.path.basename(self.original_path),
                )
                if os.path.isfile(shadow_file):
@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
            )
        finally:
            document_parser.cleanup()
            tempdir.cleanup()
        self.run_post_consume_script(document)
--- a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png
+++ b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png
--- a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png
+++ b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
                c = Consumer()
-                c.path = "path-to-file"
+                c.original_path = "path-to-file"
                c.path = "/tmp/somewhere/path-to-file"
                c.run_pre_consume_script()
                m.assert_called_once()
@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
                args, kwargs = m.call_args
                command = kwargs["args"]
                environment = kwargs["env"]
                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], "path-to-file")
                self.assertDictContainsSubset(
                    {
                        "DOCUMENT_SOURCE_PATH": c.original_path,
                        "DOCUMENT_WORKING_PATH": c.path,
                    },
                    environment,
                )
    @mock.patch("documents.consumer.Consumer.log")
    def test_script_with_output(self, mocked_log):
        """
@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
                m.assert_called_once()
-                args, kwargs = m.call_args
+                _, kwargs = m.call_args
                command = kwargs["args"]
                environment = kwargs["env"]
                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], str(doc.pk))
@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
                self.assertEqual(command[7], "my_bank")
                self.assertCountEqual(command[8].split(","), ["a", "b"])
                self.assertDictContainsSubset(
                    {
                        "DOCUMENT_ID": str(doc.pk),
                        "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
                        "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
                        "DOCUMENT_CORRESPONDENT": "my_bank",
                        "DOCUMENT_TAGS": "a,b",
                    },
                    environment,
                )
    def test_script_exit_non_zero(self):
        """
        GIVEN:
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@ -3,6 +3,7 @@ import shutil
 import tempfile
 from collections import namedtuple
 from contextlib import contextmanager
 from unittest import mock
 from django.apps import apps
 from django.db import connection
@ -86,6 +87,30 @@ class DirectoriesMixin:
        remove_dirs(self.dirs)
 class ConsumerProgressMixin:
    """
    Test mixin which replaces documents.consumer.Consumer._send_progress
    with a mock for the duration of each test.

    The mock is available to tests as self.send_progress_mock.
    """

    def setUp(self) -> None:
        # Start the patch before calling up the chain, so the mock is already
        # in place while the remaining setUp() implementations run
        self.send_progress_patcher = mock.patch(
            "documents.consumer.Consumer._send_progress",
        )
        self.send_progress_mock = self.send_progress_patcher.start()
        super().setUp()
    def tearDown(self) -> None:
        # Reverse order of setUp: the rest of the chain tears down first,
        # then the patch is removed
        super().tearDown()
        self.send_progress_patcher.stop()
 class DocumentConsumeDelayMixin:
    """
    Test mixin which replaces documents.tasks.consume_file.delay with a
    mock for the duration of each test.

    The mock is available to tests as self.consume_file_mock.
    NOTE(review): presumably this keeps tests from queueing a real
    consume task - confirm against the tests which use it.
    """

    def setUp(self) -> None:
        # Start the patch before calling up the chain, so the mock is already
        # in place while the remaining setUp() implementations run
        self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
        self.consume_file_mock = self.consume_file_patcher.start()
        super().setUp()
    def tearDown(self) -> None:
        # Reverse order of setUp: the rest of the chain tears down first,
        # then the patch is removed
        super().tearDown()
        self.consume_file_patcher.stop()
 class TestMigrations(TransactionTestCase):
    @property
    def app(self):
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -477,21 +477,14 @@ class DocumentViewSet(
 class SearchResultSerializer(DocumentSerializer):
    def to_representation(self, instance):
        doc = Document.objects.get(id=instance["id"])
-        comments = ""
+        comments = ",".join(
-        if hasattr(instance.results.q, "subqueries"):
+            [str(c.comment) for c in Comment.objects.filter(document=instance["id"])],
-            commentTerm = instance.results.q.subqueries[0]
+        )
            comments = ",".join(
                [
                    str(c.comment)
                    for c in Comment.objects.filter(document=instance["id"])
                    if commentTerm.text in c.comment
                ],
            )
        r = super().to_representation(doc)
        r["__search_hit__"] = {
            "score": instance.score,
            "highlights": instance.highlights("content", text=doc.content),
-            "comment_highlights": instance.highlights("content", text=comments)
+            "comment_highlights": instance.highlights("comments", text=comments)
            if doc
            else None,
            "rank": instance.rank,
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser):
                "paperHeight": "11.7",
                "scale": "1.0",
            }
            # Set the output format of the resulting PDF
            # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
            if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
                data["pdfFormat"] = "PDF/A-2b"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
                data["pdfFormat"] = "PDF/A-1a"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
                data["pdfFormat"] = "PDF/A-3b"
            try:
                response = requests.post(
                    url,
--- a/src/paperless_mail/tests/test_parsers.py
+++ b/src/paperless_mail/tests/test_parsers.py
@ -573,8 +573,8 @@ class TestParser(TestCase):
            self.parser.gotenberg_server + "/forms/chromium/convert/html",
            mock_post.call_args.args[0],
        )
-        self.assertEqual({}, mock_post.call_args.kwargs["headers"])
+        self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
-        self.assertEqual(
+        self.assertDictEqual(
            {
                "marginTop": "0.1",
                "marginBottom": "0.1",
@ -583,6 +583,7 @@ class TestParser(TestCase):
                "paperWidth": "8.27",
                "paperHeight": "11.7",
                "scale": "1.0",
                "pdfFormat": "PDF/A-2b",
            },
            mock_post.call_args.kwargs["data"],
        )
@ -663,8 +664,8 @@ class TestParser(TestCase):
            self.parser.gotenberg_server + "/forms/chromium/convert/html",
            mock_post.call_args.args[0],
        )
-        self.assertEqual({}, mock_post.call_args.kwargs["headers"])
+        self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
-        self.assertEqual(
+        self.assertDictEqual(
            {
                "marginTop": "0.1",
                "marginBottom": "0.1",
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser):
                ),
            }
            headers = {}
            data = {}
            # Set the output format of the resulting PDF
            # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
            if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
                data["pdfFormat"] = "PDF/A-2b"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
                data["pdfFormat"] = "PDF/A-1a"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
                data["pdfFormat"] = "PDF/A-3b"
            try:
-                response = requests.post(url, files=files, headers=headers)
+                response = requests.post(url, files=files, headers=headers, data=data)
                response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
                raise ParseError(