Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-30 03:56:23 -05:00)

Commit: Merge remote-tracking branch 'origin/dev'
Author: Trenton Holmes

.github/scripts/cleanup-tags.py (vendored): 71 lines changed
							| @@ -15,6 +15,8 @@ from github import ContainerPackage | ||||
| from github import GithubBranchApi | ||||
| from github import GithubContainerRegistryApi | ||||
|  | ||||
| import docker | ||||
|  | ||||
| logger = logging.getLogger("cleanup-tags") | ||||
|  | ||||
|  | ||||
| @@ -151,12 +153,16 @@ class RegistryTagsCleaner: | ||||
|             for tag in sorted(self.tags_to_keep): | ||||
|                 full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}" | ||||
|                 logger.info(f"Checking manifest for {full_name}") | ||||
|                 # TODO: It would be nice to use RegistryData from docker | ||||
|                 # except the ID doesn't map to anything in the manifest | ||||
|                 try: | ||||
|                     proc = subprocess.run( | ||||
|                         [ | ||||
|                             shutil.which("docker"), | ||||
|                             "manifest", | ||||
|                             "buildx", | ||||
|                             "imagetools", | ||||
|                             "inspect", | ||||
|                             "--raw", | ||||
|                             full_name, | ||||
|                         ], | ||||
|                         capture_output=True, | ||||
| @@ -241,6 +247,65 @@ class RegistryTagsCleaner: | ||||
|         # By default, keep anything which is tagged | ||||
|         self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys())) | ||||
|  | ||||
|     def check_tags_pull(self): | ||||
|         """ | ||||
|         This method uses the Docker Python SDK to confirm all tags which were | ||||
|         kept still pull, for all platforms. | ||||
|  | ||||
|         TODO: This is much slower (although more comprehensive).  Maybe a Pool? | ||||
|         """ | ||||
|         logger.info("Beginning confirmation step") | ||||
|         client = docker.from_env() | ||||
|         imgs = [] | ||||
|         for tag in sorted(self.tags_to_keep): | ||||
|             repository = f"ghcr.io/{self.repo_owner}/{self.package_name}" | ||||
|             for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]: | ||||
|                 # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64 | ||||
|                 # skip others in this case | ||||
|                 if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0": | ||||
|                     continue | ||||
|                 # Skip beta and release candidate tags | ||||
|                 elif "beta" in tag: | ||||
|                     continue | ||||
|  | ||||
|                 # Build the platform name | ||||
|                 if variant is not None: | ||||
|                     platform = f"linux/{arch}/{variant}" | ||||
|                 else: | ||||
|                     platform = f"linux/{arch}" | ||||
|  | ||||
|                 try: | ||||
|                     logger.info(f"Pulling {repository}:{tag} for {platform}") | ||||
|                     image = client.images.pull( | ||||
|                         repository=repository, | ||||
|                         tag=tag, | ||||
|                         platform=platform, | ||||
|                     ) | ||||
|                     imgs.append(image) | ||||
|                 except docker.errors.APIError as e: | ||||
|                     logger.error( | ||||
|                         f"Failed to pull {repository}:{tag}: {e}", | ||||
|                     ) | ||||
|  | ||||
|             # Prevent out of space errors by removing after a few | ||||
|             # pulls | ||||
|             if len(imgs) > 50: | ||||
|                 for image in imgs: | ||||
|                     try: | ||||
|                         client.images.remove(image.id) | ||||
|                     except docker.errors.APIError as e: | ||||
|                         err_str = str(e) | ||||
|                         # Ignore attempts to remove images that are partly shared | ||||
|                         # Ignore images which are somehow gone already | ||||
|                         if ( | ||||
|                             "must be forced" not in err_str | ||||
|                             and "No such image" not in err_str | ||||
|                         ): | ||||
|                             logger.error( | ||||
|                                 f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}", | ||||
|                             ) | ||||
|                 imgs = [] | ||||
|  | ||||
|  | ||||
| class MainImageTagsCleaner(RegistryTagsCleaner): | ||||
|     def decide_what_tags_to_keep(self): | ||||
| @@ -397,6 +462,10 @@ def _main(): | ||||
|             # Clean images which are untagged | ||||
|             cleaner.clean_untagged(args.is_manifest) | ||||
|  | ||||
|             # Verify remaining tags still pull | ||||
|             if args.is_manifest: | ||||
|                 cleaner.check_tags_pull() | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     _main() | ||||
|   | ||||
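The TODO in `check_tags_pull` above notes the serial pulls are slow and floats the idea of a Pool. A minimal sketch of that idea with a thread pool follows; the helper names and worker count are illustrative, and it assumes one Docker SDK client per call since the client is not documented as thread-safe:

```python
# Illustrative only: a parallel variant of the pull-verification loop above.
from concurrent.futures import ThreadPoolExecutor

import docker


def _verify_pull(repository: str, tag: str, platform: str) -> bool:
    client = docker.from_env()  # one client per call, to avoid sharing sessions
    try:
        client.images.pull(repository=repository, tag=tag, platform=platform)
        return True
    except docker.errors.APIError:
        return False


def check_tags_pull_parallel(targets: list[tuple[str, str, str]]) -> bool:
    # targets: (repository, tag, platform) triples, built exactly as in the
    # method above; a small pool keeps disk and network pressure manageable
    with ThreadPoolExecutor(max_workers=4) as pool:
        return all(pool.map(lambda t: _verify_pull(*t), targets))
```

The disk-space housekeeping from the original loop would still be needed; pulled images accumulate regardless of how they were pulled.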
.github/workflows/ci.yml (vendored): 6 lines changed
							| @@ -212,12 +212,6 @@ jobs: | ||||
|     name: Prepare Docker Pipeline Data | ||||
|     if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v')) | ||||
|     runs-on: ubuntu-22.04 | ||||
|     # If the push triggered the installer library workflow, wait for it to | ||||
|     # complete here.  This ensures the required versions for the final | ||||
|     # image have been built, while not waiting at all if the versions haven't changed | ||||
|     concurrency: | ||||
|       group: build-installer-library | ||||
|       cancel-in-progress: false | ||||
|     needs: | ||||
|       - documentation | ||||
|       - tests-backend | ||||
|   | ||||
.github/workflows/cleanup-tags.yml (vendored): 14 lines changed
							| @@ -62,9 +62,9 @@ jobs: | ||||
|         with: | ||||
|           python-version: "3.10" | ||||
|       - | ||||
|         name: Install httpx | ||||
|         name: Install Python libraries | ||||
|         run: | | ||||
|           python -m pip install httpx | ||||
|           python -m pip install httpx docker | ||||
|       # | ||||
|       # Clean up primary package | ||||
|       # | ||||
| @@ -81,13 +81,3 @@ jobs: | ||||
|         if: "${{ env.TOKEN != '' }}" | ||||
|         run: | | ||||
|           python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}" | ||||
|       # | ||||
|       # Verify tags which are left still pull | ||||
|       # | ||||
|       - | ||||
|         name: Check all tags still pull | ||||
|         run: | | ||||
|           ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }') | ||||
|           echo "Pulling all tags of ${ghcr_name}" | ||||
|           docker pull --quiet --all-tags ${ghcr_name} | ||||
|           docker image list | ||||
|   | ||||
.github/workflows/installer-library.yml (vendored): 139 lines changed
							| @@ -169,3 +169,142 @@ jobs: | ||||
|         PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} | ||||
|         PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }} | ||||
|         LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }} | ||||
|  | ||||
|   commit-binary-files: | ||||
|     name: Store installers | ||||
|     needs: | ||||
|       - prepare-docker-build | ||||
|       - build-qpdf-debs | ||||
|       - build-jbig2enc | ||||
|       - build-psycopg2-wheel | ||||
|       - build-pikepdf-wheel | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - | ||||
|         name: Checkout | ||||
|         uses: actions/checkout@v3 | ||||
|         with: | ||||
|           ref: binary-library | ||||
|       - | ||||
|         name: Set up Python | ||||
|         uses: actions/setup-python@v4 | ||||
|         with: | ||||
|           python-version: "3.9" | ||||
|       - | ||||
|         name: Install system dependencies | ||||
|         run: | | ||||
|           sudo apt-get update -qq | ||||
|           sudo apt-get install -qq --no-install-recommends tree | ||||
|       - | ||||
|         name: Extract qpdf files | ||||
|         run: | | ||||
|           version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }} | ||||
|           tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }} | ||||
|  | ||||
|           docker pull --quiet ${tag} | ||||
|           docker create --name qpdf-extract ${tag} | ||||
|  | ||||
|           mkdir --parents qpdf/${version}/amd64 | ||||
|           docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version} | ||||
|  | ||||
|           mkdir --parents qpdf/${version}/arm64 | ||||
|           docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version} | ||||
|  | ||||
|           mkdir --parents qpdf/${version}/armv7 | ||||
|           docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version} | ||||
|       - | ||||
|         name: Extract psycopg2 files | ||||
|         run: | | ||||
|           version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }} | ||||
|           tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }} | ||||
|  | ||||
|           docker pull --quiet --platform linux/amd64 ${tag} | ||||
|           docker create --platform linux/amd64 --name psycopg2-extract ${tag} | ||||
|           mkdir --parents psycopg2/${version}/amd64 | ||||
|           docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64 | ||||
|           mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64 | ||||
|           rm -r psycopg2/${version}/amd64/wheels/ | ||||
|           docker rm psycopg2-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm64 ${tag} | ||||
|           docker create --platform linux/arm64 --name psycopg2-extract ${tag} | ||||
|           mkdir --parents psycopg2/${version}/arm64 | ||||
|           docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64 | ||||
|           mv psycopg2/${version}/arm64/wheels/* psycopg2/${version}/arm64 | ||||
|           rm -r psycopg2/${version}/arm64/wheels/ | ||||
|           docker rm psycopg2-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm/v7 ${tag} | ||||
|           docker create --platform linux/arm/v7 --name psycopg2-extract ${tag} | ||||
|           mkdir --parents psycopg2/${version}/armv7 | ||||
|           docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7 | ||||
|           mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7 | ||||
|           rm -r psycopg2/${version}/armv7/wheels/ | ||||
|           docker rm psycopg2-extract | ||||
|       - | ||||
|         name: Extract pikepdf files | ||||
|         run: | | ||||
|           version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} | ||||
|           tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }} | ||||
|  | ||||
|           docker pull --quiet --platform linux/amd64 ${tag} | ||||
|           docker create --platform linux/amd64 --name pikepdf-extract ${tag} | ||||
|           mkdir --parents pikepdf/${version}/amd64 | ||||
|           docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64 | ||||
|           mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64 | ||||
|           rm -r pikepdf/${version}/amd64/wheels/ | ||||
|           docker rm pikepdf-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm64 ${tag} | ||||
|           docker create --platform linux/arm64 --name pikepdf-extract ${tag} | ||||
|           mkdir --parents pikepdf/${version}/arm64 | ||||
|           docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64 | ||||
|           mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64 | ||||
|           rm -r pikepdf/${version}/arm64/wheels/ | ||||
|           docker rm pikepdf-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm/v7 ${tag} | ||||
|           docker create --platform linux/arm/v7 --name pikepdf-extract ${tag} | ||||
|           mkdir --parents pikepdf/${version}/armv7 | ||||
|           docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7 | ||||
|           mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7 | ||||
|           rm -r pikepdf/${version}/armv7/wheels/ | ||||
|           docker rm pikepdf-extract | ||||
|       - | ||||
|         name: Extract jbig2enc files | ||||
|         run: | | ||||
|           version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }} | ||||
|           tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag }} | ||||
|  | ||||
|           docker pull --quiet --platform linux/amd64 ${tag} | ||||
|           docker create --platform linux/amd64 --name jbig2enc-extract ${tag} | ||||
|           mkdir --parents jbig2enc/${version}/amd64 | ||||
|           docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/ | ||||
|           mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/ | ||||
|           docker rm jbig2enc-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm64 ${tag} | ||||
|           docker create --platform linux/arm64 --name jbig2enc-extract ${tag} | ||||
|           mkdir --parents jbig2enc/${version}/arm64 | ||||
|           docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64 | ||||
|           mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/ | ||||
|           docker rm jbig2enc-extract | ||||
|  | ||||
|           docker pull --quiet --platform linux/arm/v7 ${tag} | ||||
|           docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag} | ||||
|           mkdir --parents jbig2enc/${version}/armv7 | ||||
|           docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7 | ||||
|           mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/ | ||||
|           docker rm jbig2enc-extract | ||||
|       - | ||||
|         name: Show file structure | ||||
|         run: | | ||||
|           tree . | ||||
|       - | ||||
|         name: Commit files | ||||
|         run: | | ||||
|           git config --global user.name "github-actions" | ||||
|           git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" | ||||
|           git add pikepdf/ qpdf/ psycopg2/ jbig2enc/ | ||||
|           git commit -m "Updating installer packages" || true | ||||
|           git push origin || true | ||||
|   | ||||
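Each extract step above repeats the same pull/create/cp/rm sequence once per platform. For orientation only, the pattern condenses to a loop like this; the function and container names are illustrative, not part of the workflow:

```python
# Illustrative condensation of the repeated wheel-extraction steps above.
import subprocess
from pathlib import Path

PLATFORMS = [("linux/amd64", "amd64"), ("linux/arm64", "arm64"), ("linux/arm/v7", "armv7")]


def extract_wheels(image_tag: str, package: str, version: str) -> None:
    for platform, arch in PLATFORMS:
        dest = Path(package) / version / arch
        dest.mkdir(parents=True, exist_ok=True)
        subprocess.run(["docker", "pull", "--quiet", "--platform", platform, image_tag], check=True)
        subprocess.run(["docker", "create", "--platform", platform, "--name", "extract", image_tag], check=True)
        # Copy the wheel directory's contents out of the stopped container, then discard it
        subprocess.run(["docker", "cp", "extract:/usr/src/wheels/.", str(dest)], check=True)
        subprocess.run(["docker", "rm", "extract"], check=True)
```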
Dockerfile: 53 lines changed
							| @@ -1,19 +1,5 @@ | ||||
| # syntax=docker/dockerfile:1.4 | ||||
|  | ||||
| # Pull the installer images from the library | ||||
| # These are all built previously | ||||
| # They provide either a .deb or .whl | ||||
|  | ||||
| ARG JBIG2ENC_VERSION | ||||
| ARG QPDF_VERSION | ||||
| ARG PIKEPDF_VERSION | ||||
| ARG PSYCOPG2_VERSION | ||||
|  | ||||
| FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder | ||||
| FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder | ||||
| FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder | ||||
| FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder | ||||
|  | ||||
| FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend | ||||
|  | ||||
| # This stage compiles the frontend | ||||
| @@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n | ||||
| LABEL org.opencontainers.image.licenses="GPL-3.0-only" | ||||
|  | ||||
| ARG DEBIAN_FRONTEND=noninteractive | ||||
| # Buildx provided | ||||
| # Buildx provided, must be defined to use though | ||||
| ARG TARGETARCH | ||||
| ARG TARGETVARIANT | ||||
|  | ||||
| # Workflow provided | ||||
| ARG JBIG2ENC_VERSION | ||||
| ARG QPDF_VERSION | ||||
| ARG PIKEPDF_VERSION | ||||
| ARG PSYCOPG2_VERSION | ||||
|  | ||||
| # | ||||
| # Begin installation and configuration | ||||
| # Order the steps below from least often changed to most | ||||
| # | ||||
|  | ||||
| # copy jbig2enc | ||||
| # Basically will never change again | ||||
| COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/ | ||||
| COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/ | ||||
| COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/ | ||||
|  | ||||
| # Packages needed for running | ||||
| ARG RUNTIME_PACKAGES="\ | ||||
|   # Python | ||||
| @@ -198,19 +181,29 @@ RUN set -eux \ | ||||
| # Install the built packages from the installer library images | ||||
| # Use mounts to avoid copying installer files into the image | ||||
| # These change sometimes | ||||
| RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \ | ||||
|     --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \ | ||||
|     --mount=type=bind,from=pikepdf-builder,target=/pikepdf \ | ||||
|   set -eux \ | ||||
| RUN set -eux \ | ||||
|   && echo "Getting binaries" \ | ||||
|     && mkdir paperless-ngx \ | ||||
|     && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \ | ||||
|     && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \ | ||||
|     && cd paperless-ngx \ | ||||
|     # Setting a specific revision ensures we know what this installed | ||||
|     # and ensures cache breaking on changes | ||||
|   && echo "Installing jbig2enc" \ | ||||
|     && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \ | ||||
|     && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \ | ||||
|   && echo "Installing qpdf" \ | ||||
|     && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ | ||||
|     && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ | ||||
|     && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ | ||||
|     && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ | ||||
|   && echo "Installing pikepdf and dependencies" \ | ||||
|     && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \ | ||||
|     && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \ | ||||
|     && python3 -m pip list \ | ||||
|   && echo "Installing psycopg2" \ | ||||
|     && python3 -m pip install --no-cache-dir /psycopg2/usr/src/wheels/psycopg2*.whl \ | ||||
|     && python3 -m pip list | ||||
|     && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \ | ||||
|     && python3 -m pip list \ | ||||
|   && echo "Cleaning up image layer" \ | ||||
|     && cd ../ \ | ||||
|     && rm -rf paperless-ngx | ||||
|  | ||||
| WORKDIR /usr/src/paperless/src/ | ||||
|  | ||||
|   | ||||
| @@ -29,7 +29,20 @@ RUN set -eux \ | ||||
|     && ./autogen.sh \ | ||||
|     && ./configure \ | ||||
|     && make \ | ||||
|   && echo "Gathering package data" \ | ||||
|     && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \ | ||||
|   && echo "Cleaning up image" \ | ||||
|     && apt-get -y purge ${BUILD_PACKAGES} \ | ||||
|     && apt-get -y autoremove --purge \ | ||||
|     && rm -rf /var/lib/apt/lists/* | ||||
|     && rm -rf /var/lib/apt/lists/* \ | ||||
|   && echo "Moving files around" \ | ||||
|     && mkdir build \ | ||||
|     # Unlink a symlink that causes problems | ||||
|     && unlink ./src/.libs/libjbig2enc.la \ | ||||
|     # Move what the link pointed to | ||||
|     && mv ./src/libjbig2enc.la ./build/ \ | ||||
|     # Move the shared library .so files | ||||
|     && mv ./src/.libs/libjbig2enc* ./build/ \ | ||||
|     # And move the cli binary | ||||
|     && mv ./src/jbig2 ./build/ \ | ||||
|     && mv ./pkg-list.txt ./build/ | ||||
|   | ||||
| @@ -7,12 +7,17 @@ | ||||
| # Default to pulling from the main repo registry when manually building | ||||
| ARG REPO="paperless-ngx/paperless-ngx" | ||||
|  | ||||
| # This does nothing, except provide a name for a copy below | ||||
| ARG QPDF_VERSION | ||||
| FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder | ||||
|  | ||||
| # This does nothing, except provide a name for a copy below | ||||
|  | ||||
| FROM python:3.9-slim-bullseye as main | ||||
| # | ||||
| # Stage: builder | ||||
| # Purpose: | ||||
| #  - Build the pikepdf wheel | ||||
| #  - Build any dependent wheels which can't be found | ||||
| # | ||||
| FROM python:3.9-slim-bullseye as builder | ||||
|  | ||||
| LABEL org.opencontainers.image.description="An intermediate image with the pikepdf wheel built" | ||||
|  | ||||
| @@ -100,3 +105,14 @@ RUN set -eux \ | ||||
|     && apt-get -y purge ${BUILD_PACKAGES} \ | ||||
|     && apt-get -y autoremove --purge \ | ||||
|     && rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| # | ||||
| # Stage: package | ||||
| # Purpose: Holds the compiled .whl files in a tiny image to pull | ||||
| # | ||||
| FROM alpine:3.17 as package | ||||
|  | ||||
| WORKDIR /usr/src/wheels/ | ||||
|  | ||||
| COPY --from=builder /usr/src/wheels/*.whl ./ | ||||
| COPY --from=builder /usr/src/wheels/pkg-list.txt ./ | ||||
|   | ||||
| @@ -2,7 +2,12 @@ | ||||
| # Inputs: | ||||
| #    - PSYCOPG2_VERSION - Version to build | ||||
|  | ||||
| FROM python:3.9-slim-bullseye as main | ||||
| # | ||||
| # Stage: builder | ||||
| # Purpose: | ||||
| #  - Build the psycopg2 wheel | ||||
| # | ||||
| FROM python:3.9-slim-bullseye as builder | ||||
|  | ||||
| LABEL org.opencontainers.image.description="An intermediate image with the psycopg2 wheel built" | ||||
|  | ||||
| @@ -48,3 +53,14 @@ RUN set -eux \ | ||||
|     && apt-get -y purge ${BUILD_PACKAGES} \ | ||||
|     && apt-get -y autoremove --purge \ | ||||
|     && rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| # | ||||
| # Stage: package | ||||
| # Purpose: Holds the compiled .whl files in a tiny image to pull | ||||
| # | ||||
| FROM alpine:3.17 as package | ||||
|  | ||||
| WORKDIR /usr/src/wheels/ | ||||
|  | ||||
| COPY --from=builder /usr/src/wheels/*.whl ./ | ||||
| COPY --from=builder /usr/src/wheels/pkg-list.txt ./ | ||||
|   | ||||
docker-builders/README.md (new file): 57 lines
| # Installer Library | ||||
|  | ||||
| This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image. | ||||
|  | ||||
| ## [jbig2enc](https://github.com/agl/jbig2enc) | ||||
|  | ||||
| ### Why | ||||
|  | ||||
| JBIG is an image coding standard that can achieve better compression of images in PDFs. | ||||
|  | ||||
| ### What | ||||
|  | ||||
| The Docker image builds a shared library and a command line utility, which are copied into the correct locations in the final image. | ||||
|  | ||||
| See Also: | ||||
|  | ||||
| - [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html) | ||||
|  | ||||
| ## [psycopg2](https://www.psycopg.org/) | ||||
|  | ||||
| ### Why | ||||
|  | ||||
| The pre-built wheels of psycopg2 are built on Debian 9, which provides quite an old version of libpq-dev. This causes issues with authentication methods. | ||||
|  | ||||
| ### What | ||||
|  | ||||
| The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`. | ||||
|  | ||||
| See Also: | ||||
|  | ||||
| - [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266) | ||||
|  | ||||
| ## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html) | ||||
|  | ||||
| ### Why | ||||
|  | ||||
| qpdf and its library provide tools to read, manipulate and fix up PDFs. Version 11 is required by `pikepdf` 6+, and Debian 9 does not provide anything above version 10. | ||||
|  | ||||
| ### What | ||||
|  | ||||
| The Docker image cross-compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/`. | ||||
|  | ||||
| ### Updating | ||||
|  | ||||
| 1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf) | ||||
| 2. Update `.build-config.json` to the given version | ||||
| 3. If the Debian specific version has incremented, update `Dockerfile.qpdf` | ||||
|  | ||||
| ## [pikepdf](https://pikepdf.readthedocs.io/en/latest/) | ||||
|  | ||||
| ### Why | ||||
|  | ||||
| Required by OCRmyPDF, this is a general-purpose library for PDF manipulation in Python via the qpdf libraries. | ||||
|  | ||||
| ### What | ||||
|  | ||||
| The built wheels are placed into `/usr/src/wheels/`. | ||||
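Taken together, the sections above imply a binary-library branch layout along these lines (illustrative, with `<version>` and `<arch>` standing in for the concrete values):

```
jbig2enc/<version>/<arch>/jbig2, libjbig2enc*
pikepdf/<version>/<arch>/*.whl
psycopg2/<version>/<arch>/*.whl
qpdf/<version>/<arch>/*.deb
```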
| @@ -80,7 +80,7 @@ django_checks() { | ||||
|  | ||||
| search_index() { | ||||
|  | ||||
| 	local -r index_version=1 | ||||
| 	local -r index_version=2 | ||||
| 	local -r index_version_file=${DATA_DIR}/.index_version | ||||
|  | ||||
| 	if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then | ||||
|   | ||||
| @@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption | ||||
| folder, but before any processing of the document is performed. This | ||||
| script can access the following relevant environment variables set: | ||||
|  | ||||
| - `DOCUMENT_SOURCE_PATH` | ||||
| | Environment Variable    | Description                                                  | | ||||
| | ----------------------- | ------------------------------------------------------------ | | ||||
| | `DOCUMENT_SOURCE_PATH`  | Original path of the consumed document                       | | ||||
| | `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on | | ||||
|  | ||||
| !!! note | ||||
|  | ||||
|     Pre-consume scripts which modify the document should only change | ||||
|     the `DOCUMENT_WORKING_PATH` file or a second consume task may | ||||
|     be triggered, leading to failures as two tasks work on the | ||||
|     same document path | ||||
|  | ||||
| A simple but common example for this would be creating a simple script | ||||
| like this: | ||||
| @@ -130,7 +140,7 @@ like this: | ||||
|  | ||||
| ```bash | ||||
| #!/usr/bin/env bash | ||||
| pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH} | ||||
| pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH} | ||||
| ``` | ||||
|  | ||||
| `/etc/paperless.conf` | ||||
| @@ -157,27 +167,37 @@ Executed after the consumer has successfully processed a document and | ||||
| has moved it into paperless. It receives the following environment | ||||
| variables: | ||||
|  | ||||
| - `DOCUMENT_ID` | ||||
| - `DOCUMENT_FILE_NAME` | ||||
| - `DOCUMENT_CREATED` | ||||
| - `DOCUMENT_MODIFIED` | ||||
| - `DOCUMENT_ADDED` | ||||
| - `DOCUMENT_SOURCE_PATH` | ||||
| - `DOCUMENT_ARCHIVE_PATH` | ||||
| - `DOCUMENT_THUMBNAIL_PATH` | ||||
| - `DOCUMENT_DOWNLOAD_URL` | ||||
| - `DOCUMENT_THUMBNAIL_URL` | ||||
| - `DOCUMENT_CORRESPONDENT` | ||||
| - `DOCUMENT_TAGS` | ||||
| - `DOCUMENT_ORIGINAL_FILENAME` | ||||
| | Environment Variable         | Description                                   | | ||||
| | ---------------------------- | --------------------------------------------- | | ||||
| | `DOCUMENT_ID`                | Database primary key of the document          | | ||||
| | `DOCUMENT_FILE_NAME`         | Formatted filename, not including paths       | | ||||
| | `DOCUMENT_CREATED`           | Date & time when document created             | | ||||
| | `DOCUMENT_MODIFIED`          | Date & time when document was last modified   | | ||||
| | `DOCUMENT_ADDED`             | Date & time when document was added           | | ||||
| | `DOCUMENT_SOURCE_PATH`       | Path to the original document file            | | ||||
| `DOCUMENT_ARCHIVE_PATH`      | Path to the generated archive file (if any)   | | ||||
| | `DOCUMENT_THUMBNAIL_PATH`    | Path to the generated thumbnail               | | ||||
| | `DOCUMENT_DOWNLOAD_URL`      | URL for document download                     | | ||||
| | `DOCUMENT_THUMBNAIL_URL`     | URL for the document thumbnail                | | ||||
| | `DOCUMENT_CORRESPONDENT`     | Assigned correspondent (if any)               | | ||||
| | `DOCUMENT_TAGS`              | Comma separated list of tags applied (if any) | | ||||
| | `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document                 | | ||||
|  | ||||
| The script can be in any language, but for a simple shell script | ||||
| example, you can take a look at | ||||
| [post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh) | ||||
| in this project. | ||||
| The script can be in any language. A simple shell script example: | ||||
|  | ||||
| ```bash title="post-consumption-example" | ||||
| --8<-- "./scripts/post-consumption-example.sh" | ||||
| ``` | ||||
|  | ||||
| !!! note | ||||
|  | ||||
|     The post consumption script cannot cancel the consumption process. | ||||
|  | ||||
| !!! warning | ||||
|  | ||||
|     The post consumption script should not modify the document files | ||||
|     directly | ||||
|  | ||||
| The script's stdout and stderr will be logged line by line to the | ||||
| webserver log, along with the exit code of the script. | ||||
|  | ||||
|   | ||||
| @@ -2,6 +2,9 @@ | ||||
|  | ||||
| ## paperless-ngx 1.12.1 | ||||
|  | ||||
| _Note: Version 1.12.x introduced searching of comments. Search works for comments added after the upgrade, but a reindex of the search index is required to find older comments. The Docker image performs this reindex automatically; bare metal installations must perform it manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._ | ||||
|  | ||||
| ### Bug Fixes | ||||
|  | ||||
| - Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513)) | ||||
|   | ||||
| @@ -41,6 +41,7 @@ markdown_extensions: | ||||
|       anchor_linenums: true | ||||
|   - pymdownx.superfences | ||||
|   - pymdownx.inlinehilite | ||||
|   - pymdownx.snippets | ||||
| strict: true | ||||
| nav: | ||||
|     - index.md | ||||
|   | ||||
(File diff suppressed because it is too large)
							| @@ -204,6 +204,10 @@ export class DocumentDetailComponent | ||||
|             ) | ||||
|             .subscribe({ | ||||
|               next: (titleValue) => { | ||||
|                 // In the rare case the field changed just after the debounced event fired, | ||||
|                 // we don't want to overwrite what's actually in the text field, so just return | ||||
|                 if (titleValue !== this.titleInput.value) return | ||||
|  | ||||
|                 this.title = titleValue | ||||
|                 this.documentForm.patchValue({ title: titleValue }) | ||||
|               }, | ||||
|   | ||||
| @@ -26,11 +26,11 @@ | ||||
|         </div> | ||||
|         <p class="card-text"> | ||||
|           <span *ngIf="document.__search_hit__ && document.__search_hit__.highlights" [innerHtml]="document.__search_hit__.highlights"></span> | ||||
|           <span *ngIf="document.__search_hit__ && document.__search_hit__.comment_highlights" class="d-block"> | ||||
|           <span *ngFor="let highlight of searchCommentHighlights" class="d-block"> | ||||
|             <svg width="1em" height="1em" fill="currentColor" class="me-2"> | ||||
|               <use xlink:href="assets/bootstrap-icons.svg#chat-left-text"/> | ||||
|             </svg> | ||||
|             <span [innerHtml]="document.__search_hit__.comment_highlights"></span> | ||||
|             <span [innerHtml]="highlight"></span> | ||||
|           </span> | ||||
|           <span *ngIf="!document.__search_hit__" class="result-content">{{contentTrimmed}}</span> | ||||
|         </p> | ||||
|   | ||||
| @@ -70,6 +70,22 @@ export class DocumentCardLargeComponent { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   get searchCommentHighlights() { | ||||
|     let highlights = [] | ||||
|     if ( | ||||
|       this.document['__search_hit__'] && | ||||
|       this.document['__search_hit__'].comment_highlights | ||||
|     ) { | ||||
|       // only show comments with a match | ||||
|       highlights = ( | ||||
|         this.document['__search_hit__'].comment_highlights as string | ||||
|       ) | ||||
|         .split(',') | ||||
|         .filter((highlight) => highlight.includes('<span')) | ||||
|     } | ||||
|     return highlights | ||||
|   } | ||||
|  | ||||
|   getIsThumbInverted() { | ||||
|     return this.settingsService.get(SETTINGS_KEYS.DARK_MODE_THUMB_INVERTED) | ||||
|   } | ||||
|   | ||||
| @@ -143,7 +143,7 @@ | ||||
|             <p i18n> | ||||
|               <em>No tracking data is collected by the app in any way.</em> | ||||
|             </p> | ||||
|             <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check> | ||||
|             <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check> | ||||
|           </div> | ||||
|         </div> | ||||
|  | ||||
|   | ||||
| @@ -5,7 +5,7 @@ export const environment = { | ||||
|   apiBaseUrl: document.baseURI + 'api/', | ||||
|   apiVersion: '2', | ||||
|   appTitle: 'Paperless-ngx', | ||||
|   version: '1.12.1', | ||||
|   version: '1.12.1-dev', | ||||
|   webSocketHost: window.location.host, | ||||
|   webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:', | ||||
|   webSocketBaseUrl: base_url.pathname + 'ws/', | ||||
|   | ||||
| @@ -4,7 +4,6 @@ import shutil | ||||
| import tempfile | ||||
| from dataclasses import dataclass | ||||
| from functools import lru_cache | ||||
| from math import ceil | ||||
| from pathlib import Path | ||||
| from typing import List | ||||
| from typing import Optional | ||||
| @@ -12,10 +11,9 @@ from typing import Optional | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from pdf2image import convert_from_path | ||||
| from pdf2image.exceptions import PDFPageCountError | ||||
| from pikepdf import Page | ||||
| from pikepdf import PasswordError | ||||
| from pikepdf import Pdf | ||||
| from pikepdf import PdfImage | ||||
| from PIL import Image | ||||
| from PIL import ImageSequence | ||||
| from pyzbar import pyzbar | ||||
| @@ -154,52 +152,15 @@ def scan_file_for_barcodes( | ||||
|     (page_number, barcode_text) tuples | ||||
|     """ | ||||
|  | ||||
|     def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]: | ||||
|         detected_barcodes = [] | ||||
|         with Pdf.open(pdf_filepath) as pdf: | ||||
|             for page_num, page in enumerate(pdf.pages): | ||||
|                 for image_key in page.images: | ||||
|                     pdfimage = PdfImage(page.images[image_key]) | ||||
|  | ||||
|                     # This type is known to have issues: | ||||
|                     # https://github.com/pikepdf/pikepdf/issues/401 | ||||
|                     if "/CCITTFaxDecode" in pdfimage.filters: | ||||
|                         raise BarcodeImageFormatError( | ||||
|                             "Unable to decode CCITTFaxDecode images", | ||||
|                         ) | ||||
|  | ||||
|                     # Not all images can be transcoded to a PIL image, which | ||||
|                     # is what pyzbar expects to receive, so this may | ||||
|                     # raise an exception, triggering fallback | ||||
|                     pillow_img = pdfimage.as_pil_image() | ||||
|  | ||||
|                     # Scale the image down | ||||
|                     # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385 | ||||
|                     # TLDR: zbar has issues with larger images | ||||
|                     width, height = pillow_img.size | ||||
|                     if width > 1024: | ||||
|                         scaler = ceil(width / 1024) | ||||
|                         new_width = int(width / scaler) | ||||
|                         new_height = int(height / scaler) | ||||
|                         pillow_img = pillow_img.resize((new_width, new_height)) | ||||
|  | ||||
|                     width, height = pillow_img.size | ||||
|                     if height > 2048: | ||||
|                         scaler = ceil(height / 2048) | ||||
|                         new_width = int(width / scaler) | ||||
|                         new_height = int(height / scaler) | ||||
|                         pillow_img = pillow_img.resize((new_width, new_height)) | ||||
|  | ||||
|                     for barcode_value in barcode_reader(pillow_img): | ||||
|                         detected_barcodes.append(Barcode(page_num, barcode_value)) | ||||
|  | ||||
|         return detected_barcodes | ||||
|  | ||||
|     def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: | ||||
|         detected_barcodes = [] | ||||
|         # use a temporary directory in case the file is too big to handle in memory | ||||
|         with tempfile.TemporaryDirectory() as path: | ||||
|             pages_from_path = convert_from_path(pdf_filepath, output_folder=path) | ||||
|             pages_from_path = convert_from_path( | ||||
|                 pdf_filepath, | ||||
|                 dpi=300, | ||||
|                 output_folder=path, | ||||
|             ) | ||||
|             for current_page_number, page in enumerate(pages_from_path): | ||||
|                 for barcode_value in barcode_reader(page): | ||||
|                     detected_barcodes.append( | ||||
| @@ -219,27 +180,19 @@ def scan_file_for_barcodes( | ||||
|         # Always try pikepdf first, it's usually fine, faster and | ||||
|         # uses less memory | ||||
|         try: | ||||
|             barcodes = _pikepdf_barcode_scan(pdf_filepath) | ||||
|             barcodes = _pdf2image_barcode_scan(pdf_filepath) | ||||
|         # Password protected files can't be checked | ||||
|         except PasswordError as e: | ||||
|         # This is the exception raised for those | ||||
|         except PDFPageCountError as e: | ||||
|             logger.warning( | ||||
|                 f"File is likely password protected, not checking for barcodes: {e}", | ||||
|             ) | ||||
|         # Handle pikepdf related image decoding issues with a fallback to page | ||||
|         # by page conversion to images in a temporary directory | ||||
|         except Exception as e: | ||||
|             logger.warning( | ||||
|                 f"Falling back to pdf2image because: {e}", | ||||
|             ) | ||||
|             try: | ||||
|                 barcodes = _pdf2image_barcode_scan(pdf_filepath) | ||||
|         # This file is really borked, allow the consumption to continue | ||||
|         # but it may fail further on | ||||
|         except Exception as e:  # pragma: no cover | ||||
|             logger.warning( | ||||
|                 f"Exception during barcode scanning: {e}", | ||||
|             ) | ||||
|  | ||||
|     else: | ||||
|         logger.warning( | ||||
|             f"Unsupported file format for barcode reader: {str(mime_type)}", | ||||
|   | ||||
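Condensed, the new PDF scanning flow above reads roughly as follows; this is a sketch of the logic, not the verbatim file, and `_pdf2image_barcode_scan` is the helper shown in the diff:

```python
# Sketch of the post-change control flow in scan_file_for_barcodes.
import logging

from pdf2image.exceptions import PDFPageCountError

logger = logging.getLogger("paperless.barcodes")  # illustrative logger name


def scan_pdf(pdf_filepath: str) -> list:
    try:
        # pdf2image is now the only scanning path, rendering pages at 300 DPI
        return _pdf2image_barcode_scan(pdf_filepath)
    except PDFPageCountError as e:
        # pdf2image raises this for password protected files
        logger.warning(f"File is likely password protected, not checking for barcodes: {e}")
    except Exception as e:
        # Truly borked files: log and let consumption continue; it may fail later
        logger.warning(f"Exception during barcode scanning: {e}")
    return []
```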
| @@ -1,7 +1,10 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| import uuid | ||||
| from pathlib import Path | ||||
| from subprocess import CompletedProcess | ||||
| from subprocess import run | ||||
| from typing import Optional | ||||
| @@ -94,7 +97,8 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|     def __init__(self): | ||||
|         super().__init__() | ||||
|         self.path = None | ||||
|         self.path: Optional[Path] = None | ||||
|         self.original_path: Optional[Path] = None | ||||
|         self.filename = None | ||||
|         self.override_title = None | ||||
|         self.override_correspondent_id = None | ||||
| @@ -167,16 +171,18 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") | ||||
|  | ||||
|         filepath_arg = os.path.normpath(self.path) | ||||
|         working_file_path = str(self.path) | ||||
|         original_file_path = str(self.original_path) | ||||
|  | ||||
|         script_env = os.environ.copy() | ||||
|         script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg | ||||
|         script_env["DOCUMENT_SOURCE_PATH"] = original_file_path | ||||
|         script_env["DOCUMENT_WORKING_PATH"] = working_file_path | ||||
|  | ||||
|         try: | ||||
|             completed_proc = run( | ||||
|                 args=[ | ||||
|                     settings.PRE_CONSUME_SCRIPT, | ||||
|                     filepath_arg, | ||||
|                     original_file_path, | ||||
|                 ], | ||||
|                 env=script_env, | ||||
|                 capture_output=True, | ||||
| @@ -195,7 +201,7 @@ class Consumer(LoggingMixin): | ||||
|                 exception=e, | ||||
|             ) | ||||
|  | ||||
|     def run_post_consume_script(self, document): | ||||
|     def run_post_consume_script(self, document: Document): | ||||
|         if not settings.POST_CONSUME_SCRIPT: | ||||
|             return | ||||
|  | ||||
| @@ -285,8 +291,8 @@ class Consumer(LoggingMixin): | ||||
|         Return the document object if it was successfully created. | ||||
|         """ | ||||
|  | ||||
|         self.path = path | ||||
|         self.filename = override_filename or os.path.basename(path) | ||||
|         self.path = Path(path).resolve() | ||||
|         self.filename = override_filename or self.path.name | ||||
|         self.override_title = override_title | ||||
|         self.override_correspondent_id = override_correspondent_id | ||||
|         self.override_document_type_id = override_document_type_id | ||||
| @@ -311,6 +317,15 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.log("info", f"Consuming {self.filename}") | ||||
|  | ||||
|         # For the actual work, copy the file into a tempdir | ||||
|         self.original_path = self.path | ||||
|         tempdir = tempfile.TemporaryDirectory( | ||||
|             prefix="paperless-ngx", | ||||
|             dir=settings.SCRATCH_DIR, | ||||
|         ) | ||||
|         self.path = Path(tempdir.name) / Path(self.filename) | ||||
|         shutil.copy(self.original_path, self.path) | ||||
|  | ||||
|         # Determine the parser class. | ||||
|  | ||||
|         mime_type = magic.from_file(self.path, mime=True) | ||||
| @@ -453,11 +468,12 @@ class Consumer(LoggingMixin): | ||||
|                 # Delete the file only if it was successfully consumed | ||||
|                 self.log("debug", f"Deleting file {self.path}") | ||||
|                 os.unlink(self.path) | ||||
|                 self.original_path.unlink() | ||||
|  | ||||
|                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037 | ||||
|                 shadow_file = os.path.join( | ||||
|                     os.path.dirname(self.path), | ||||
|                     "._" + os.path.basename(self.path), | ||||
|                     os.path.dirname(self.original_path), | ||||
|                     "._" + os.path.basename(self.original_path), | ||||
|                 ) | ||||
|  | ||||
|                 if os.path.isfile(shadow_file): | ||||
| @@ -474,6 +490,7 @@ class Consumer(LoggingMixin): | ||||
|             ) | ||||
|         finally: | ||||
|             document_parser.cleanup() | ||||
|             tempdir.cleanup() | ||||
|  | ||||
|         self.run_post_consume_script(document) | ||||
|  | ||||
|   | ||||
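The net effect of the consumer changes above, condensed into a sketch; `parse` stands in for the parser pipeline and `scratch_dir` for `settings.SCRATCH_DIR`:

```python
# Sketch of the new working-copy lifecycle in Consumer.try_consume_file.
import shutil
import tempfile
from pathlib import Path


def consume(source_file: str, scratch_dir: str) -> None:
    original_path = Path(source_file).resolve()
    tempdir = tempfile.TemporaryDirectory(prefix="paperless-ngx", dir=scratch_dir)
    working_path = Path(tempdir.name) / original_path.name
    shutil.copy(original_path, working_path)
    try:
        parse(working_path)     # all parsing/OCR happens on the copy
        original_path.unlink()  # the original is deleted only on success
    finally:
        tempdir.cleanup()       # the working copy always goes away
```

This is why pre-consume scripts now receive both `DOCUMENT_SOURCE_PATH` and `DOCUMENT_WORKING_PATH`: modifications belong on the working copy, as the docs change in this commit explains.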
(Two image files changed: 33 KiB and 39 KiB, same size before and after)
(File diff suppressed because it is too large)
							| @@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase): | ||||
|         with tempfile.NamedTemporaryFile() as script: | ||||
|             with override_settings(PRE_CONSUME_SCRIPT=script.name): | ||||
|                 c = Consumer() | ||||
|                 c.path = "path-to-file" | ||||
|                 c.original_path = "path-to-file" | ||||
|                 c.path = "/tmp/somewhere/path-to-file" | ||||
|                 c.run_pre_consume_script() | ||||
|  | ||||
|                 m.assert_called_once() | ||||
| @@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase): | ||||
|                 args, kwargs = m.call_args | ||||
|  | ||||
|                 command = kwargs["args"] | ||||
|                 environment = kwargs["env"] | ||||
|  | ||||
|                 self.assertEqual(command[0], script.name) | ||||
|                 self.assertEqual(command[1], "path-to-file") | ||||
|  | ||||
|                 self.assertDictContainsSubset( | ||||
|                     { | ||||
|                         "DOCUMENT_SOURCE_PATH": c.original_path, | ||||
|                         "DOCUMENT_WORKING_PATH": c.path, | ||||
|                     }, | ||||
|                     environment, | ||||
|                 ) | ||||
|  | ||||
|     @mock.patch("documents.consumer.Consumer.log") | ||||
|     def test_script_with_output(self, mocked_log): | ||||
|         """ | ||||
| @@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase): | ||||
|  | ||||
|                 m.assert_called_once() | ||||
|  | ||||
|                 args, kwargs = m.call_args | ||||
|                 _, kwargs = m.call_args | ||||
|  | ||||
|                 command = kwargs["args"] | ||||
|                 environment = kwargs["env"] | ||||
|  | ||||
|                 self.assertEqual(command[0], script.name) | ||||
|                 self.assertEqual(command[1], str(doc.pk)) | ||||
| @@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase): | ||||
|                 self.assertEqual(command[7], "my_bank") | ||||
|                 self.assertCountEqual(command[8].split(","), ["a", "b"]) | ||||
|  | ||||
|                 self.assertDictContainsSubset( | ||||
|                     { | ||||
|                         "DOCUMENT_ID": str(doc.pk), | ||||
|                         "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", | ||||
|                         "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", | ||||
|                         "DOCUMENT_CORRESPONDENT": "my_bank", | ||||
|                         "DOCUMENT_TAGS": "a,b", | ||||
|                     }, | ||||
|                     environment, | ||||
|                 ) | ||||
|  | ||||
|     def test_script_exit_non_zero(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|   | ||||
| @@ -3,6 +3,7 @@ import shutil | ||||
| import tempfile | ||||
| from collections import namedtuple | ||||
| from contextlib import contextmanager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.apps import apps | ||||
| from django.db import connection | ||||
| @@ -86,6 +87,30 @@ class DirectoriesMixin: | ||||
|         remove_dirs(self.dirs) | ||||
|  | ||||
|  | ||||
| class ConsumerProgressMixin: | ||||
|     def setUp(self) -> None: | ||||
|         self.send_progress_patcher = mock.patch( | ||||
|             "documents.consumer.Consumer._send_progress", | ||||
|         ) | ||||
|         self.send_progress_mock = self.send_progress_patcher.start() | ||||
|         super().setUp() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         super().tearDown() | ||||
|         self.send_progress_patcher.stop() | ||||
|  | ||||
|  | ||||
| class DocumentConsumeDelayMixin: | ||||
|     def setUp(self) -> None: | ||||
|         self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay") | ||||
|         self.consume_file_mock = self.consume_file_patcher.start() | ||||
|         super().setUp() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         super().tearDown() | ||||
|         self.consume_file_patcher.stop() | ||||
|  | ||||
|  | ||||
| class TestMigrations(TransactionTestCase): | ||||
|     @property | ||||
|     def app(self): | ||||
|   | ||||
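A usage sketch for the new mixins; the test case is hypothetical. Because each mixin patches in `setUp` and unpatches in `tearDown`, they compose with `TestCase` through normal MRO chaining:

```python
# Hypothetical example combining one of the new test mixins with a TestCase.
from django.test import TestCase


class ConsumeEndpointTest(DocumentConsumeDelayMixin, TestCase):
    def test_upload_queues_consume_task(self):
        # ... POST a document to the consume endpoint here ...
        # consume_file.delay was patched out; assert it would have been queued.
        self.consume_file_mock.assert_called_once()
```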
| @@ -477,21 +477,14 @@ class DocumentViewSet( | ||||
| class SearchResultSerializer(DocumentSerializer): | ||||
|     def to_representation(self, instance): | ||||
|         doc = Document.objects.get(id=instance["id"]) | ||||
|         comments = "" | ||||
|         if hasattr(instance.results.q, "subqueries"): | ||||
|             commentTerm = instance.results.q.subqueries[0] | ||||
|         comments = ",".join( | ||||
|                 [ | ||||
|                     str(c.comment) | ||||
|                     for c in Comment.objects.filter(document=instance["id"]) | ||||
|                     if commentTerm.text in c.comment | ||||
|                 ], | ||||
|             [str(c.comment) for c in Comment.objects.filter(document=instance["id"])], | ||||
|         ) | ||||
|         r = super().to_representation(doc) | ||||
|         r["__search_hit__"] = { | ||||
|             "score": instance.score, | ||||
|             "highlights": instance.highlights("content", text=doc.content), | ||||
|             "comment_highlights": instance.highlights("content", text=comments) | ||||
|             "comment_highlights": instance.highlights("comments", text=comments) | ||||
|             if doc | ||||
|             else None, | ||||
|             "rank": instance.rank, | ||||
|   | ||||
| @@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser): | ||||
|                 "paperHeight": "11.7", | ||||
|                 "scale": "1.0", | ||||
|             } | ||||
|  | ||||
|             # Set the output format of the resulting PDF | ||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno | ||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||
|                 data["pdfFormat"] = "PDF/A-2b" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": | ||||
|                 data["pdfFormat"] = "PDF/A-1a" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": | ||||
|                 data["pdfFormat"] = "PDF/A-3b" | ||||
|  | ||||
|             try: | ||||
|                 response = requests.post( | ||||
|                     url, | ||||
|   | ||||
| @@ -573,8 +573,8 @@ class TestParser(TestCase): | ||||
|             self.parser.gotenberg_server + "/forms/chromium/convert/html", | ||||
|             mock_post.call_args.args[0], | ||||
|         ) | ||||
|         self.assertEqual({}, mock_post.call_args.kwargs["headers"]) | ||||
|         self.assertEqual( | ||||
|         self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) | ||||
|         self.assertDictEqual( | ||||
|             { | ||||
|                 "marginTop": "0.1", | ||||
|                 "marginBottom": "0.1", | ||||
| @@ -583,6 +583,7 @@ class TestParser(TestCase): | ||||
|                 "paperWidth": "8.27", | ||||
|                 "paperHeight": "11.7", | ||||
|                 "scale": "1.0", | ||||
|                 "pdfFormat": "PDF/A-2b", | ||||
|             }, | ||||
|             mock_post.call_args.kwargs["data"], | ||||
|         ) | ||||
| @@ -663,8 +664,8 @@ class TestParser(TestCase): | ||||
|             self.parser.gotenberg_server + "/forms/chromium/convert/html", | ||||
|             mock_post.call_args.args[0], | ||||
|         ) | ||||
|         self.assertEqual({}, mock_post.call_args.kwargs["headers"]) | ||||
|         self.assertEqual( | ||||
|         self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) | ||||
|         self.assertDictEqual( | ||||
|             { | ||||
|                 "marginTop": "0.1", | ||||
|                 "marginBottom": "0.1", | ||||
|   | ||||
| @@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser): | ||||
|                 ), | ||||
|             } | ||||
|             headers = {} | ||||
|             data = {} | ||||
|  | ||||
|             # Set the output format of the resulting PDF | ||||
|             # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno | ||||
|             if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: | ||||
|                 data["pdfFormat"] = "PDF/A-2b" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-1": | ||||
|                 data["pdfFormat"] = "PDF/A-1a" | ||||
|             elif settings.OCR_OUTPUT_TYPE == "pdfa-3": | ||||
|                 data["pdfFormat"] = "PDF/A-3b" | ||||
|  | ||||
|             try: | ||||
|                 response = requests.post(url, files=files, headers=headers) | ||||
|                 response = requests.post(url, files=files, headers=headers, data=data) | ||||
|                 response.raise_for_status()  # ensure we notice bad responses | ||||
|             except Exception as err: | ||||
|                 raise ParseError( | ||||
|   | ||||
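The same `OCR_OUTPUT_TYPE` to `pdfFormat` mapping now appears in both the mail parser and the Tika parser. If it were ever factored out, it could reduce to a helper like this (hypothetical function, not part of the commit):

```python
# Hypothetical shared helper for the mapping duplicated in both parsers above.
from typing import Optional


def gotenberg_pdf_format(ocr_output_type: str) -> Optional[str]:
    """Map paperless OCR_OUTPUT_TYPE values to Gotenberg pdfFormat strings."""
    return {
        "pdfa": "PDF/A-2b",
        "pdfa-2": "PDF/A-2b",
        "pdfa-1": "PDF/A-1a",
        "pdfa-3": "PDF/A-3b",
    }.get(ocr_output_type)
```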