From f78e93a3645e93c49bd8a704cea3dc96c7b3da45 Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 25 Jan 2023 10:53:08 -0800 Subject: [PATCH 01/18] Try to prevent title debounce overwriting --- .../components/document-detail/document-detail.component.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index f99f547e6..19f85398b 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -204,6 +204,10 @@ export class DocumentDetailComponent ) .subscribe({ next: (titleValue) => { + // In the rare case when the field changed just after debounced event was fired. + // We dont want to overwrite whats actually in the text field, so just return + if (titleValue !== this.titleInput.value) return + this.title = titleValue this.documentForm.patchValue({ title: titleValue }) }, From c430b9f8cf5e57cc8bc819d66b97a4cb99196f16 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 25 Jan 2023 12:29:57 -0800 Subject: [PATCH 02/18] Resets version to -dev tagging --- src-ui/src/environments/environment.prod.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src-ui/src/environments/environment.prod.ts b/src-ui/src/environments/environment.prod.ts index 832f69378..16cbe7df6 100644 --- a/src-ui/src/environments/environment.prod.ts +++ b/src-ui/src/environments/environment.prod.ts @@ -5,7 +5,7 @@ export const environment = { apiBaseUrl: document.baseURI + 'api/', apiVersion: '2', appTitle: 'Paperless-ngx', - version: '1.12.1', + version: '1.12.1-dev', webSocketHost: window.location.host, webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:', webSocketBaseUrl: base_url.pathname + 'ws/', From e625ac21c33c50b4abe44ad2c06ec0a32c8cbe7c Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 25 Jan 2023 20:27:18 -0800 Subject: [PATCH 03/18] Update index version to force reindex, note in release notes --- docker/docker-prepare.sh | 2 +- docs/changelog.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh index dad49774b..af2bfe2a7 100755 --- a/docker/docker-prepare.sh +++ b/docker/docker-prepare.sh @@ -80,7 +80,7 @@ django_checks() { search_index() { - local -r index_version=1 + local -r index_version=2 local -r index_version_file=${DATA_DIR}/.index_version if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then diff --git a/docs/changelog.md b/docs/changelog.md index 0e5a6fcba..5a9371781 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,9 @@ ## paperless-ngx 1.12.1 +_Note: Version 1.12.x introduced searching of comments which will work for comments added after the upgrade but a reindex of the search index is required in order to be able to search +older comments. 
The Docker image will automatically perform this reindex, bare metal installations will have to perform this manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._ + ### Bug Fixes - Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513)) From f1204d2749bc5b783edaebb722eeacacfdedb9b8 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 25 Jan 2023 08:09:35 -0800 Subject: [PATCH 04/18] Updates the installer library to be static in the final image, saving the installers into Git and curl-ing the correct revision --- .github/workflows/ci.yml | 6 - .github/workflows/installer-library.yml | 139 ++++++++++++++++++++++++ Dockerfile | 53 ++++----- docker-builders/Dockerfile.jbig2enc | 15 ++- docker-builders/Dockerfile.pikepdf | 22 +++- docker-builders/Dockerfile.psycopg2 | 18 ++- docker-builders/README.md | 51 +++++++++ 7 files changed, 263 insertions(+), 41 deletions(-) create mode 100644 docker-builders/README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac0b89611..adf03d4bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -212,12 +212,6 @@ jobs: name: Prepare Docker Pipeline Data if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v')) runs-on: ubuntu-22.04 - # If the push triggered the installer library workflow, wait for it to - # complete here. This ensures the required versions for the final - # image have been built, while not waiting at all if the versions haven't changed - concurrency: - group: build-installer-library - cancel-in-progress: false needs: - documentation - tests-backend diff --git a/.github/workflows/installer-library.yml b/.github/workflows/installer-library.yml index 32aaf85ee..56064ad86 100644 --- a/.github/workflows/installer-library.yml +++ b/.github/workflows/installer-library.yml @@ -169,3 +169,142 @@ jobs: PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }} LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }} + + commit-binary-files: + name: Store installers + needs: + - prepare-docker-build + - build-qpdf-debs + - build-jbig2enc + - build-psycopg2-wheel + - build-pikepdf-wheel + runs-on: ubuntu-22.04 + steps: + - + name: Checkout + uses: actions/checkout@v3 + with: + ref: binary-library + - + name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - + name: Install system dependencies + run: | + sudo apt-get update -qq + sudo apt-get install -qq --no-install-recommends tree + - + name: Extract qpdf files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }} + + docker pull --quiet ${tag} + docker create --name qpdf-extract ${tag} + + mkdir --parents qpdf/${version}/amd64 + docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version} + + mkdir --parents qpdf/${version}/arm64 + docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version} + + mkdir --parents qpdf/${version}/armv7 + docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version} + - + name: Extract psycopg2 
files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/amd64 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64 + mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64 + rm -r psycopg2/${version}/amd64/wheels/ + docker rm psycopg2-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/arm64 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64 + mv psycopg2/${version}/arm64/wheels/* psycopg2/${version}/arm64 + rm -r psycopg2/${version}/arm64/wheels/ + docker rm psycopg2-extract + + docker pull --quiet --platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/armv7 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7 + mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7 + rm -r psycopg2/${version}/armv7/wheels/ + docker rm psycopg2-extract + - + name: Extract pikepdf files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/amd64 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64 + mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64 + rm -r pikepdf/${version}/amd64/wheels/ + docker rm pikepdf-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/arm64 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64 + mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64 + rm -r pikepdf/${version}/arm64/wheels/ + docker rm pikepdf-extract + + docker pull --quiet --platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/armv7 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7 + mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7 + rm -r pikepdf/${version}/armv7/wheels/ + docker rm pikepdf-extract + - + name: Extract jbig2enc files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag }} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/amd64 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/ + mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/ + docker rm jbig2enc-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/arm64 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64 + mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/ + docker rm jbig2enc-extract + + docker pull --quiet 
--platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/armv7 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7 + mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/ + docker rm jbig2enc-extract + - + name: Show file structure + run: | + tree . + - + name: Commit files + run: | + git config --global user.name "github-actions" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add pikepdf/ qpdf/ psycopg2/ jbig2enc/ + git commit -m "Updating installer packages" || true + git push origin || true diff --git a/Dockerfile b/Dockerfile index 9522728d9..6588802bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,5 @@ # syntax=docker/dockerfile:1.4 -# Pull the installer images from the library -# These are all built previously -# They provide either a .deb or .whl - -ARG JBIG2ENC_VERSION -ARG QPDF_VERSION -ARG PIKEPDF_VERSION -ARG PSYCOPG2_VERSION - -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder -FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder - FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend # This stage compiles the frontend @@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n LABEL org.opencontainers.image.licenses="GPL-3.0-only" ARG DEBIAN_FRONTEND=noninteractive -# Buildx provided +# Buildx provided, must be defined to use though ARG TARGETARCH ARG TARGETVARIANT # Workflow provided +ARG JBIG2ENC_VERSION ARG QPDF_VERSION +ARG PIKEPDF_VERSION +ARG PSYCOPG2_VERSION # # Begin installation and configuration # Order the steps below from least often changed to most # -# copy jbig2enc -# Basically will never change again -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/ -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/ -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/ - # Packages need for running ARG RUNTIME_PACKAGES="\ # Python @@ -198,19 +181,29 @@ RUN set -eux \ # Install the built packages from the installer library images # Use mounts to avoid copying installer files into the image # These change sometimes -RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \ - --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \ - --mount=type=bind,from=pikepdf-builder,target=/pikepdf \ - set -eux \ +RUN set -eux \ + && echo "Getting binaries" \ + && mkdir paperless-ngx \ + && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \ + && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \ + && cd paperless-ngx \ + # Setting a specific revision ensures we know what this installed + # and ensures cache breaking on changes + && echo "Installing jbig2enc" \ + && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \ + && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \ && echo "Installing qpdf" \ - && apt-get install --yes --no-install-recommends 
/qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ - && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ + && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ + && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ && echo "Installing pikepdf and dependencies" \ - && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \ + && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \ && python3 -m pip list \ && echo "Installing psycopg2" \ - && python3 -m pip install --no-cache-dir /psycopg2/usr/src/wheels/psycopg2*.whl \ - && python3 -m pip list + && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \ + && python3 -m pip list \ + && echo "Cleaning up image layer" \ + && cd ../ \ + && rm -rf paperless-ngx WORKDIR /usr/src/paperless/src/ diff --git a/docker-builders/Dockerfile.jbig2enc b/docker-builders/Dockerfile.jbig2enc index 90318084f..388bdd1f7 100644 --- a/docker-builders/Dockerfile.jbig2enc +++ b/docker-builders/Dockerfile.jbig2enc @@ -29,7 +29,20 @@ RUN set -eux \ && ./autogen.sh \ && ./configure \ && make \ + && echo "Gathering package data" \ + && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \ && echo "Cleaning up image" \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && echo "Moving files around" \ + && mkdir build \ + # Unlink a symlink that causes problems + && unlink ./src/.libs/libjbig2enc.la \ + # Move what the link pointed to + && mv ./src/libjbig2enc.la ./build/ \ + # Move the shared library .so files + && mv ./src/.libs/libjbig2enc* ./build/ \ + # And move the cli binary + && mv ./src/jbig2 ./build/ \ + && mv ./pkg-list.txt ./build/ diff --git a/docker-builders/Dockerfile.pikepdf b/docker-builders/Dockerfile.pikepdf index c4d1ee1dc..e4181c538 100644 --- a/docker-builders/Dockerfile.pikepdf +++ b/docker-builders/Dockerfile.pikepdf @@ -7,12 +7,17 @@ # Default to pulling from the main repo registry when manually building ARG REPO="paperless-ngx/paperless-ngx" +# This does nothing, except provide a name for a copy below ARG QPDF_VERSION FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder -# This does nothing, except provide a name for a copy below - -FROM python:3.9-slim-bullseye as main +# +# Stage: builder +# Purpose: +# - Build the pikepdf wheel +# - Build any dependent wheels which can't be found +# +FROM python:3.9-slim-bullseye as builder LABEL org.opencontainers.image.description="A intermediate image with pikepdf wheel built" @@ -100,3 +105,14 @@ RUN set -eux \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ && rm -rf /var/lib/apt/lists/* + +# +# Stage: package +# Purpose: Holds the compiled .whl files in a tiny image to pull +# +FROM alpine:3.17 as package + +WORKDIR /usr/src/wheels/ + +COPY --from=builder /usr/src/wheels/*.whl ./ +COPY --from=builder /usr/src/wheels/pkg-list.txt ./ diff --git a/docker-builders/Dockerfile.psycopg2 b/docker-builders/Dockerfile.psycopg2 index 8fcf5264b..e3f182435 100644 --- a/docker-builders/Dockerfile.psycopg2 +++ b/docker-builders/Dockerfile.psycopg2 @@ -2,7 +2,12 @@ # Inputs: # - PSYCOPG2_VERSION - 
Version to build -FROM python:3.9-slim-bullseye as main +# +# Stage: builder +# Purpose: +# - Build the psycopg2 wheel +# +FROM python:3.9-slim-bullseye as builder LABEL org.opencontainers.image.description="A intermediate image with psycopg2 wheel built" @@ -48,3 +53,14 @@ RUN set -eux \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ && rm -rf /var/lib/apt/lists/* + +# +# Stage: package +# Purpose: Holds the compiled .whl files in a tiny image to pull +# +FROM alpine:3.17 as package + +WORKDIR /usr/src/wheels/ + +COPY --from=builder /usr/src/wheels/*.whl ./ +COPY --from=builder /usr/src/wheels/pkg-list.txt ./ diff --git a/docker-builders/README.md b/docker-builders/README.md new file mode 100644 index 000000000..14e684ccf --- /dev/null +++ b/docker-builders/README.md @@ -0,0 +1,51 @@ +# Installer Library + +This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image. + +## [jbig2enc](https://github.com/agl/jbig2enc) + +### Why + +JBIG is an image coding which can achieve better compression of images for PDFs. + +### What + +The Docker image builds a shared library file and utility, which is copied into the correct location in the final image. + +See Also: + +- [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html) + +## [psycopg2](https://www.psycopg.org/) + +### Why + +The pre-built wheels of psycopg2 are built on Debian 9, which provides a quite old version of libpq-dev. This causes issue with authentication methods. + +### What + +The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`. + +See Also: + +- [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266) + +## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html) + +### Why + +qpdf and it's library provide tools to read, manipulate and fix up PDFs. Version 11 is also required by `pikepdf` 6+ and Debian 9 does not provide above version 10. + +### What + +The Docker image cross compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/` + +## [pikepdf](https://pikepdf.readthedocs.io/en/latest/) + +### Why + +Required by OCRMyPdf, this is a general purpose library for PDF manipulation in Python via the qpdf libraries. + +### What + +The built wheels are placed into `/usr/src/wheels/` From 8fcb7efbd2087154d20fad3e43c918c827c8ac76 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Wed, 25 Jan 2023 20:00:30 -0800 Subject: [PATCH 05/18] Adds some basic steps for updating --- docker-builders/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker-builders/README.md b/docker-builders/README.md index 14e684ccf..6202719c6 100644 --- a/docker-builders/README.md +++ b/docker-builders/README.md @@ -12,6 +12,12 @@ JBIG is an image coding which can achieve better compression of images for PDFs. The Docker image builds a shared library file and utility, which is copied into the correct location in the final image. +### Updating + +1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf) +2. Update `.build-config.json` to the given version +3. 
If the Debian specific version has incremented, update `Dockerfile.qpdf` + See Also: - [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html) From 590d129cd3c1226db4bd866db93aacff918cbdba Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 27 Jan 2023 09:36:54 -0800 Subject: [PATCH 06/18] Fix typo, update translation strings --- src-ui/messages.xlf | 645 +++++++++--------- .../manage/settings/settings.component.html | 2 +- 2 files changed, 334 insertions(+), 313 deletions(-) diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index 21ac728b3..edd742d45 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -5,242 +5,242 @@ Close - node_modules/src/alert/alert.ts - 47,48 - - - - Slide of - - node_modules/src/carousel/carousel.ts - 178,186 - - Currently selected slide number read by screen reader - - - Previous - - node_modules/src/carousel/carousel.ts - 213,215 - - - - Next - - node_modules/src/carousel/carousel.ts - 236 - - - - Select month - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - - Select year - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - - Previous month - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - - Next month - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - - «« - - node_modules/src/pagination/pagination.ts - 269,270 - - - - « - - node_modules/src/pagination/pagination.ts - 269,270 - - - - » - - node_modules/src/pagination/pagination.ts - 269,270 - - - - »» - - node_modules/src/pagination/pagination.ts - 269,270 - - - - First - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Previous - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Next - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Last - - node_modules/src/pagination/pagination.ts - 269,271 - - - - - - node_modules/src/progressbar/progressbar.ts - 30,33 + node_modules/src/ngb-config.ts + 13 HH - node_modules/src/timepicker/timepicker.ts - 230,231 - - - - Hours - - node_modules/src/timepicker/timepicker.ts - 255,258 - - - - MM - - node_modules/src/timepicker/timepicker.ts - 280,282 - - - - Minutes - - node_modules/src/timepicker/timepicker.ts - 298,299 - - - - Increment hours - - node_modules/src/timepicker/timepicker.ts - 328,329 - - - - Decrement hours - - node_modules/src/timepicker/timepicker.ts - 350,356 - - - - Increment minutes - - node_modules/src/timepicker/timepicker.ts - 383,384 - - - - Decrement minutes - - node_modules/src/timepicker/timepicker.ts - 412,416 - - - - SS - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Increment seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Decrement seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - - - node_modules/src/timepicker/timepicker.ts - 429 - - - - - - node_modules/src/timepicker/timepicker.ts - 429 + node_modules/src/ngb-config.ts + 13 Close - node_modules/src/toast/toast.ts - 74,75 + node_modules/src/ngb-config.ts + 13 + + + + «« + + node_modules/src/ngb-config.ts + 13 + + + + Select month + + node_modules/src/ngb-config.ts + 13 + + + 
node_modules/src/ngb-config.ts + 13 + + + + Previous month + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + + + node_modules/src/ngb-config.ts + 13 + + + + Slide of + + node_modules/src/ngb-config.ts + 13 + + Currently selected slide number read by screen reader + + + Hours + + node_modules/src/ngb-config.ts + 13 + + + + « + + node_modules/src/ngb-config.ts + 13 + + + + Previous + + node_modules/src/ngb-config.ts + 13 + + + + MM + + node_modules/src/ngb-config.ts + 13 + + + + » + + node_modules/src/ngb-config.ts + 13 + + + + Select year + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + Next month + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + Next + + node_modules/src/ngb-config.ts + 13 + + + + Minutes + + node_modules/src/ngb-config.ts + 13 + + + + »» + + node_modules/src/ngb-config.ts + 13 + + + + Increment hours + + node_modules/src/ngb-config.ts + 13 + + + + First + + node_modules/src/ngb-config.ts + 13 + + + + Previous + + node_modules/src/ngb-config.ts + 13 + + + + Decrement hours + + node_modules/src/ngb-config.ts + 13 + + + + Next + + node_modules/src/ngb-config.ts + 13 + + + + Increment minutes + + node_modules/src/ngb-config.ts + 13 + + + + Last + + node_modules/src/ngb-config.ts + 13 + + + + Decrement minutes + + node_modules/src/ngb-config.ts + 13 + + + + SS + + node_modules/src/ngb-config.ts + 13 + + + + Seconds + + node_modules/src/ngb-config.ts + 13 + + + + Increment seconds + + node_modules/src/ngb-config.ts + 13 + + + + Decrement seconds + + node_modules/src/ngb-config.ts + 13 + + + + + + node_modules/src/ngb-config.ts + 13 + + + + + + node_modules/src/ngb-config.ts + 13 @@ -967,7 +967,7 @@ src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 37 + 38 src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html @@ -1006,7 +1006,7 @@ src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 38 + 39 src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html @@ -1208,102 +1208,109 @@ 15 + + Rule order + + src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html + 16 + + Paperless will only process mails that match all of the filters specified below. src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 18 + 19 Filter from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 19 + 20 Filter subject src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 20 + 21 Filter body src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 21 + 22 Filter attachment filename src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 22 + 23 Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive. src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 22 + 23 Action src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 25 + 26 Action is only performed when documents are consumed from the mail. Mails without attachments remain entirely untouched. 
src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 25 + 26 Action parameter src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 26 + 27 Assign title from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 27 + 28 Assign document type src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 29 + 30 Assign correspondent from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 30 + 31 Assign correspondent src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 31 + 32 Error src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 36 + 37 src/app/services/toast.service.ts @@ -1965,7 +1972,7 @@ of src/app/components/document-detail/document-detail.component.html - 5 + 5,6 @@ -1980,7 +1987,7 @@ src/app/components/document-list/document-card-large/document-card-large.component.html - 58 + 64 src/app/components/document-list/document-card-small/document-card-small.component.html @@ -2013,7 +2020,7 @@ src/app/components/document-list/document-card-large/document-card-large.component.html - 38 + 44 @@ -2262,7 +2269,7 @@ Confirm delete src/app/components/document-detail/document-detail.component.ts - 442 + 449 src/app/components/manage/management-list/management-list.component.ts @@ -2273,35 +2280,35 @@ Do you really want to delete document ""? src/app/components/document-detail/document-detail.component.ts - 443 + 450 The files for this document will be deleted permanently. This operation cannot be undone. src/app/components/document-detail/document-detail.component.ts - 444 + 451 Delete document src/app/components/document-detail/document-detail.component.ts - 446 + 453 Error deleting document: src/app/components/document-detail/document-detail.component.ts - 462 + 469 Redo OCR confirm src/app/components/document-detail/document-detail.component.ts - 482 + 489 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2312,14 +2319,14 @@ This operation will permanently redo OCR for this document. src/app/components/document-detail/document-detail.component.ts - 483 + 490 This operation cannot be undone. src/app/components/document-detail/document-detail.component.ts - 484 + 491 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2342,7 +2349,7 @@ Proceed src/app/components/document-detail/document-detail.component.ts - 486 + 493 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2361,7 +2368,7 @@ Redo OCR operation will begin in the background. Close and re-open or reload this document after the operation has completed to see new content. 
src/app/components/document-detail/document-detail.component.ts - 494 + 501 @@ -2370,7 +2377,7 @@ )"/> src/app/components/document-detail/document-detail.component.ts - 505,507 + 512,514 @@ -2701,7 +2708,7 @@ Edit src/app/components/document-list/document-card-large/document-card-large.component.html - 43 + 49 src/app/components/document-list/document-card-small/document-card-small.component.html @@ -2752,14 +2759,14 @@ View src/app/components/document-list/document-card-large/document-card-large.component.html - 50 + 56 Filter by document type src/app/components/document-list/document-card-large/document-card-large.component.html - 63 + 69 src/app/components/document-list/document-list.component.html @@ -2770,7 +2777,7 @@ Filter by storage path src/app/components/document-list/document-card-large/document-card-large.component.html - 70 + 76 src/app/components/document-list/document-list.component.html @@ -2781,40 +2788,40 @@ Created: src/app/components/document-list/document-card-large/document-card-large.component.html - 85 + 91,92 src/app/components/document-list/document-card-small/document-card-small.component.html - 48 + 48,49 Added: src/app/components/document-list/document-card-large/document-card-large.component.html - 86 + 92,93 src/app/components/document-list/document-card-small/document-card-small.component.html - 49 + 49,50 Modified: src/app/components/document-list/document-card-large/document-card-large.component.html - 87 + 93,94 src/app/components/document-list/document-card-small/document-card-small.component.html - 50 + 50,51 Score: src/app/components/document-list/document-card-large/document-card-large.component.html - 98 + 104 @@ -2926,7 +2933,7 @@ ASN src/app/components/document-list/document-list.component.html - 127 + 128,127 src/app/components/document-list/filter-editor/filter-editor.component.ts @@ -3420,21 +3427,21 @@ Short: src/app/components/manage/settings/settings.component.html - 56 + 56,57 Medium: src/app/components/manage/settings/settings.component.html - 60 + 60,61 Long: src/app/components/manage/settings/settings.component.html - 64 + 64,65 @@ -3532,14 +3539,14 @@ Update checking works by pinging the the public Github API for the latest release to determine whether a new version is available. Actual updating of the app must still be performed manually. src/app/components/manage/settings/settings.component.html - 139,142 + 140,142 - No tracking data is collected by the app in any way. + No tracking data is collected by the app in any way. src/app/components/manage/settings/settings.component.html - 144 + 144,146 @@ -3549,8 +3556,8 @@ 146 - - Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release. + + Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release. src/app/components/manage/settings/settings.component.html 146 @@ -3658,7 +3665,7 @@ Mail src/app/components/manage/settings/settings.component.html - 231 + 232,231 @@ -4191,18 +4198,25 @@ 15 + + Document with ASN already exists. + + src/app/services/consumer-status.service.ts + 16 + + File not found. src/app/services/consumer-status.service.ts - 16 + 17 Pre-consume script does not exist. src/app/services/consumer-status.service.ts - 17 + 18 Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4210,7 +4224,7 @@ Error while executing pre-consume script. 
src/app/services/consumer-status.service.ts - 18 + 19 Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4218,7 +4232,7 @@ Post-consume script does not exist. src/app/services/consumer-status.service.ts - 19 + 20 Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4226,7 +4240,7 @@ Error while executing post-consume script. src/app/services/consumer-status.service.ts - 20 + 21 Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4234,49 +4248,49 @@ Received new file. src/app/services/consumer-status.service.ts - 21 + 22 File type not supported. src/app/services/consumer-status.service.ts - 22 + 23 Processing document... src/app/services/consumer-status.service.ts - 23 + 24 Generating thumbnail... src/app/services/consumer-status.service.ts - 24 + 25 Retrieving date from document... src/app/services/consumer-status.service.ts - 25 + 26 Saving document... src/app/services/consumer-status.service.ts - 26 + 27 Finished. src/app/services/consumer-status.service.ts - 27 + 28 @@ -4336,165 +4350,172 @@ 145 + + Arabic + + src/app/services/settings.service.ts + 151 + + Belarusian src/app/services/settings.service.ts - 151 + 157 Czech src/app/services/settings.service.ts - 157 + 163 Danish src/app/services/settings.service.ts - 163 + 169 German src/app/services/settings.service.ts - 169 + 175 English (GB) src/app/services/settings.service.ts - 175 + 181 Spanish src/app/services/settings.service.ts - 181 + 187 French src/app/services/settings.service.ts - 187 + 193 Italian src/app/services/settings.service.ts - 193 + 199 Luxembourgish src/app/services/settings.service.ts - 199 + 205 Dutch src/app/services/settings.service.ts - 205 + 211 Polish src/app/services/settings.service.ts - 211 + 217 Portuguese (Brazil) src/app/services/settings.service.ts - 217 + 223 Portuguese src/app/services/settings.service.ts - 223 + 229 Romanian src/app/services/settings.service.ts - 229 + 235 Russian src/app/services/settings.service.ts - 235 + 241 Slovenian src/app/services/settings.service.ts - 241 + 247 Serbian src/app/services/settings.service.ts - 247 + 253 Swedish src/app/services/settings.service.ts - 253 + 259 Turkish src/app/services/settings.service.ts - 259 + 265 Chinese Simplified src/app/services/settings.service.ts - 265 + 271 ISO 8601 src/app/services/settings.service.ts - 282 + 288 Successfully completed one-time migratration of settings to the database! src/app/services/settings.service.ts - 393 + 399 Unable to migrate settings to the database, please try saving manually. src/app/services/settings.service.ts - 394 + 400 diff --git a/src-ui/src/app/components/manage/settings/settings.component.html b/src-ui/src/app/components/manage/settings/settings.component.html index eb279f4b6..079f62cf2 100644 --- a/src-ui/src/app/components/manage/settings/settings.component.html +++ b/src-ui/src/app/components/manage/settings/settings.component.html @@ -143,7 +143,7 @@

No tracking data is collected by the app in any way.

- + From a884647a7c15c835a4958a9b93746f5a3d11ba47 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 09:15:37 -0800 Subject: [PATCH 07/18] Changes to use buildx imagetools to extract the manifest, supporting new attestation manifests --- .github/scripts/cleanup-tags.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/scripts/cleanup-tags.py b/.github/scripts/cleanup-tags.py index 9b299d048..9312d05c5 100644 --- a/.github/scripts/cleanup-tags.py +++ b/.github/scripts/cleanup-tags.py @@ -155,8 +155,10 @@ class RegistryTagsCleaner: proc = subprocess.run( [ shutil.which("docker"), - "manifest", + "buildx", + "imagetools", "inspect", + "--raw", full_name, ], capture_output=True, From 215691ac1a5c3a351a314e73c5358e6658c0f838 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 10:15:14 -0800 Subject: [PATCH 08/18] Changes the still pull check to be using Python Docker SDK --- .github/scripts/cleanup-tags.py | 67 ++++++++++++++++++++++++++++++ .github/workflows/cleanup-tags.yml | 18 ++------ 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/.github/scripts/cleanup-tags.py b/.github/scripts/cleanup-tags.py index 9312d05c5..590344a2c 100644 --- a/.github/scripts/cleanup-tags.py +++ b/.github/scripts/cleanup-tags.py @@ -15,6 +15,8 @@ from github import ContainerPackage from github import GithubBranchApi from github import GithubContainerRegistryApi +import docker + logger = logging.getLogger("cleanup-tags") @@ -151,6 +153,8 @@ class RegistryTagsCleaner: for tag in sorted(self.tags_to_keep): full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}" logger.info(f"Checking manifest for {full_name}") + # TODO: It would be nice to use RegistryData from docker + # except the ID doesn't map to anything in the manifest try: proc = subprocess.run( [ @@ -243,6 +247,65 @@ class RegistryTagsCleaner: # By default, keep anything which is tagged self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys())) + def check_tags_pull(self): + """ + This method uses the Docker Python SDK to confirm all tags which were + kept still pull, for all platforms. + + TODO: This is much slower (although more comprehensive). Maybe a Pool? 
+ """ + logger.info("Beginning confirmation step") + client = docker.from_env() + imgs = [] + for tag in sorted(self.tags_to_keep): + repository = f"ghcr.io/{self.repo_owner}/{self.package_name}" + for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]: + # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64 + # skip others in this case + if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0": + continue + # Skip beta and release candidate tags + elif "beta" in tag: + continue + + # Build the platform name + if variant is not None: + platform = f"linux/{arch}/{variant}" + else: + platform = f"linux/{arch}" + + try: + logger.info(f"Pulling {repository}:{tag} for {platform}") + image = client.images.pull( + repository=repository, + tag=tag, + platform=platform, + ) + imgs.append(image) + except docker.errors.APIError as e: + logger.error( + f"Failed to pull {repository}:{tag}: {e}", + ) + + # Prevent out of space errors by removing after a few + # pulls + if len(imgs) > 50: + for image in imgs: + try: + client.images.remove(image.id) + except docker.errors.APIError as e: + err_str = str(e) + # Ignore attempts to remove images that are partly shared + # Ignore images which are somehow gone already + if ( + "must be forced" not in err_str + and "No such image" not in err_str + ): + logger.error( + f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}", + ) + imgs = [] + class MainImageTagsCleaner(RegistryTagsCleaner): def decide_what_tags_to_keep(self): @@ -399,6 +462,10 @@ def _main(): # Clean images which are untagged cleaner.clean_untagged(args.is_manifest) + # Verify remaining tags still pull + if args.is_manifest: + cleaner.check_tags_pull() + if __name__ == "__main__": _main() diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 6877e55bb..090fcc532 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -62,9 +62,9 @@ jobs: with: python-version: "3.10" - - name: Install httpx + name: Install Python libraries run: | - python -m pip install httpx + python -m pip install httpx docker # # Clean up primary package # @@ -72,7 +72,7 @@ jobs: name: Cleanup for package "${{ matrix.primary-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --is-manifest --delete "${{ matrix.primary-name }}" + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --is-manifest "${{ matrix.primary-name }}" # # Clean up registry cache package # @@ -80,14 +80,4 @@ jobs: name: Cleanup for package "${{ matrix.cache-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}" - # - # Verify tags which are left still pull - # - - - name: Check all tags still pull - run: | - ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }') - echo "Pulling all tags of ${ghcr_name}" - docker pull --quiet --all-tags ${ghcr_name} - docker image list + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged "${{ matrix.cache-name }}" From 3ccb83e49c2e8780fcbb999d475eac6f64f131e6 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 13:33:39 -0800 Subject: [PATCH 09/18] Restores deletion --- .github/workflows/cleanup-tags.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 090fcc532..5992b4442 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -72,7 +72,7 @@ jobs: name: Cleanup for package "${{ matrix.primary-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --is-manifest "${{ matrix.primary-name }}" + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --is-manifest --delete "${{ matrix.primary-name }}" # # Clean up registry cache package # @@ -80,4 +80,4 @@ jobs: name: Cleanup for package "${{ matrix.cache-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged "${{ matrix.cache-name }}" + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}" From 44212d492de271be1f723e3c0493b9a90c25047c Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 27 Jan 2023 10:02:25 -0800 Subject: [PATCH 10/18] Fix whoosh auto-highlighting for comments --- src/documents/views.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 854f2da2b..6a719fe70 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -477,21 +477,14 @@ class DocumentViewSet( class SearchResultSerializer(DocumentSerializer): def to_representation(self, instance): doc = Document.objects.get(id=instance["id"]) - comments = "" - if hasattr(instance.results.q, "subqueries"): - commentTerm = instance.results.q.subqueries[0] - comments = ",".join( - [ - str(c.comment) - for c in Comment.objects.filter(document=instance["id"]) - if commentTerm.text in c.comment - ], - ) + comments = ",".join( + [str(c.comment) for c in Comment.objects.filter(document=instance["id"])], + ) r = super().to_representation(doc) r["__search_hit__"] = { "score": instance.score, "highlights": instance.highlights("content", text=doc.content), - "comment_highlights": instance.highlights("content", text=comments) + "comment_highlights": instance.highlights("comments", text=comments) if doc else None, "rank": instance.rank, From 4dc0c7bbe273544a4b520e38ed6686c1b3036c4f Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 27 Jan 2023 10:29:40 -0800 Subject: [PATCH 11/18] Better display of multiple comment hits --- .../document-card-large.component.html | 4 ++-- .../document-card-large.component.ts | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html index c114a2d6e..b18524e38 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html @@ -26,11 +26,11 @@

- + - + {{contentTrimmed}}

diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts index b43187879..5d24042b9 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts @@ -70,6 +70,22 @@ export class DocumentCardLargeComponent { } } + get searchCommentHighlights() { + let highlights = [] + if ( + this.document['__search_hit__'] && + this.document['__search_hit__'].comment_highlights + ) { + // only show comments with a match + highlights = ( + this.document['__search_hit__'].comment_highlights as string + ) + .split(',') + .filter((higlight) => higlight.includes(' Date: Thu, 26 Jan 2023 08:00:02 -0800 Subject: [PATCH 12/18] Adds setting to Gotenberg API call for outputting the correct PDF/A format --- src/paperless_mail/parsers.py | 10 ++++++++++ src/paperless_tika/parsers.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index cc5d4e3c8..f1ee263aa 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser): "paperHeight": "11.7", "scale": "1.0", } + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" + try: response = requests.post( url, diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 1cfb1eecb..f34ecbbab 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser): ), } headers = {} + data = {} + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" try: - response = requests.post(url, files=files, headers=headers) + response = requests.post(url, files=files, headers=headers, data=data) response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError( From 583f05af2db22325d7fac65b4fd70e604b6bb9e5 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 08:23:11 -0800 Subject: [PATCH 13/18] Fixes test parameters --- src/paperless_mail/tests/test_parsers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index e02267970..809a1192f 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -573,8 +573,8 @@ class TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": 
"0.1", "marginBottom": "0.1", @@ -583,6 +583,7 @@ class TestParser(TestCase): "paperWidth": "8.27", "paperHeight": "11.7", "scale": "1.0", + "pdfFormat": "PDF/A-2b", }, mock_post.call_args.kwargs["data"], ) @@ -663,8 +664,8 @@ class TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": "0.1", "marginBottom": "0.1", From 2ab77fbaf7a42f60c23e8b28cf2af6080d84b919 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 08:34:00 -0800 Subject: [PATCH 14/18] Removes pikepdf based scanning, fixes up unit testing (+ commenting) --- src/documents/barcodes.py | 73 +-- ...n.png => barcode-39-PATCHT-distortion.png} | Bin ....png => barcode-39-PATCHT-distortion2.png} | Bin src/documents/tests/test_barcodes.py | 448 +++++++++++++----- 4 files changed, 352 insertions(+), 169 deletions(-) rename src/documents/tests/samples/barcodes/{barcode-39-PATCHT-distorsion.png => barcode-39-PATCHT-distortion.png} (100%) rename src/documents/tests/samples/barcodes/{barcode-39-PATCHT-distorsion2.png => barcode-39-PATCHT-distortion2.png} (100%) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 82b8afecc..6e3ecfe05 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -4,7 +4,6 @@ import shutil import tempfile from dataclasses import dataclass from functools import lru_cache -from math import ceil from pathlib import Path from typing import List from typing import Optional @@ -12,10 +11,9 @@ from typing import Optional import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page -from pikepdf import PasswordError from pikepdf import Pdf -from pikepdf import PdfImage from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -154,52 +152,15 @@ def scan_file_for_barcodes( (page_number, barcode_text) tuples """ - def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]: - detected_barcodes = [] - with Pdf.open(pdf_filepath) as pdf: - for page_num, page in enumerate(pdf.pages): - for image_key in page.images: - pdfimage = PdfImage(page.images[image_key]) - - # This type is known to have issues: - # https://github.com/pikepdf/pikepdf/issues/401 - if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError( - "Unable to decode CCITTFaxDecode images", - ) - - # Not all images can be transcoded to a PIL image, which - # is what pyzbar expects to receive, so this may - # raise an exception, triggering fallback - pillow_img = pdfimage.as_pil_image() - - # Scale the image down - # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385 - # TLDR: zbar has issues with larger images - width, height = pillow_img.size - if width > 1024: - scaler = ceil(width / 1024) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - width, height = pillow_img.size - if height > 2048: - scaler = ceil(height / 2048) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - for barcode_value in barcode_reader(pillow_img): - detected_barcodes.append(Barcode(page_num, barcode_value)) - - return detected_barcodes - def 
_pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: detected_barcodes = [] # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: - pages_from_path = convert_from_path(pdf_filepath, output_folder=path) + pages_from_path = convert_from_path( + pdf_filepath, + dpi=300, + output_folder=path, + ) for current_page_number, page in enumerate(pages_from_path): for barcode_value in barcode_reader(page): detected_barcodes.append( @@ -219,27 +180,19 @@ def scan_file_for_barcodes( # Always try pikepdf first, it's usually fine, faster and # uses less memory try: - barcodes = _pikepdf_barcode_scan(pdf_filepath) + barcodes = _pdf2image_barcode_scan(pdf_filepath) # Password protected files can't be checked - except PasswordError as e: + # This is the exception raised for those + except PDFPageCountError as e: logger.warning( f"File is likely password protected, not checking for barcodes: {e}", ) - # Handle pikepdf related image decoding issues with a fallback to page - # by page conversion to images in a temporary directory - except Exception as e: + # This file is really borked, allow the consumption to continue + # but it may fail further on + except Exception as e: # pragma: no cover logger.warning( - f"Falling back to pdf2image because: {e}", + f"Exception during barcode scanning: {e}", ) - try: - barcodes = _pdf2image_barcode_scan(pdf_filepath) - # This file is really borked, allow the consumption to continue - # but it may fail further on - except Exception as e: # pragma: no cover - logger.warning( - f"Exception during barcode scanning: {e}", - ) - else: logger.warning( f"Unsupported file format for barcode reader: {str(mime_type)}", diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 7beeee288..8d8b2acfb 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -3,7 +3,6 @@ import shutil import tempfile from unittest import mock -import pikepdf from django.conf import settings from django.test import override_settings from django.test import TestCase @@ -23,13 +22,29 @@ class TestBarcode(DirectoriesMixin, TestCase): BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") - def test_barcode_reader(self): + def test_barcode_reader_png(self): + """ + GIVEN: + - PNG file with separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png") img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + separator_barcode = settings.CONSUMER_BARCODE_STRING self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader2(self): + def test_barcode_reader_pbm(self): + """ + GIVEN: + - Netpbm bitmap file with separator 
barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pbm", @@ -38,25 +53,49 @@ class TestBarcode(DirectoriesMixin, TestCase): separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion(self): + def test_barcode_reader_distortion_scratchy(self): + """ + GIVEN: + - Image containing high noise + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion.png", + "barcode-39-PATCHT-distortion.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion2(self): + def test_barcode_reader_distortion_stretched(self): + """ + GIVEN: + - Image with a stretched barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion2.png", + "barcode-39-PATCHT-distortion2.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_unreadable(self): + """ + GIVEN: + - Image with a truly unreadable barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT-unreadable.png", @@ -65,6 +104,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_qr(self): + """ + GIVEN: + - Image file with QR separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "qr-code-PATCHT.png", @@ -74,6 +121,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_128(self): + """ + GIVEN: + - Image file with 128 style separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-PATCHT.png", @@ -83,11 +138,27 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_no_barcode(self): + """ + GIVEN: + - Image file with no barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join(self.SAMPLE_DIR, "simple.png") img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), []) + self.assertListEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_custom_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.png", @@ -96,6 +167,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def test_barcode_reader_custom_qr_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value as a QR code + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( 
self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.png", @@ -104,6 +183,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def test_barcode_reader_custom_128_separator(self): + """ + GIVEN: + - Image file with custom separator 128 barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-custom.png", @@ -164,6 +251,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) def test_get_mime_type(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ tiff_file = os.path.join( self.SAMPLE_DIR, "simple.tiff", @@ -194,6 +289,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png") def test_convert_from_tiff_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( os.path.dirname(__file__), "samples", @@ -207,6 +310,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(file_extension, ".pdf") def test_convert_error_from_pdf_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( self.SAMPLE_DIR, "simple.pdf", @@ -216,6 +327,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst)) def test_scan_file_for_separating_barcodes(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", @@ -231,6 +350,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [0]) def test_scan_file_for_separating_barcodes_none_present(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, @@ -242,7 +369,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) - def test_scan_file_for_separating_barcodes3(self): + def test_scan_file_for_separating_barcodes_middle_page(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -257,7 +392,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) - def test_scan_file_for_separating_barcodes4(self): + def test_scan_file_for_separating_barcodes_multiple_pages(self): + """ + GIVEN: + - PDF file containing a separator on pages 2 and 5 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on pages 2 and 5 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "several-patcht-codes.pdf", @@ -272,7 +415,16 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [2, 5]) - def test_scan_file_for_separating_barcodes_upsidedown(self): + def test_scan_file_for_separating_barcodes_upside_down(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + - The barcode is upside down + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero 
indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle_reverse.pdf", @@ -287,66 +439,6 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) - def test_scan_file_for_barcodes_pillow_transcode_error(self): - """ - GIVEN: - - A PDF containing an image which cannot be transcoded to a PIL image - WHEN: - - The image tries to be transcoded to a PIL image, but fails - THEN: - - The barcode reader is still called - """ - - def _build_device_n_pdf(self, save_path: str): - # Based on the pikepdf tests - # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py - pdf = pikepdf.new() - pdf.add_blank_page(page_size=(72, 72)) - imobj = pikepdf.Stream( - pdf, - bytes(range(0, 256)), - BitsPerComponent=8, - ColorSpace=pikepdf.Array( - [ - pikepdf.Name.DeviceN, - pikepdf.Array([pikepdf.Name.Black]), - pikepdf.Name.DeviceCMYK, - pikepdf.Stream( - pdf, - b"{0 0 0 4 -1 roll}", # Colorspace conversion function - FunctionType=4, - Domain=[0.0, 1.0], - Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], - ), - ], - ), - Width=16, - Height=16, - Type=pikepdf.Name.XObject, - Subtype=pikepdf.Name.Image, - ) - pim = pikepdf.PdfImage(imobj) - self.assertEqual(pim.mode, "DeviceN") - self.assertTrue(pim.is_device_n) - - pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do") - pdf.pages[0].Resources = pikepdf.Dictionary( - XObject=pikepdf.Dictionary(Im0=imobj), - ) - pdf.save(save_path) - - with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf: - # Build an offending file - _build_device_n_pdf(self, str(device_n_pdf.name)) - with mock.patch("documents.barcodes.barcode_reader") as reader: - reader.return_value = list() - - _ = barcodes.scan_file_for_barcodes( - str(device_n_pdf.name), - ) - - reader.assert_called() - def test_scan_file_for_separating_barcodes_fax_decode(self): """ GIVEN: @@ -371,6 +463,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [1]) def test_scan_file_for_separating_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-qr.pdf", @@ -387,6 +488,15 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", @@ -403,6 +513,16 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.pdf", @@ -419,6 +539,16 @@ class TestBarcode(DirectoriesMixin, TestCase): 
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_128_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a 128 code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-custom.pdf", @@ -434,6 +564,16 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [0]) def test_scan_file_for_separating_wrong_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode value is customized + - The separation value is NOT customized + WHEN: + - File is scanned for barcodes + THEN: + - No split pages are detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", @@ -474,13 +614,21 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [1]) def test_separate_pages(self): + """ + GIVEN: + - Input PDF 2 pages after separation + WHEN: + - The input file separated at the barcode + THEN: + - Two new documents are produced + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - pages = barcodes.separate_pages(test_file, [1]) + documents = barcodes.separate_pages(test_file, [1]) - self.assertEqual(len(pages), 2) + self.assertEqual(len(documents), 2) def test_separate_pages_double_code(self): """ @@ -493,8 +641,7 @@ class TestBarcode(DirectoriesMixin, TestCase): """ test_file = os.path.join( os.path.dirname(__file__), - "samples", - "barcodes", + self.BARCODE_SAMPLE_DIR, "patch-code-t-double.pdf", ) pages = barcodes.separate_pages(test_file, [1, 2]) @@ -502,6 +649,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(len(pages), 2) def test_separate_pages_no_list(self): + """ + GIVEN: + - Input file to separate + WHEN: + - No separation pages are provided + THEN: + - No new documents are produced + - A warning is logged + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -517,16 +673,32 @@ class TestBarcode(DirectoriesMixin, TestCase): ) def test_save_to_dir(self): + """ + GIVEN: + - File to save to a directory + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - barcodes.save_to_dir(test_file, target_dir=tempdir) - target_file = os.path.join(tempdir, "patch-code-t.pdf") + barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR) + target_file = os.path.join(settings.SCRATCH_DIR, "patch-code-t.pdf") self.assertTrue(os.path.isfile(target_file)) - def test_save_to_dir2(self): + def test_save_to_dir_not_existing(self): + """ + GIVEN: + - File to save to a directory + - The directory doesn't exist + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", @@ -534,32 +706,51 @@ class TestBarcode(DirectoriesMixin, TestCase): nonexistingdir = "/nowhere" if os.path.isdir(nonexistingdir): self.fail("non-existing dir exists") - else: - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - barcodes.save_to_dir(test_file, target_dir=nonexistingdir) - self.assertEqual( - cm.output, - [ - f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} 
don't exist.", - ], - ) - def test_save_to_dir3(self): + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + barcodes.save_to_dir(test_file, target_dir=nonexistingdir) + self.assertEqual( + cm.output, + [ + f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", + ], + ) + + def test_save_to_dir_given_name(self): + """ + GIVEN: + - File to save to a directory + - There is a name override + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - barcodes.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) - target_file = os.path.join(tempdir, "newname.pdf") + barcodes.save_to_dir( + test_file, + newname="newname.pdf", + target_dir=settings.SCRATCH_DIR, + ) + target_file = os.path.join(settings.SCRATCH_DIR, "newname.pdf") self.assertTrue(os.path.isfile(target_file)) def test_barcode_splitter(self): + """ + GIVEN: + - Input file containing barcodes + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, @@ -572,18 +763,33 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertTrue(len(separator_page_numbers) > 0) document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertTrue(document_list) - for document in document_list: - barcodes.save_to_dir(document, target_dir=tempdir) + self.assertGreater(len(document_list), 0) - target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") - target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") + for document in document_list: + barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR) + + target_file1 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_0.pdf", + ) + target_file2 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_1.pdf", + ) self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file2)) @override_settings(CONSUMER_ENABLE_BARCODES=True) def test_consume_barcode_file(self): + """ + GIVEN: + - Input file with barcodes given to consume task + WHEN: + - Consume task returns + THEN: + - The file was split + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -600,6 +806,14 @@ class TestBarcode(DirectoriesMixin, TestCase): CONSUMER_BARCODE_TIFF_SUPPORT=True, ) def test_consume_barcode_tiff_file(self): + """ + GIVEN: + - TIFF image containing barcodes + WHEN: + - Consume task returns + THEN: + - The file was split + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.tiff", @@ -617,11 +831,13 @@ class TestBarcode(DirectoriesMixin, TestCase): @mock.patch("documents.consumer.Consumer.try_consume_file") def test_consume_barcode_unsupported_jpg_file(self, m): """ - This test assumes barcode and TIFF support are enabled and - the user uploads an unsupported image file (e.g. jpg) - - The function shouldn't try to scan for separating barcodes - and continue archiving the file as is. 
+ GIVEN: + - JPEG image as input + WHEN: + - Consume task returns + THEN: + - Barcode reader reported warning + - Consumption continued with the file """ test_file = os.path.join( self.SAMPLE_DIR, @@ -629,8 +845,10 @@ class TestBarcode(DirectoriesMixin, TestCase): ) dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") shutil.copy(test_file, dst) + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: self.assertIn("Success", tasks.consume_file(dst)) + self.assertListEqual( cm.output, [ @@ -652,8 +870,13 @@ class TestBarcode(DirectoriesMixin, TestCase): ) def test_consume_barcode_supported_no_extension_file(self): """ - This test assumes barcode and TIFF support are enabled and - the user uploads a supported image file, but without extension + GIVEN: + - TIFF image containing barcodes + - TIFF file is given without extension + WHEN: + - Consume task returns + THEN: + - The file was split """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, @@ -669,11 +892,10 @@ class TestBarcode(DirectoriesMixin, TestCase): """ GIVEN: - Password protected PDF - - pikepdf based scanning WHEN: - File is scanned for barcode THEN: - - Scanning handles the exception without exception + - Scanning handles the exception without crashing """ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") doc_barcode_info = barcodes.scan_file_for_barcodes( @@ -808,7 +1030,15 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): - + """ + GIVEN: + - ASN from barcode enabled + - Barcode contains too large an ASN value + WHEN: + - ASN from barcode checked for correctness + THEN: + - Exception is raised regarding size limits + """ src = os.path.join( os.path.dirname(__file__), "samples", From 4fce5aba63aab92f3f2346304f9e8e3eb9335006 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 08:37:00 -0800 Subject: [PATCH 15/18] Moves ASN barcode testing into a dedicated class --- src/documents/tests/test_barcodes.py | 267 ++++++++++++++------------- src/documents/tests/utils.py | 25 +++ 2 files changed, 163 insertions(+), 129 deletions(-) diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 8d8b2acfb..1ff698858 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1,6 +1,5 @@ import os import shutil -import tempfile from unittest import mock from django.conf import settings @@ -198,58 +197,6 @@ class TestBarcode(DirectoriesMixin, TestCase): img = Image.open(test_file) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) - def test_barcode_reader_asn_normal(self): - """ - GIVEN: - - Image containing standard ASNxxxxx barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-123.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASN00123"]) - - def test_barcode_reader_asn_invalid(self): - """ - GIVEN: - - Image containing invalid ASNxxxxx barcode - - The number portion of the ASN is not a number - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-invalid.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASNXYZXYZ"]) - - def 
test_barcode_reader_asn_custom_prefix(self): - """ - GIVEN: - - Image containing custom prefix barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-custom-prefix.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) - def test_get_mime_type(self): """ GIVEN: @@ -908,6 +855,144 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) + +class TestAsnBarcodes(DirectoriesMixin, TestCase): + + SAMPLE_DIR = os.path.join( + os.path.dirname(__file__), + "samples", + ) + + BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") + + def test_barcode_reader_asn_normal(self): + """ + GIVEN: + - Image containing standard ASNxxxxx barcode + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-123.png", + ) + img = Image.open(test_file) + self.assertEqual(barcodes.barcode_reader(img), ["ASN00123"]) + + def test_barcode_reader_asn_invalid(self): + """ + GIVEN: + - Image containing invalid ASNxxxxx barcode + - The number portion of the ASN is not a number + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-invalid.png", + ) + img = Image.open(test_file) + self.assertEqual(barcodes.barcode_reader(img), ["ASNXYZXYZ"]) + + def test_barcode_reader_asn_custom_prefix(self): + """ + GIVEN: + - Image containing custom prefix barcode + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-custom-prefix.png", + ) + img = Image.open(test_file) + self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) + + @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") + def test_scan_file_for_asn_custom_prefix(self): + """ + GIVEN: + - PDF containing an ASN barcode with custom prefix + - The ASN value is 123 + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN integer value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-custom-prefix.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertEqual(asn, 123) + + def test_scan_file_for_asn_barcode_invalid(self): + """ + GIVEN: + - PDF containing an ASN barcode + - The ASN value is XYZXYZ + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN value is not used + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-invalid.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertEqual(asn, None) + + @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) + def test_consume_barcode_file_asn_assignment(self): + """ + GIVEN: + - PDF containing an ASN barcode + - The ASN value is 123 + WHEN: + - File is scanned for barcodes + 
THEN: + - The ASN is located + - The ASN integer value is correct + - The ASN is provided as the override value to the consumer + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-123.pdf", + ) + + dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") + shutil.copy(test_file, dst) + + with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: + tasks.consume_file(dst) + + args, kwargs = mocked_call.call_args + + self.assertEqual(kwargs["override_asn"], 123) + def test_scan_file_for_asn_barcode(self): """ GIVEN: @@ -952,82 +1037,6 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, None) - def test_scan_file_for_asn_barcode_invalid(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is XYZXYZ - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN value is not used - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-invalid.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, None) - - @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") - def test_scan_file_for_asn_custom_prefix(self): - """ - GIVEN: - - PDF containing an ASN barcode with custom prefix - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-custom-prefix.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, 123) - - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) - def test_consume_barcode_file_asn_assignment(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - - The ASN is provided as the override value to the consumer - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-123.pdf", - ) - - dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") - shutil.copy(test_file, dst) - - with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: - tasks.consume_file(dst) - - args, kwargs = mocked_call.call_args - - self.assertEqual(kwargs["override_asn"], 123) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): """ diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index c52c9be92..b2ec0d024 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -3,6 +3,7 @@ import shutil import tempfile from collections import namedtuple from contextlib import contextmanager +from unittest import mock from django.apps import apps from django.db import connection @@ -86,6 +87,30 @@ class DirectoriesMixin: remove_dirs(self.dirs) +class ConsumerProgressMixin: + def setUp(self) -> None: + self.send_progress_patcher = mock.patch( + "documents.consumer.Consumer._send_progress", + ) + self.send_progress_mock = self.send_progress_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.send_progress_patcher.stop() + + 
+class DocumentConsumeDelayMixin: + def setUp(self) -> None: + self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay") + self.consume_file_mock = self.consume_file_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.consume_file_patcher.stop() + + class TestMigrations(TransactionTestCase): @property def app(self): From 9784ea4a602df9f8cb1d627b2d370d472f3a6a48 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 11:11:09 -0800 Subject: [PATCH 16/18] Minor tweak to password test to ensure the right lines were hit --- src/documents/tests/test_barcodes.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 1ff698858..4f7f1278a 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -845,9 +845,14 @@ class TestBarcode(DirectoriesMixin, TestCase): - Scanning handles the exception without crashing """ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + warning = cm.output[0] + expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" + self.assertTrue(warning.startswith(expected_str)) + separator_page_numbers = barcodes.get_separating_barcodes( doc_barcode_info.barcodes, ) From 7dd9a4e089dd5fcc68b887045e92f8c563a06828 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sat, 28 Jan 2023 09:32:40 -0800 Subject: [PATCH 17/18] Changes the consumer to work on a temporary copy and provies that copy to the pre-consume script for modifications --- src/documents/consumer.py | 35 +++++++++++++++++++++------- src/documents/tests/test_consumer.py | 26 +++++++++++++++++++-- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index bc344abb9..8c80304d3 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,7 +1,10 @@ import datetime import hashlib import os +import shutil +import tempfile import uuid +from pathlib import Path from subprocess import CompletedProcess from subprocess import run from typing import Optional @@ -94,7 +97,8 @@ class Consumer(LoggingMixin): def __init__(self): super().__init__() - self.path = None + self.path: Optional[Path] = None + self.original_path: Optional[Path] = None self.filename = None self.override_title = None self.override_correspondent_id = None @@ -167,16 +171,18 @@ class Consumer(LoggingMixin): self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") - filepath_arg = os.path.normpath(self.path) + working_file_path = str(self.path) + original_file_path = str(self.original_path) script_env = os.environ.copy() - script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg + script_env["DOCUMENT_SOURCE_PATH"] = original_file_path + script_env["DOCUMENT_WORKING_PATH"] = working_file_path try: completed_proc = run( args=[ settings.PRE_CONSUME_SCRIPT, - filepath_arg, + original_file_path, ], env=script_env, capture_output=True, @@ -195,7 +201,7 @@ class Consumer(LoggingMixin): exception=e, ) - def run_post_consume_script(self, document): + def run_post_consume_script(self, document: Document): if not 
settings.POST_CONSUME_SCRIPT: return @@ -285,8 +291,8 @@ class Consumer(LoggingMixin): Return the document object if it was successfully created. """ - self.path = path - self.filename = override_filename or os.path.basename(path) + self.path = Path(path).resolve() + self.filename = override_filename or self.path.name self.override_title = override_title self.override_correspondent_id = override_correspondent_id self.override_document_type_id = override_document_type_id @@ -311,6 +317,15 @@ class Consumer(LoggingMixin): self.log("info", f"Consuming {self.filename}") + # For the actual work, copy the file into a tempdir + self.original_path = self.path + tempdir = tempfile.TemporaryDirectory( + prefix="paperless-ngx", + dir=settings.SCRATCH_DIR, + ) + self.path = Path(tempdir.name) / Path(self.filename) + shutil.copy(self.original_path, self.path) + # Determine the parser class. mime_type = magic.from_file(self.path, mime=True) @@ -453,11 +468,12 @@ class Consumer(LoggingMixin): # Delete the file only if it was successfully consumed self.log("debug", f"Deleting file {self.path}") os.unlink(self.path) + self.original_path.unlink() # https://github.com/jonaswinkler/paperless-ng/discussions/1037 shadow_file = os.path.join( - os.path.dirname(self.path), - "._" + os.path.basename(self.path), + os.path.dirname(self.original_path), + "._" + os.path.basename(self.original_path), ) if os.path.isfile(shadow_file): @@ -474,6 +490,7 @@ class Consumer(LoggingMixin): ) finally: document_parser.cleanup() + tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index dc86de331..de368018f 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase): with tempfile.NamedTemporaryFile() as script: with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() - c.path = "path-to-file" + c.original_path = "path-to-file" + c.path = "/tmp/somewhere/path-to-file" c.run_pre_consume_script() m.assert_called_once() @@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase): args, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], "path-to-file") + self.assertDictContainsSubset( + { + "DOCUMENT_SOURCE_PATH": c.original_path, + "DOCUMENT_WORKING_PATH": c.path, + }, + environment, + ) + @mock.patch("documents.consumer.Consumer.log") def test_script_with_output(self, mocked_log): """ @@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase): m.assert_called_once() - args, kwargs = m.call_args + _, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], str(doc.pk)) @@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase): self.assertEqual(command[7], "my_bank") self.assertCountEqual(command[8].split(","), ["a", "b"]) + self.assertDictContainsSubset( + { + "DOCUMENT_ID": str(doc.pk), + "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", + "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", + "DOCUMENT_CORRESPONDENT": "my_bank", + "DOCUMENT_TAGS": "a,b", + }, + environment, + ) + def test_script_exit_non_zero(self): """ GIVEN: From 7b9c0d65b99c55c7227f3cee6a6dcd2f829e7d67 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sat, 28 Jan 2023 10:25:21 -0800 Subject: [PATCH 18/18] Documents the change 
to pre-consume script and improves the readability --- docs/advanced_usage.md | 60 ++++++++++++++++++++++++++++-------------- mkdocs.yml | 1 + 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 61b1c072e..9a1abcfff 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption folder, but before any processing of the document is performed. This script can access the following relevant environment variables set: -- `DOCUMENT_SOURCE_PATH` +| Environment Variable | Description | +| ----------------------- | ------------------------------------------------------------ | +| `DOCUMENT_SOURCE_PATH` | Original path of the consumed document | +| `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on | + +!!! note + + Pre-consume scripts which modify the document should only change + the `DOCUMENT_WORKING_PATH` file or a second consume task may + be triggered, leading to failures as two tasks work on the + same document path A simple but common example for this would be creating a simple script like this: @@ -130,7 +140,7 @@ like this: ```bash #!/usr/bin/env bash -pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH} +pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH} ``` `/etc/paperless.conf` @@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and has moved it into paperless. It receives the following environment variables: -- `DOCUMENT_ID` -- `DOCUMENT_FILE_NAME` -- `DOCUMENT_CREATED` -- `DOCUMENT_MODIFIED` -- `DOCUMENT_ADDED` -- `DOCUMENT_SOURCE_PATH` -- `DOCUMENT_ARCHIVE_PATH` -- `DOCUMENT_THUMBNAIL_PATH` -- `DOCUMENT_DOWNLOAD_URL` -- `DOCUMENT_THUMBNAIL_URL` -- `DOCUMENT_CORRESPONDENT` -- `DOCUMENT_TAGS` -- `DOCUMENT_ORIGINAL_FILENAME` +| Environment Variable | Description | +| ---------------------------- | --------------------------------------------- | +| `DOCUMENT_ID` | Database primary key of the document | +| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths | +| `DOCUMENT_CREATED` | Date & time when document created | +| `DOCUMENT_MODIFIED` | Date & time when document was last modified | +| `DOCUMENT_ADDED` | Date & time when document was added | +| `DOCUMENT_SOURCE_PATH` | Path to the original document file | +| `DOCUMENT_ARCHIVE_PATH` | Path to the generate archive file (if any) | +| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail | +| `DOCUMENT_DOWNLOAD_URL` | URL for document download | +| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail | +| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) | +| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) | +| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document | -The script can be in any language, but for a simple shell script -example, you can take a look at -[post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh) -in this project. +The script can be in any language, A simple shell script example: -The post consumption script cannot cancel the consumption process. +```bash title="post-consumption-example" +--8<-- "./scripts/post-consumption-example.sh" +``` + +!!! note + + The post consumption script cannot cancel the consumption process. + +!!! 
warning + + The post consumption script should not modify the document files + directly The script's stdout and stderr will be logged line by line to the webserver log, along with the exit code of the script. diff --git a/mkdocs.yml b/mkdocs.yml index 6314a44d3..03f24c4f3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,6 +41,7 @@ markdown_extensions: anchor_linenums: true - pymdownx.superfences - pymdownx.inlinehilite + - pymdownx.snippets strict: true nav: - index.md
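
The `DOCUMENT_WORKING_PATH` variable documented in the final patch makes it safe for a pre-consume script to edit the file it is handed, since the consumer now works on a scratch copy rather than the original. A minimal sketch of such a script follows; it is illustrative only and not taken from the patches above — the Ghostscript invocation and the compression settings are assumptions, and any equivalent tool that rewrites the PDF in place would do.

```bash
#!/usr/bin/env bash
# Illustrative pre-consume sketch: compress the scratch copy in place so the
# consumer picks up the modified file. Assumes Ghostscript (gs) is installed.
set -eu

in="${DOCUMENT_WORKING_PATH}"
tmp="$(mktemp --suffix=.pdf)"

# Re-write the PDF at ebook quality, then replace the working copy.
gs -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook -o "${tmp}" "${in}"
mv "${tmp}" "${in}"
```

Registered like any other pre-consume script (e.g. `PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/script.sh`), this leaves the original file referenced by `DOCUMENT_SOURCE_PATH` untouched, which is exactly the behaviour the note in the updated documentation asks for.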