diff --git a/.github/scripts/cleanup-tags.py b/.github/scripts/cleanup-tags.py index 9b299d048..590344a2c 100644 --- a/.github/scripts/cleanup-tags.py +++ b/.github/scripts/cleanup-tags.py @@ -15,6 +15,8 @@ from github import ContainerPackage from github import GithubBranchApi from github import GithubContainerRegistryApi +import docker + logger = logging.getLogger("cleanup-tags") @@ -151,12 +153,16 @@ class RegistryTagsCleaner: for tag in sorted(self.tags_to_keep): full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}" logger.info(f"Checking manifest for {full_name}") + # TODO: It would be nice to use RegistryData from docker + # except the ID doesn't map to anything in the manifest try: proc = subprocess.run( [ shutil.which("docker"), - "manifest", + "buildx", + "imagetools", "inspect", + "--raw", full_name, ], capture_output=True, @@ -241,6 +247,65 @@ class RegistryTagsCleaner: # By default, keep anything which is tagged self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys())) + def check_tags_pull(self): + """ + This method uses the Docker Python SDK to confirm all tags which were + kept still pull, for all platforms. + + TODO: This is much slower (although more comprehensive). Maybe a Pool? 
+ """ + logger.info("Beginning confirmation step") + client = docker.from_env() + imgs = [] + for tag in sorted(self.tags_to_keep): + repository = f"ghcr.io/{self.repo_owner}/{self.package_name}" + for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]: + # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64 + # skip others in this case + if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0": + continue + # Skip beta and release candidate tags + elif "beta" in tag: + continue + + # Build the platform name + if variant is not None: + platform = f"linux/{arch}/{variant}" + else: + platform = f"linux/{arch}" + + try: + logger.info(f"Pulling {repository}:{tag} for {platform}") + image = client.images.pull( + repository=repository, + tag=tag, + platform=platform, + ) + imgs.append(image) + except docker.errors.APIError as e: + logger.error( + f"Failed to pull {repository}:{tag}: {e}", + ) + + # Prevent out of space errors by removing after a few + # pulls + if len(imgs) > 50: + for image in imgs: + try: + client.images.remove(image.id) + except docker.errors.APIError as e: + err_str = str(e) + # Ignore attempts to remove images that are partly shared + # Ignore images which are somehow gone already + if ( + "must be forced" not in err_str + and "No such image" not in err_str + ): + logger.error( + f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}", + ) + imgs = [] + class MainImageTagsCleaner(RegistryTagsCleaner): def decide_what_tags_to_keep(self): @@ -397,6 +462,10 @@ def _main(): # Clean images which are untagged cleaner.clean_untagged(args.is_manifest) + # Verify remaining tags still pull + if args.is_manifest: + cleaner.check_tags_pull() + if __name__ == "__main__": _main() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac0b89611..adf03d4bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -212,12 +212,6 @@ jobs: name: Prepare Docker 
Pipeline Data if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v')) runs-on: ubuntu-22.04 - # If the push triggered the installer library workflow, wait for it to - # complete here. This ensures the required versions for the final - # image have been built, while not waiting at all if the versions haven't changed - concurrency: - group: build-installer-library - cancel-in-progress: false needs: - documentation - tests-backend diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 6877e55bb..5992b4442 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -62,9 +62,9 @@ jobs: with: python-version: "3.10" - - name: Install httpx + name: Install Python libraries run: | - python -m pip install httpx + python -m pip install httpx docker # # Clean up primary package # @@ -81,13 +81,3 @@ jobs: if: "${{ env.TOKEN != '' }}" run: | python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}" - # - # Verify tags which are left still pull - # - - - name: Check all tags still pull - run: | - ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }') - echo "Pulling all tags of ${ghcr_name}" - docker pull --quiet --all-tags ${ghcr_name} - docker image list diff --git a/.github/workflows/installer-library.yml b/.github/workflows/installer-library.yml index 32aaf85ee..56064ad86 100644 --- a/.github/workflows/installer-library.yml +++ b/.github/workflows/installer-library.yml @@ -169,3 +169,142 @@ jobs: PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }} LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }} + + 
commit-binary-files: + name: Store installers + needs: + - prepare-docker-build + - build-qpdf-debs + - build-jbig2enc + - build-psycopg2-wheel + - build-pikepdf-wheel + runs-on: ubuntu-22.04 + steps: + - + name: Checkout + uses: actions/checkout@v3 + with: + ref: binary-library + - + name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - + name: Install system dependencies + run: | + sudo apt-get update -qq + sudo apt-get install -qq --no-install-recommends tree + - + name: Extract qpdf files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }} + + docker pull --quiet ${tag} + docker create --name qpdf-extract ${tag} + + mkdir --parents qpdf/${version}/amd64 + docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version} + + mkdir --parents qpdf/${version}/arm64 + docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version} + + mkdir --parents qpdf/${version}/armv7 + docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version} + - + name: Extract psycopg2 files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/amd64 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64 + mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64 + rm -r psycopg2/${version}/amd64/wheels/ + docker rm psycopg2-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/arm64 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64 + mv psycopg2/${version}/arm64/wheels/* 
psycopg2/${version}/arm64 + rm -r psycopg2/${version}/arm64/wheels/ + docker rm psycopg2-extract + + docker pull --quiet --platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name psycopg2-extract ${tag} + mkdir --parents psycopg2/${version}/armv7 + docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7 + mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7 + rm -r psycopg2/${version}/armv7/wheels/ + docker rm psycopg2-extract + - + name: Extract pikepdf files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/amd64 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64 + mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64 + rm -r pikepdf/${version}/amd64/wheels/ + docker rm pikepdf-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/arm64 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64 + mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64 + rm -r pikepdf/${version}/arm64/wheels/ + docker rm pikepdf-extract + + docker pull --quiet --platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name pikepdf-extract ${tag} + mkdir --parents pikepdf/${version}/armv7 + docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7 + mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7 + rm -r pikepdf/${version}/armv7/wheels/ + docker rm pikepdf-extract + - + name: Extract jbig2enc files + run: | + version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }} + tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag 
}} + + docker pull --quiet --platform linux/amd64 ${tag} + docker create --platform linux/amd64 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/amd64 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/ + mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/ + docker rm jbig2enc-extract + + docker pull --quiet --platform linux/arm64 ${tag} + docker create --platform linux/arm64 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/arm64 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64 + mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/ + docker rm jbig2enc-extract + + docker pull --quiet --platform linux/arm/v7 ${tag} + docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag} + mkdir --parents jbig2enc/${version}/armv7 + docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7 + mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/ + docker rm jbig2enc-extract + - + name: Show file structure + run: | + tree . 
+ - + name: Commit files + run: | + git config --global user.name "github-actions" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add pikepdf/ qpdf/ psycopg2/ jbig2enc/ + git commit -m "Updating installer packages" || true + git push origin || true diff --git a/Dockerfile b/Dockerfile index 9522728d9..6588802bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,5 @@ # syntax=docker/dockerfile:1.4 -# Pull the installer images from the library -# These are all built previously -# They provide either a .deb or .whl - -ARG JBIG2ENC_VERSION -ARG QPDF_VERSION -ARG PIKEPDF_VERSION -ARG PSYCOPG2_VERSION - -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder -FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder -FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder - FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend # This stage compiles the frontend @@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n LABEL org.opencontainers.image.licenses="GPL-3.0-only" ARG DEBIAN_FRONTEND=noninteractive -# Buildx provided +# Buildx provided, must be defined to use though ARG TARGETARCH ARG TARGETVARIANT # Workflow provided +ARG JBIG2ENC_VERSION ARG QPDF_VERSION +ARG PIKEPDF_VERSION +ARG PSYCOPG2_VERSION # # Begin installation and configuration # Order the steps below from least often changed to most # -# copy jbig2enc -# Basically will never change again -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/ -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/ -COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/ - # Packages need for running ARG 
RUNTIME_PACKAGES="\ # Python @@ -198,19 +181,29 @@ RUN set -eux \ # Install the built packages from the installer library images # Use mounts to avoid copying installer files into the image # These change sometimes -RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \ - --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \ - --mount=type=bind,from=pikepdf-builder,target=/pikepdf \ - set -eux \ +RUN set -eux \ + && echo "Getting binaries" \ + && mkdir paperless-ngx \ + && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \ + && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \ + && cd paperless-ngx \ + # Setting a specific revision ensures we know what this installed + # and ensures cache breaking on changes + && echo "Installing jbig2enc" \ + && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \ + && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \ && echo "Installing qpdf" \ - && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ - && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ + && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \ + && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \ && echo "Installing pikepdf and dependencies" \ - && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \ + && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \ && python3 -m pip list \ && echo "Installing psycopg2" \ - && python3 -m pip install --no-cache-dir 
/psycopg2/usr/src/wheels/psycopg2*.whl \ - && python3 -m pip list + && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \ + && python3 -m pip list \ + && echo "Cleaning up image layer" \ + && cd ../ \ + && rm -rf paperless-ngx WORKDIR /usr/src/paperless/src/ diff --git a/docker-builders/Dockerfile.jbig2enc b/docker-builders/Dockerfile.jbig2enc index 90318084f..388bdd1f7 100644 --- a/docker-builders/Dockerfile.jbig2enc +++ b/docker-builders/Dockerfile.jbig2enc @@ -29,7 +29,20 @@ RUN set -eux \ && ./autogen.sh \ && ./configure \ && make \ + && echo "Gathering package data" \ + && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \ && echo "Cleaning up image" \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && echo "Moving files around" \ + && mkdir build \ + # Unlink a symlink that causes problems + && unlink ./src/.libs/libjbig2enc.la \ + # Move what the link pointed to + && mv ./src/libjbig2enc.la ./build/ \ + # Move the shared library .so files + && mv ./src/.libs/libjbig2enc* ./build/ \ + # And move the cli binary + && mv ./src/jbig2 ./build/ \ + && mv ./pkg-list.txt ./build/ diff --git a/docker-builders/Dockerfile.pikepdf b/docker-builders/Dockerfile.pikepdf index c4d1ee1dc..e4181c538 100644 --- a/docker-builders/Dockerfile.pikepdf +++ b/docker-builders/Dockerfile.pikepdf @@ -7,12 +7,17 @@ # Default to pulling from the main repo registry when manually building ARG REPO="paperless-ngx/paperless-ngx" +# This does nothing, except provide a name for a copy below ARG QPDF_VERSION FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder -# This does nothing, except provide a name for a copy below - -FROM python:3.9-slim-bullseye as main +# +# Stage: builder +# Purpose: +# - Build the pikepdf wheel +# - Build any dependent wheels which can't be found +# +FROM 
python:3.9-slim-bullseye as builder LABEL org.opencontainers.image.description="A intermediate image with pikepdf wheel built" @@ -100,3 +105,14 @@ RUN set -eux \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ && rm -rf /var/lib/apt/lists/* + +# +# Stage: package +# Purpose: Holds the compiled .whl files in a tiny image to pull +# +FROM alpine:3.17 as package + +WORKDIR /usr/src/wheels/ + +COPY --from=builder /usr/src/wheels/*.whl ./ +COPY --from=builder /usr/src/wheels/pkg-list.txt ./ diff --git a/docker-builders/Dockerfile.psycopg2 b/docker-builders/Dockerfile.psycopg2 index 8fcf5264b..e3f182435 100644 --- a/docker-builders/Dockerfile.psycopg2 +++ b/docker-builders/Dockerfile.psycopg2 @@ -2,7 +2,12 @@ # Inputs: # - PSYCOPG2_VERSION - Version to build -FROM python:3.9-slim-bullseye as main +# +# Stage: builder +# Purpose: +# - Build the psycopg2 wheel +# +FROM python:3.9-slim-bullseye as builder LABEL org.opencontainers.image.description="A intermediate image with psycopg2 wheel built" @@ -48,3 +53,14 @@ RUN set -eux \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ && rm -rf /var/lib/apt/lists/* + +# +# Stage: package +# Purpose: Holds the compiled .whl files in a tiny image to pull +# +FROM alpine:3.17 as package + +WORKDIR /usr/src/wheels/ + +COPY --from=builder /usr/src/wheels/*.whl ./ +COPY --from=builder /usr/src/wheels/pkg-list.txt ./ diff --git a/docker-builders/README.md b/docker-builders/README.md new file mode 100644 index 000000000..6202719c6 --- /dev/null +++ b/docker-builders/README.md @@ -0,0 +1,57 @@ +# Installer Library + +This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image. + +## [jbig2enc](https://github.com/agl/jbig2enc) + +### Why + +JBIG is an image coding which can achieve better compression of images for PDFs. 
+ +### What + +The Docker image builds a shared library file and utility, which is copied into the correct location in the final image. + +### Updating + +1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf) +2. Update `.build-config.json` to the given version +3. If the Debian specific version has incremented, update `Dockerfile.qpdf` + +See Also: + +- [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html) + +## [psycopg2](https://www.psycopg.org/) + +### Why + +The pre-built wheels of psycopg2 are built on Debian 9, which provides a quite old version of libpq-dev. This causes issues with authentication methods. + +### What + +The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`. + +See Also: + +- [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266) + +## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html) + +### Why + +qpdf and its library provide tools to read, manipulate and fix up PDFs. Version 11 is also required by `pikepdf` 6+ and Debian 9 does not provide a version above 10. + +### What + +The Docker image cross compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/` + +## [pikepdf](https://pikepdf.readthedocs.io/en/latest/) + +### Why + +Required by OCRMyPDF, this is a general purpose library for PDF manipulation in Python via the qpdf libraries. + +### What + +The built wheels are placed into `/usr/src/wheels/` diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh index dad49774b..af2bfe2a7 100755 --- a/docker/docker-prepare.sh +++ b/docker/docker-prepare.sh @@ -80,7 +80,7 @@ django_checks() { search_index() { - local -r index_version=1 + local -r index_version=2 local -r index_version_file=${DATA_DIR}/.index_version if [[ (! 
-f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 61b1c072e..9a1abcfff 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption folder, but before any processing of the document is performed. This script can access the following relevant environment variables set: -- `DOCUMENT_SOURCE_PATH` +| Environment Variable | Description | +| ----------------------- | ------------------------------------------------------------ | +| `DOCUMENT_SOURCE_PATH` | Original path of the consumed document | +| `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on | + +!!! note + + Pre-consume scripts which modify the document should only change + the `DOCUMENT_WORKING_PATH` file or a second consume task may + be triggered, leading to failures as two tasks work on the + same document path A simple but common example for this would be creating a simple script like this: @@ -130,7 +140,7 @@ like this: ```bash #!/usr/bin/env bash -pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH} +pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH} ``` `/etc/paperless.conf` @@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and has moved it into paperless. 
It receives the following environment variables: -- `DOCUMENT_ID` -- `DOCUMENT_FILE_NAME` -- `DOCUMENT_CREATED` -- `DOCUMENT_MODIFIED` -- `DOCUMENT_ADDED` -- `DOCUMENT_SOURCE_PATH` -- `DOCUMENT_ARCHIVE_PATH` -- `DOCUMENT_THUMBNAIL_PATH` -- `DOCUMENT_DOWNLOAD_URL` -- `DOCUMENT_THUMBNAIL_URL` -- `DOCUMENT_CORRESPONDENT` -- `DOCUMENT_TAGS` -- `DOCUMENT_ORIGINAL_FILENAME` +| Environment Variable | Description | +| ---------------------------- | --------------------------------------------- | +| `DOCUMENT_ID` | Database primary key of the document | +| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths | +| `DOCUMENT_CREATED` | Date & time when document was created | +| `DOCUMENT_MODIFIED` | Date & time when document was last modified | +| `DOCUMENT_ADDED` | Date & time when document was added | +| `DOCUMENT_SOURCE_PATH` | Path to the original document file | +| `DOCUMENT_ARCHIVE_PATH` | Path to the generated archive file (if any) | +| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail | +| `DOCUMENT_DOWNLOAD_URL` | URL for document download | +| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail | +| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) | +| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) | +| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document | -The script can be in any language, but for a simple shell script -example, you can take a look at -[post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh) -in this project. +The script can be in any language. A simple shell script example: -The post consumption script cannot cancel the consumption process. +```bash title="post-consumption-example" +--8<-- "./scripts/post-consumption-example.sh" +``` + +!!! note + + The post consumption script cannot cancel the consumption process. + +!!! 
warning + + The post consumption script should not modify the document files + directly The script's stdout and stderr will be logged line by line to the webserver log, along with the exit code of the script. diff --git a/docs/changelog.md b/docs/changelog.md index 0e5a6fcba..5a9371781 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,9 @@ ## paperless-ngx 1.12.1 +_Note: Version 1.12.x introduced searching of comments which will work for comments added after the upgrade but a reindex of the search index is required in order to be able to search +older comments. The Docker image will automatically perform this reindex, bare metal installations will have to perform this manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._ + ### Bug Fixes - Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513)) diff --git a/mkdocs.yml b/mkdocs.yml index 0d56abc68..1e692d68b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,6 +41,7 @@ markdown_extensions: anchor_linenums: true - pymdownx.superfences - pymdownx.inlinehilite + - pymdownx.snippets strict: true nav: - index.md diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index 21ac728b3..edd742d45 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -5,242 +5,242 @@ Close - node_modules/src/alert/alert.ts - 47,48 - - - - Slide of - - node_modules/src/carousel/carousel.ts - 178,186 - - Currently selected slide number read by screen reader - - - Previous - - node_modules/src/carousel/carousel.ts - 213,215 - - - - Next - - node_modules/src/carousel/carousel.ts - 236 - - - - Select month - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - - Select year - - node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - 
node_modules/src/datepicker/datepicker-navigation-select.ts - 50,51 - - - - Previous month - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - - Next month - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - node_modules/src/datepicker/datepicker-navigation.ts - 60,63 - - - - «« - - node_modules/src/pagination/pagination.ts - 269,270 - - - - « - - node_modules/src/pagination/pagination.ts - 269,270 - - - - » - - node_modules/src/pagination/pagination.ts - 269,270 - - - - »» - - node_modules/src/pagination/pagination.ts - 269,270 - - - - First - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Previous - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Next - - node_modules/src/pagination/pagination.ts - 269,271 - - - - Last - - node_modules/src/pagination/pagination.ts - 269,271 - - - - - - node_modules/src/progressbar/progressbar.ts - 30,33 + node_modules/src/ngb-config.ts + 13 HH - node_modules/src/timepicker/timepicker.ts - 230,231 - - - - Hours - - node_modules/src/timepicker/timepicker.ts - 255,258 - - - - MM - - node_modules/src/timepicker/timepicker.ts - 280,282 - - - - Minutes - - node_modules/src/timepicker/timepicker.ts - 298,299 - - - - Increment hours - - node_modules/src/timepicker/timepicker.ts - 328,329 - - - - Decrement hours - - node_modules/src/timepicker/timepicker.ts - 350,356 - - - - Increment minutes - - node_modules/src/timepicker/timepicker.ts - 383,384 - - - - Decrement minutes - - node_modules/src/timepicker/timepicker.ts - 412,416 - - - - SS - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Increment seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - Decrement seconds - - node_modules/src/timepicker/timepicker.ts - 429 - - - - - - node_modules/src/timepicker/timepicker.ts - 429 - - - - - - 
node_modules/src/timepicker/timepicker.ts - 429 + node_modules/src/ngb-config.ts + 13 Close - node_modules/src/toast/toast.ts - 74,75 + node_modules/src/ngb-config.ts + 13 + + + + «« + + node_modules/src/ngb-config.ts + 13 + + + + Select month + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + Previous month + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + + + node_modules/src/ngb-config.ts + 13 + + + + Slide of + + node_modules/src/ngb-config.ts + 13 + + Currently selected slide number read by screen reader + + + Hours + + node_modules/src/ngb-config.ts + 13 + + + + « + + node_modules/src/ngb-config.ts + 13 + + + + Previous + + node_modules/src/ngb-config.ts + 13 + + + + MM + + node_modules/src/ngb-config.ts + 13 + + + + » + + node_modules/src/ngb-config.ts + 13 + + + + Select year + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + Next month + + node_modules/src/ngb-config.ts + 13 + + + node_modules/src/ngb-config.ts + 13 + + + + Next + + node_modules/src/ngb-config.ts + 13 + + + + Minutes + + node_modules/src/ngb-config.ts + 13 + + + + »» + + node_modules/src/ngb-config.ts + 13 + + + + Increment hours + + node_modules/src/ngb-config.ts + 13 + + + + First + + node_modules/src/ngb-config.ts + 13 + + + + Previous + + node_modules/src/ngb-config.ts + 13 + + + + Decrement hours + + node_modules/src/ngb-config.ts + 13 + + + + Next + + node_modules/src/ngb-config.ts + 13 + + + + Increment minutes + + node_modules/src/ngb-config.ts + 13 + + + + Last + + node_modules/src/ngb-config.ts + 13 + + + + Decrement minutes + + node_modules/src/ngb-config.ts + 13 + + + + SS + + node_modules/src/ngb-config.ts + 13 + + + + Seconds + + node_modules/src/ngb-config.ts + 13 + + + + Increment seconds + + node_modules/src/ngb-config.ts + 13 + + + + Decrement seconds + + node_modules/src/ngb-config.ts + 13 + + + + + + node_modules/src/ngb-config.ts + 13 + + + + + + 
node_modules/src/ngb-config.ts + 13 @@ -967,7 +967,7 @@ src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 37 + 38 src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html @@ -1006,7 +1006,7 @@ src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 38 + 39 src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html @@ -1208,102 +1208,109 @@ 15 + + Rule order + + src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html + 16 + + Paperless will only process mails that match all of the filters specified below. src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 18 + 19 Filter from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 19 + 20 Filter subject src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 20 + 21 Filter body src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 21 + 22 Filter attachment filename src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 22 + 23 Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive. src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 22 + 23 Action src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 25 + 26 Action is only performed when documents are consumed from the mail. Mails without attachments remain entirely untouched. 
src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 25 + 26 Action parameter src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 26 + 27 Assign title from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 27 + 28 Assign document type src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 29 + 30 Assign correspondent from src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 30 + 31 Assign correspondent src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 31 + 32 Error src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html - 36 + 37 src/app/services/toast.service.ts @@ -1965,7 +1972,7 @@ of src/app/components/document-detail/document-detail.component.html - 5 + 5,6 @@ -1980,7 +1987,7 @@ src/app/components/document-list/document-card-large/document-card-large.component.html - 58 + 64 src/app/components/document-list/document-card-small/document-card-small.component.html @@ -2013,7 +2020,7 @@ src/app/components/document-list/document-card-large/document-card-large.component.html - 38 + 44 @@ -2262,7 +2269,7 @@ Confirm delete src/app/components/document-detail/document-detail.component.ts - 442 + 449 src/app/components/manage/management-list/management-list.component.ts @@ -2273,35 +2280,35 @@ Do you really want to delete document ""? src/app/components/document-detail/document-detail.component.ts - 443 + 450 The files for this document will be deleted permanently. This operation cannot be undone. 
src/app/components/document-detail/document-detail.component.ts - 444 + 451 Delete document src/app/components/document-detail/document-detail.component.ts - 446 + 453 Error deleting document: src/app/components/document-detail/document-detail.component.ts - 462 + 469 Redo OCR confirm src/app/components/document-detail/document-detail.component.ts - 482 + 489 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2312,14 +2319,14 @@ This operation will permanently redo OCR for this document. src/app/components/document-detail/document-detail.component.ts - 483 + 490 This operation cannot be undone. src/app/components/document-detail/document-detail.component.ts - 484 + 491 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2342,7 +2349,7 @@ Proceed src/app/components/document-detail/document-detail.component.ts - 486 + 493 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2361,7 +2368,7 @@ Redo OCR operation will begin in the background. Close and re-open or reload this document after the operation has completed to see new content. 
src/app/components/document-detail/document-detail.component.ts - 494 + 501 @@ -2370,7 +2377,7 @@ )"/> src/app/components/document-detail/document-detail.component.ts - 505,507 + 512,514 @@ -2701,7 +2708,7 @@ Edit src/app/components/document-list/document-card-large/document-card-large.component.html - 43 + 49 src/app/components/document-list/document-card-small/document-card-small.component.html @@ -2752,14 +2759,14 @@ View src/app/components/document-list/document-card-large/document-card-large.component.html - 50 + 56 Filter by document type src/app/components/document-list/document-card-large/document-card-large.component.html - 63 + 69 src/app/components/document-list/document-list.component.html @@ -2770,7 +2777,7 @@ Filter by storage path src/app/components/document-list/document-card-large/document-card-large.component.html - 70 + 76 src/app/components/document-list/document-list.component.html @@ -2781,40 +2788,40 @@ Created: src/app/components/document-list/document-card-large/document-card-large.component.html - 85 + 91,92 src/app/components/document-list/document-card-small/document-card-small.component.html - 48 + 48,49 Added: src/app/components/document-list/document-card-large/document-card-large.component.html - 86 + 92,93 src/app/components/document-list/document-card-small/document-card-small.component.html - 49 + 49,50 Modified: src/app/components/document-list/document-card-large/document-card-large.component.html - 87 + 93,94 src/app/components/document-list/document-card-small/document-card-small.component.html - 50 + 50,51 Score: src/app/components/document-list/document-card-large/document-card-large.component.html - 98 + 104 @@ -2926,7 +2933,7 @@ ASN src/app/components/document-list/document-list.component.html - 127 + 128,127 src/app/components/document-list/filter-editor/filter-editor.component.ts @@ -3420,21 +3427,21 @@ Short: src/app/components/manage/settings/settings.component.html - 56 + 56,57 Medium: 
src/app/components/manage/settings/settings.component.html - 60 + 60,61 Long: src/app/components/manage/settings/settings.component.html - 64 + 64,65 @@ -3532,14 +3539,14 @@ Update checking works by pinging the the public Github API for the latest release to determine whether a new version is available. Actual updating of the app must still be performed manually. src/app/components/manage/settings/settings.component.html - 139,142 + 140,142 - No tracking data is collected by the app in any way. + No tracking data is collected by the app in any way. src/app/components/manage/settings/settings.component.html - 144 + 144,146 @@ -3549,8 +3556,8 @@ 146 - - Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release. + + Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release. src/app/components/manage/settings/settings.component.html 146 @@ -3658,7 +3665,7 @@ Mail src/app/components/manage/settings/settings.component.html - 231 + 232,231 @@ -4191,18 +4198,25 @@ 15 + + Document with ASN already exists. + + src/app/services/consumer-status.service.ts + 16 + + File not found. src/app/services/consumer-status.service.ts - 16 + 17 Pre-consume script does not exist. src/app/services/consumer-status.service.ts - 17 + 18 Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4210,7 +4224,7 @@ Error while executing pre-consume script. src/app/services/consumer-status.service.ts - 18 + 19 Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4218,7 +4232,7 @@ Post-consume script does not exist. 
src/app/services/consumer-status.service.ts - 19 + 20 Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4226,7 +4240,7 @@ Error while executing post-consume script. src/app/services/consumer-status.service.ts - 20 + 21 Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation @@ -4234,49 +4248,49 @@ Received new file. src/app/services/consumer-status.service.ts - 21 + 22 File type not supported. src/app/services/consumer-status.service.ts - 22 + 23 Processing document... src/app/services/consumer-status.service.ts - 23 + 24 Generating thumbnail... src/app/services/consumer-status.service.ts - 24 + 25 Retrieving date from document... src/app/services/consumer-status.service.ts - 25 + 26 Saving document... src/app/services/consumer-status.service.ts - 26 + 27 Finished. src/app/services/consumer-status.service.ts - 27 + 28 @@ -4336,165 +4350,172 @@ 145 + + Arabic + + src/app/services/settings.service.ts + 151 + + Belarusian src/app/services/settings.service.ts - 151 + 157 Czech src/app/services/settings.service.ts - 157 + 163 Danish src/app/services/settings.service.ts - 163 + 169 German src/app/services/settings.service.ts - 169 + 175 English (GB) src/app/services/settings.service.ts - 175 + 181 Spanish src/app/services/settings.service.ts - 181 + 187 French src/app/services/settings.service.ts - 187 + 193 Italian src/app/services/settings.service.ts - 193 + 199 Luxembourgish src/app/services/settings.service.ts - 199 + 205 Dutch src/app/services/settings.service.ts - 205 + 211 Polish src/app/services/settings.service.ts - 211 + 217 Portuguese (Brazil) src/app/services/settings.service.ts - 217 + 223 Portuguese src/app/services/settings.service.ts - 223 + 229 Romanian src/app/services/settings.service.ts - 229 + 235 Russian src/app/services/settings.service.ts - 235 + 241 Slovenian src/app/services/settings.service.ts - 241 + 247 Serbian 
src/app/services/settings.service.ts - 247 + 253 Swedish src/app/services/settings.service.ts - 253 + 259 Turkish src/app/services/settings.service.ts - 259 + 265 Chinese Simplified src/app/services/settings.service.ts - 265 + 271 ISO 8601 src/app/services/settings.service.ts - 282 + 288 Successfully completed one-time migratration of settings to the database! src/app/services/settings.service.ts - 393 + 399 Unable to migrate settings to the database, please try saving manually. src/app/services/settings.service.ts - 394 + 400 diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index f99f547e6..19f85398b 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -204,6 +204,10 @@ export class DocumentDetailComponent ) .subscribe({ next: (titleValue) => { + // In the rare case when the field changed just after debounced event was fired. + // We dont want to overwrite whats actually in the text field, so just return + if (titleValue !== this.titleInput.value) return + this.title = titleValue this.documentForm.patchValue({ title: titleValue }) }, diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html index c114a2d6e..b18524e38 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html @@ -26,11 +26,11 @@

- + - + {{contentTrimmed}}

diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts index b43187879..5d24042b9 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts @@ -70,6 +70,22 @@ export class DocumentCardLargeComponent { } } + get searchCommentHighlights() { + let highlights = [] + if ( + this.document['__search_hit__'] && + this.document['__search_hit__'].comment_highlights + ) { + // only show comments with a match + highlights = ( + this.document['__search_hit__'].comment_highlights as string + ) + .split(',') + .filter((higlight) => higlight.includes(' No tracking data is collected by the app in any way.

- + diff --git a/src-ui/src/environments/environment.prod.ts b/src-ui/src/environments/environment.prod.ts index 832f69378..16cbe7df6 100644 --- a/src-ui/src/environments/environment.prod.ts +++ b/src-ui/src/environments/environment.prod.ts @@ -5,7 +5,7 @@ export const environment = { apiBaseUrl: document.baseURI + 'api/', apiVersion: '2', appTitle: 'Paperless-ngx', - version: '1.12.1', + version: '1.12.1-dev', webSocketHost: window.location.host, webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:', webSocketBaseUrl: base_url.pathname + 'ws/', diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 82b8afecc..6e3ecfe05 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -4,7 +4,6 @@ import shutil import tempfile from dataclasses import dataclass from functools import lru_cache -from math import ceil from pathlib import Path from typing import List from typing import Optional @@ -12,10 +11,9 @@ from typing import Optional import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page -from pikepdf import PasswordError from pikepdf import Pdf -from pikepdf import PdfImage from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -154,52 +152,15 @@ def scan_file_for_barcodes( (page_number, barcode_text) tuples """ - def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]: - detected_barcodes = [] - with Pdf.open(pdf_filepath) as pdf: - for page_num, page in enumerate(pdf.pages): - for image_key in page.images: - pdfimage = PdfImage(page.images[image_key]) - - # This type is known to have issues: - # https://github.com/pikepdf/pikepdf/issues/401 - if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError( - "Unable to decode CCITTFaxDecode images", - ) - - # Not all images can be transcoded to a PIL image, which - # is what pyzbar expects to receive, so this may - # 
raise an exception, triggering fallback - pillow_img = pdfimage.as_pil_image() - - # Scale the image down - # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385 - # TLDR: zbar has issues with larger images - width, height = pillow_img.size - if width > 1024: - scaler = ceil(width / 1024) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - width, height = pillow_img.size - if height > 2048: - scaler = ceil(height / 2048) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - for barcode_value in barcode_reader(pillow_img): - detected_barcodes.append(Barcode(page_num, barcode_value)) - - return detected_barcodes - def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: detected_barcodes = [] # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: - pages_from_path = convert_from_path(pdf_filepath, output_folder=path) + pages_from_path = convert_from_path( + pdf_filepath, + dpi=300, + output_folder=path, + ) for current_page_number, page in enumerate(pages_from_path): for barcode_value in barcode_reader(page): detected_barcodes.append( @@ -219,27 +180,19 @@ def scan_file_for_barcodes( # Always try pikepdf first, it's usually fine, faster and # uses less memory try: - barcodes = _pikepdf_barcode_scan(pdf_filepath) + barcodes = _pdf2image_barcode_scan(pdf_filepath) # Password protected files can't be checked - except PasswordError as e: + # This is the exception raised for those + except PDFPageCountError as e: logger.warning( f"File is likely password protected, not checking for barcodes: {e}", ) - # Handle pikepdf related image decoding issues with a fallback to page - # by page conversion to images in a temporary directory - except Exception as e: + # This file is really borked, allow the consumption to continue + # but it may 
fail further on + except Exception as e: # pragma: no cover logger.warning( - f"Falling back to pdf2image because: {e}", + f"Exception during barcode scanning: {e}", ) - try: - barcodes = _pdf2image_barcode_scan(pdf_filepath) - # This file is really borked, allow the consumption to continue - # but it may fail further on - except Exception as e: # pragma: no cover - logger.warning( - f"Exception during barcode scanning: {e}", - ) - else: logger.warning( f"Unsupported file format for barcode reader: {str(mime_type)}", diff --git a/src/documents/consumer.py b/src/documents/consumer.py index bc344abb9..8c80304d3 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,7 +1,10 @@ import datetime import hashlib import os +import shutil +import tempfile import uuid +from pathlib import Path from subprocess import CompletedProcess from subprocess import run from typing import Optional @@ -94,7 +97,8 @@ class Consumer(LoggingMixin): def __init__(self): super().__init__() - self.path = None + self.path: Optional[Path] = None + self.original_path: Optional[Path] = None self.filename = None self.override_title = None self.override_correspondent_id = None @@ -167,16 +171,18 @@ class Consumer(LoggingMixin): self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") - filepath_arg = os.path.normpath(self.path) + working_file_path = str(self.path) + original_file_path = str(self.original_path) script_env = os.environ.copy() - script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg + script_env["DOCUMENT_SOURCE_PATH"] = original_file_path + script_env["DOCUMENT_WORKING_PATH"] = working_file_path try: completed_proc = run( args=[ settings.PRE_CONSUME_SCRIPT, - filepath_arg, + original_file_path, ], env=script_env, capture_output=True, @@ -195,7 +201,7 @@ class Consumer(LoggingMixin): exception=e, ) - def run_post_consume_script(self, document): + def run_post_consume_script(self, document: Document): if not settings.POST_CONSUME_SCRIPT: return @@ 
-285,8 +291,8 @@ class Consumer(LoggingMixin): Return the document object if it was successfully created. """ - self.path = path - self.filename = override_filename or os.path.basename(path) + self.path = Path(path).resolve() + self.filename = override_filename or self.path.name self.override_title = override_title self.override_correspondent_id = override_correspondent_id self.override_document_type_id = override_document_type_id @@ -311,6 +317,15 @@ class Consumer(LoggingMixin): self.log("info", f"Consuming {self.filename}") + # For the actual work, copy the file into a tempdir + self.original_path = self.path + tempdir = tempfile.TemporaryDirectory( + prefix="paperless-ngx", + dir=settings.SCRATCH_DIR, + ) + self.path = Path(tempdir.name) / Path(self.filename) + shutil.copy(self.original_path, self.path) + # Determine the parser class. mime_type = magic.from_file(self.path, mime=True) @@ -453,11 +468,12 @@ class Consumer(LoggingMixin): # Delete the file only if it was successfully consumed self.log("debug", f"Deleting file {self.path}") os.unlink(self.path) + self.original_path.unlink() # https://github.com/jonaswinkler/paperless-ng/discussions/1037 shadow_file = os.path.join( - os.path.dirname(self.path), - "._" + os.path.basename(self.path), + os.path.dirname(self.original_path), + "._" + os.path.basename(self.original_path), ) if os.path.isfile(shadow_file): @@ -474,6 +490,7 @@ class Consumer(LoggingMixin): ) finally: document_parser.cleanup() + tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png 
b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 7beeee288..4f7f1278a 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1,9 +1,7 @@ import os import shutil -import tempfile from unittest import mock -import pikepdf from django.conf import settings from django.test import override_settings from django.test import TestCase @@ -23,13 +21,29 @@ class TestBarcode(DirectoriesMixin, TestCase): BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") - def test_barcode_reader(self): + def test_barcode_reader_png(self): + """ + GIVEN: + - PNG file with separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png") img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + separator_barcode = settings.CONSUMER_BARCODE_STRING self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader2(self): + def test_barcode_reader_pbm(self): + """ + GIVEN: + - Netpbm bitmap file with separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pbm", @@ -38,25 +52,49 @@ class TestBarcode(DirectoriesMixin, TestCase): separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion(self): + def test_barcode_reader_distortion_scratchy(self): + """ + GIVEN: + - Image containing high noise + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file 
= os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion.png", + "barcode-39-PATCHT-distortion.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion2(self): + def test_barcode_reader_distortion_stretched(self): + """ + GIVEN: + - Image with a stretched barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion2.png", + "barcode-39-PATCHT-distortion2.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_unreadable(self): + """ + GIVEN: + - Image with a truly unreadable barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT-unreadable.png", @@ -65,6 +103,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_qr(self): + """ + GIVEN: + - Image file with QR separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "qr-code-PATCHT.png", @@ -74,6 +120,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_128(self): + """ + GIVEN: + - Image file with 128 style separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-PATCHT.png", @@ -83,11 +137,27 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_no_barcode(self): + """ + GIVEN: 
+ - Image file with no barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join(self.SAMPLE_DIR, "simple.png") img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), []) + self.assertListEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_custom_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.png", @@ -96,6 +166,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def test_barcode_reader_custom_qr_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value as a QR code + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.png", @@ -104,6 +182,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def test_barcode_reader_custom_128_separator(self): + """ + GIVEN: + - Image file with custom separator 128 barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-custom.png", @@ -111,6 +197,679 @@ class TestBarcode(DirectoriesMixin, TestCase): img = Image.open(test_file) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) + def test_get_mime_type(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ + tiff_file = os.path.join( + self.SAMPLE_DIR, + "simple.tiff", + ) + pdf_file = os.path.join( + self.SAMPLE_DIR, + "simple.pdf", + ) + png_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-128-custom.png", + ) + tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1") + pdf_file_no_extension = 
os.path.join(settings.SCRATCH_DIR, "testfile2") + shutil.copy(tiff_file, tiff_file_no_extension) + shutil.copy(pdf_file, pdf_file_no_extension) + + self.assertEqual(barcodes.get_file_mime_type(tiff_file), "image/tiff") + self.assertEqual(barcodes.get_file_mime_type(pdf_file), "application/pdf") + self.assertEqual( + barcodes.get_file_mime_type(tiff_file_no_extension), + "image/tiff", + ) + self.assertEqual( + barcodes.get_file_mime_type(pdf_file_no_extension), + "application/pdf", + ) + self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png") + + def test_convert_from_tiff_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ + test_file = os.path.join( + os.path.dirname(__file__), + "samples", + "simple.tiff", + ) + dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff") + shutil.copy(test_file, dst) + target_file = barcodes.convert_from_tiff_to_pdf(dst) + file_extension = os.path.splitext(os.path.basename(target_file))[1] + self.assertTrue(os.path.isfile(target_file)) + self.assertEqual(file_extension, ".pdf") + + def test_convert_error_from_pdf_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ + test_file = os.path.join( + self.SAMPLE_DIR, + "simple.pdf", + ) + dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf") + shutil.copy(test_file, dst) + self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst)) + + def test_scan_file_for_separating_barcodes(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [0]) + + def test_scan_file_for_separating_barcodes_none_present(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ + test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") + 
doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, []) + + def test_scan_file_for_separating_barcodes_middle_page(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [1]) + + def test_scan_file_for_separating_barcodes_multiple_pages(self): + """ + GIVEN: + - PDF file containing a separator on pages 2 and 5 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on pages 2 and 5 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "several-patcht-codes.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [2, 5]) + + def test_scan_file_for_separating_barcodes_upside_down(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + - The barcode is upside down + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle_reverse.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = 
barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [1]) + + def test_scan_file_for_separating_barcodes_fax_decode(self): + """ + GIVEN: + - A PDF containing an image encoded as CCITT Group 4 encoding + WHEN: + - Barcode processing happens with the file + THEN: + - The barcode is still detected + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-fax-image.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [1]) + + def test_scan_file_for_separating_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-qr.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-custom.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + 
doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-qr-custom.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [0]) + + @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") + def test_scan_file_for_separating_custom_128_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a 128 code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-128-custom.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, [0]) + + def test_scan_file_for_separating_wrong_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode value is customized + - The separation value is NOT customized + WHEN: + - File is scanned for barcodes + THEN: + - No split pages are detected + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + 
"barcode-39-custom.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, []) + + @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC") + def test_scan_file_for_separating_qr_barcodes(self): + """ + GIVEN: + - Input PDF with certain QR codes that aren't detected at current size + WHEN: + - The input file is scanned for barcodes + THEN: + - QR codes are detected + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "many-qr-codes.pdf", + ) + + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertGreater(len(doc_barcode_info.barcodes), 0) + self.assertListEqual(separator_page_numbers, [1]) + + def test_separate_pages(self): + """ + GIVEN: + - Input PDF 2 pages after separation + WHEN: + - The input file separated at the barcode + THEN: + - Two new documents are produced + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.pdf", + ) + documents = barcodes.separate_pages(test_file, [1]) + + self.assertEqual(len(documents), 2) + + def test_separate_pages_double_code(self): + """ + GIVEN: + - Input PDF with two patch code pages in a row + WHEN: + - The input file is split + THEN: + - Only two files are output + """ + test_file = os.path.join( + os.path.dirname(__file__), + self.BARCODE_SAMPLE_DIR, + "patch-code-t-double.pdf", + ) + pages = barcodes.separate_pages(test_file, [1, 2]) + + self.assertEqual(len(pages), 2) + + def test_separate_pages_no_list(self): + """ + GIVEN: + - Input file to separate + WHEN: + - No separation pages are provided + THEN: + - No new documents are produced + - A warning is logged + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + 
"patch-code-t-middle.pdf", + ) + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + pages = barcodes.separate_pages(test_file, []) + self.assertEqual(pages, []) + self.assertEqual( + cm.output, + [ + "WARNING:paperless.barcodes:No pages to split on!", + ], + ) + + def test_save_to_dir(self): + """ + GIVEN: + - File to save to a directory + WHEN: + - The file is saved + THEN: + - The file exists + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t.pdf", + ) + barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR) + target_file = os.path.join(settings.SCRATCH_DIR, "patch-code-t.pdf") + self.assertTrue(os.path.isfile(target_file)) + + def test_save_to_dir_not_existing(self): + """ + GIVEN: + - File to save to a directory + - The directory doesn't exist + WHEN: + - The file is saved + THEN: + - The file exists + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t.pdf", + ) + nonexistingdir = "/nowhere" + if os.path.isdir(nonexistingdir): + self.fail("non-existing dir exists") + + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + barcodes.save_to_dir(test_file, target_dir=nonexistingdir) + self.assertEqual( + cm.output, + [ + f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", + ], + ) + + def test_save_to_dir_given_name(self): + """ + GIVEN: + - File to save to a directory + - There is a name override + WHEN: + - The file is saved + THEN: + - The file exists + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t.pdf", + ) + barcodes.save_to_dir( + test_file, + newname="newname.pdf", + target_dir=settings.SCRATCH_DIR, + ) + target_file = os.path.join(settings.SCRATCH_DIR, "newname.pdf") + self.assertTrue(os.path.isfile(target_file)) + + def test_barcode_splitter(self): + """ + GIVEN: + - Input file containing barcodes + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced + """ + test_file 
= os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.pdf", + ) + + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(test_file, doc_barcode_info.pdf_path) + self.assertTrue(len(separator_page_numbers) > 0) + + document_list = barcodes.separate_pages(test_file, separator_page_numbers) + self.assertGreater(len(document_list), 0) + + for document in document_list: + barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR) + + target_file1 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_0.pdf", + ) + target_file2 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_1.pdf", + ) + + self.assertTrue(os.path.isfile(target_file1)) + self.assertTrue(os.path.isfile(target_file2)) + + @override_settings(CONSUMER_ENABLE_BARCODES=True) + def test_consume_barcode_file(self): + """ + GIVEN: + - Input file with barcodes given to consume task + WHEN: + - Consume task returns + THEN: + - The file was split + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.pdf", + ) + + dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf") + shutil.copy(test_file, dst) + + with mock.patch("documents.tasks.async_to_sync"): + self.assertEqual(tasks.consume_file(dst), "File successfully split") + + @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + def test_consume_barcode_tiff_file(self): + """ + GIVEN: + - TIFF image containing barcodes + WHEN: + - Consume task returns + THEN: + - The file was split + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.tiff", + ) + dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff") + shutil.copy(test_file, dst) + + with mock.patch("documents.tasks.async_to_sync"): + self.assertEqual(tasks.consume_file(dst), "File successfully split") + 
+ @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + @mock.patch("documents.consumer.Consumer.try_consume_file") + def test_consume_barcode_unsupported_jpg_file(self, m): + """ + GIVEN: + - JPEG image as input + WHEN: + - Consume task returns + THEN: + - Barcode reader reported warning + - Consumption continued with the file + """ + test_file = os.path.join( + self.SAMPLE_DIR, + "simple.jpg", + ) + dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") + shutil.copy(test_file, dst) + + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + self.assertIn("Success", tasks.consume_file(dst)) + + self.assertListEqual( + cm.output, + [ + "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg", + ], + ) + m.assert_called_once() + + args, kwargs = m.call_args + self.assertIsNone(kwargs["override_filename"]) + self.assertIsNone(kwargs["override_title"]) + self.assertIsNone(kwargs["override_correspondent_id"]) + self.assertIsNone(kwargs["override_document_type_id"]) + self.assertIsNone(kwargs["override_tag_ids"]) + + @override_settings( + CONSUMER_ENABLE_BARCODES=True, + CONSUMER_BARCODE_TIFF_SUPPORT=True, + ) + def test_consume_barcode_supported_no_extension_file(self): + """ + GIVEN: + - TIFF image containing barcodes + - TIFF file is given without extension + WHEN: + - Consume task returns + THEN: + - The file was split + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "patch-code-t-middle.tiff", + ) + dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle") + shutil.copy(test_file, dst) + + with mock.patch("documents.tasks.async_to_sync"): + self.assertEqual(tasks.consume_file(dst), "File successfully split") + + def test_scan_file_for_separating_barcodes_password(self): + """ + GIVEN: + - Password protected PDF + WHEN: + - File is scanned for barcode + THEN: + - Scanning handles the exception without crashing + """ + test_file = os.path.join(self.SAMPLE_DIR, 
"password-is-test.pdf") + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + warning = cm.output[0] + expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" + self.assertTrue(warning.startswith(expected_str)) + + separator_page_numbers = barcodes.get_separating_barcodes( + doc_barcode_info.barcodes, + ) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertListEqual(separator_page_numbers, []) + + +class TestAsnBarcodes(DirectoriesMixin, TestCase): + + SAMPLE_DIR = os.path.join( + os.path.dirname(__file__), + "samples", + ) + + BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") + def test_barcode_reader_asn_normal(self): """ GIVEN: @@ -163,528 +922,81 @@ class TestBarcode(DirectoriesMixin, TestCase): img = Image.open(test_file) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) - def test_get_mime_type(self): - tiff_file = os.path.join( - self.SAMPLE_DIR, - "simple.tiff", - ) - pdf_file = os.path.join( - self.SAMPLE_DIR, - "simple.pdf", - ) - png_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-128-custom.png", - ) - tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1") - pdf_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile2") - shutil.copy(tiff_file, tiff_file_no_extension) - shutil.copy(pdf_file, pdf_file_no_extension) - - self.assertEqual(barcodes.get_file_mime_type(tiff_file), "image/tiff") - self.assertEqual(barcodes.get_file_mime_type(pdf_file), "application/pdf") - self.assertEqual( - barcodes.get_file_mime_type(tiff_file_no_extension), - "image/tiff", - ) - self.assertEqual( - barcodes.get_file_mime_type(pdf_file_no_extension), - "application/pdf", - ) - self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png") - - def test_convert_from_tiff_to_pdf(self): - test_file = os.path.join( - os.path.dirname(__file__), - 
"samples", - "simple.tiff", - ) - dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff") - shutil.copy(test_file, dst) - target_file = barcodes.convert_from_tiff_to_pdf(dst) - file_extension = os.path.splitext(os.path.basename(target_file))[1] - self.assertTrue(os.path.isfile(target_file)) - self.assertEqual(file_extension, ".pdf") - - def test_convert_error_from_pdf_to_pdf(self): - test_file = os.path.join( - self.SAMPLE_DIR, - "simple.pdf", - ) - dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf") - shutil.copy(test_file, dst) - self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst)) - - def test_scan_file_for_separating_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) - - def test_scan_file_for_separating_barcodes_none_present(self): - test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) - - def test_scan_file_for_separating_barcodes3(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) - - def test_scan_file_for_separating_barcodes4(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "several-patcht-codes.pdf", - ) - doc_barcode_info = 
barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [2, 5]) - - def test_scan_file_for_separating_barcodes_upsidedown(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle_reverse.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) - - def test_scan_file_for_barcodes_pillow_transcode_error(self): + @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") + def test_scan_file_for_asn_custom_prefix(self): """ GIVEN: - - A PDF containing an image which cannot be transcoded to a PIL image + - PDF containing an ASN barcode with custom prefix + - The ASN value is 123 WHEN: - - The image tries to be transcoded to a PIL image, but fails + - File is scanned for barcodes THEN: - - The barcode reader is still called + - The ASN is located + - The ASN integer value is correct """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-custom-prefix.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - def _build_device_n_pdf(self, save_path: str): - # Based on the pikepdf tests - # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py - pdf = pikepdf.new() - pdf.add_blank_page(page_size=(72, 72)) - imobj = pikepdf.Stream( - pdf, - bytes(range(0, 256)), - BitsPerComponent=8, - ColorSpace=pikepdf.Array( - [ - pikepdf.Name.DeviceN, - pikepdf.Array([pikepdf.Name.Black]), - pikepdf.Name.DeviceCMYK, - pikepdf.Stream( - pdf, - b"{0 0 0 4 -1 roll}", # 
Colorspace conversion function - FunctionType=4, - Domain=[0.0, 1.0], - Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], - ), - ], - ), - Width=16, - Height=16, - Type=pikepdf.Name.XObject, - Subtype=pikepdf.Name.Image, - ) - pim = pikepdf.PdfImage(imobj) - self.assertEqual(pim.mode, "DeviceN") - self.assertTrue(pim.is_device_n) + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertEqual(asn, 123) - pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do") - pdf.pages[0].Resources = pikepdf.Dictionary( - XObject=pikepdf.Dictionary(Im0=imobj), - ) - pdf.save(save_path) - - with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf: - # Build an offending file - _build_device_n_pdf(self, str(device_n_pdf.name)) - with mock.patch("documents.barcodes.barcode_reader") as reader: - reader.return_value = list() - - _ = barcodes.scan_file_for_barcodes( - str(device_n_pdf.name), - ) - - reader.assert_called() - - def test_scan_file_for_separating_barcodes_fax_decode(self): + def test_scan_file_for_asn_barcode_invalid(self): """ GIVEN: - - A PDF containing an image encoded as CCITT Group 4 encoding + - PDF containing an ASN barcode + - The ASN value is XYZXYZ WHEN: - - Barcode processing happens with the file + - File is scanned for barcodes THEN: - - The barcode is still detected + - The ASN is located + - The ASN value is not used """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-fax-image.pdf", + "barcode-39-asn-invalid.pdf", ) doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) + + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [1]) + self.assertEqual(asn, None) - def test_scan_file_for_separating_qr_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-qr.pdf", - ) - 
doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) - - @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") - def test_scan_file_for_separating_custom_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-custom.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) - - @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") - def test_scan_file_for_separating_custom_qr_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-qr-custom.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) - - @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") - def test_scan_file_for_separating_custom_128_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-128-custom.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, [0]) - - def test_scan_file_for_separating_wrong_qr_barcodes(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-custom.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = 
barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) - - @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC") - def test_scan_file_for_separating_qr_barcodes(self): + @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) + def test_consume_barcode_file_asn_assignment(self): """ GIVEN: - - Input PDF with certain QR codes that aren't detected at current size + - PDF containing an ASN barcode + - The ASN value is 123 WHEN: - - The input file is scanned for barcodes + - File is scanned for barcodes THEN: - - QR codes are detected + - The ASN is located + - The ASN integer value is correct + - The ASN is provided as the override value to the consumer """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "many-qr-codes.pdf", + "barcode-39-asn-123.pdf", ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertGreater(len(doc_barcode_info.barcodes), 0) - self.assertListEqual(separator_page_numbers, [1]) - - def test_separate_pages(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - pages = barcodes.separate_pages(test_file, [1]) - - self.assertEqual(len(pages), 2) - - def test_separate_pages_double_code(self): - """ - GIVEN: - - Input PDF with two patch code pages in a row - WHEN: - - The input file is split - THEN: - - Only two files are output - """ - test_file = os.path.join( - os.path.dirname(__file__), - "samples", - "barcodes", - "patch-code-t-double.pdf", - ) - pages = barcodes.separate_pages(test_file, [1, 2]) - - self.assertEqual(len(pages), 2) - - def test_separate_pages_no_list(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - pages = 
barcodes.separate_pages(test_file, []) - self.assertEqual(pages, []) - self.assertEqual( - cm.output, - [ - "WARNING:paperless.barcodes:No pages to split on!", - ], - ) - - def test_save_to_dir(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t.pdf", - ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - barcodes.save_to_dir(test_file, target_dir=tempdir) - target_file = os.path.join(tempdir, "patch-code-t.pdf") - self.assertTrue(os.path.isfile(target_file)) - - def test_save_to_dir2(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t.pdf", - ) - nonexistingdir = "/nowhere" - if os.path.isdir(nonexistingdir): - self.fail("non-existing dir exists") - else: - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - barcodes.save_to_dir(test_file, target_dir=nonexistingdir) - self.assertEqual( - cm.output, - [ - f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", - ], - ) - - def test_save_to_dir3(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t.pdf", - ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - barcodes.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) - target_file = os.path.join(tempdir, "newname.pdf") - self.assertTrue(os.path.isfile(target_file)) - - def test_barcode_splitter(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(test_file, doc_barcode_info.pdf_path) - self.assertTrue(len(separator_page_numbers) > 0) - - document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertTrue(document_list) - for document in document_list: - 
barcodes.save_to_dir(document, target_dir=tempdir) - - target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") - target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") - - self.assertTrue(os.path.isfile(target_file1)) - self.assertTrue(os.path.isfile(target_file2)) - - @override_settings(CONSUMER_ENABLE_BARCODES=True) - def test_consume_barcode_file(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.pdf", - ) - - dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf") + dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") shutil.copy(test_file, dst) - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual(tasks.consume_file(dst), "File successfully split") + with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: + tasks.consume_file(dst) - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - def test_consume_barcode_tiff_file(self): - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.tiff", - ) - dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff") - shutil.copy(test_file, dst) + args, kwargs = mocked_call.call_args - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual(tasks.consume_file(dst), "File successfully split") - - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - @mock.patch("documents.consumer.Consumer.try_consume_file") - def test_consume_barcode_unsupported_jpg_file(self, m): - """ - This test assumes barcode and TIFF support are enabled and - the user uploads an unsupported image file (e.g. jpg) - - The function shouldn't try to scan for separating barcodes - and continue archiving the file as is. 
- """ - test_file = os.path.join( - self.SAMPLE_DIR, - "simple.jpg", - ) - dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") - shutil.copy(test_file, dst) - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - self.assertIn("Success", tasks.consume_file(dst)) - self.assertListEqual( - cm.output, - [ - "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg", - ], - ) - m.assert_called_once() - - args, kwargs = m.call_args - self.assertIsNone(kwargs["override_filename"]) - self.assertIsNone(kwargs["override_title"]) - self.assertIsNone(kwargs["override_correspondent_id"]) - self.assertIsNone(kwargs["override_document_type_id"]) - self.assertIsNone(kwargs["override_tag_ids"]) - - @override_settings( - CONSUMER_ENABLE_BARCODES=True, - CONSUMER_BARCODE_TIFF_SUPPORT=True, - ) - def test_consume_barcode_supported_no_extension_file(self): - """ - This test assumes barcode and TIFF support are enabled and - the user uploads a supported image file, but without extension - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "patch-code-t-middle.tiff", - ) - dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle") - shutil.copy(test_file, dst) - - with mock.patch("documents.tasks.async_to_sync"): - self.assertEqual(tasks.consume_file(dst), "File successfully split") - - def test_scan_file_for_separating_barcodes_password(self): - """ - GIVEN: - - Password protected PDF - - pikepdf based scanning - WHEN: - - File is scanned for barcode - THEN: - - Scanning handles the exception without exception - """ - test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - separator_page_numbers = barcodes.get_separating_barcodes( - doc_barcode_info.barcodes, - ) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertListEqual(separator_page_numbers, []) + self.assertEqual(kwargs["override_asn"], 123) def 
test_scan_file_for_asn_barcode(self): """ @@ -730,85 +1042,17 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, None) - def test_scan_file_for_asn_barcode_invalid(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is XYZXYZ - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN value is not used - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-invalid.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, None) - - @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") - def test_scan_file_for_asn_custom_prefix(self): - """ - GIVEN: - - PDF containing an ASN barcode with custom prefix - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-custom-prefix.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, 123) - - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) - def test_consume_barcode_file_asn_assignment(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - - The ASN is provided as the override value to the consumer - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-123.pdf", - ) - - dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") - shutil.copy(test_file, dst) - - with 
mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: - tasks.consume_file(dst) - - args, kwargs = mocked_call.call_args - - self.assertEqual(kwargs["override_asn"], 123) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): - + """ + GIVEN: + - ASN from barcode enabled + - Barcode contains too large an ASN value + WHEN: + - ASN from barcode checked for correctness + THEN: + - Exception is raised regarding size limits + """ src = os.path.join( os.path.dirname(__file__), "samples", diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index dc86de331..de368018f 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase): with tempfile.NamedTemporaryFile() as script: with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() - c.path = "path-to-file" + c.original_path = "path-to-file" + c.path = "/tmp/somewhere/path-to-file" c.run_pre_consume_script() m.assert_called_once() @@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase): args, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], "path-to-file") + self.assertDictContainsSubset( + { + "DOCUMENT_SOURCE_PATH": c.original_path, + "DOCUMENT_WORKING_PATH": c.path, + }, + environment, + ) + @mock.patch("documents.consumer.Consumer.log") def test_script_with_output(self, mocked_log): """ @@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase): m.assert_called_once() - args, kwargs = m.call_args + _, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], str(doc.pk)) @@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase): self.assertEqual(command[7], "my_bank") self.assertCountEqual(command[8].split(","), ["a", "b"]) + 
self.assertDictContainsSubset( + { + "DOCUMENT_ID": str(doc.pk), + "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", + "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", + "DOCUMENT_CORRESPONDENT": "my_bank", + "DOCUMENT_TAGS": "a,b", + }, + environment, + ) + def test_script_exit_non_zero(self): """ GIVEN: diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index c52c9be92..b2ec0d024 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -3,6 +3,7 @@ import shutil import tempfile from collections import namedtuple from contextlib import contextmanager +from unittest import mock from django.apps import apps from django.db import connection @@ -86,6 +87,30 @@ class DirectoriesMixin: remove_dirs(self.dirs) +class ConsumerProgressMixin: + def setUp(self) -> None: + self.send_progress_patcher = mock.patch( + "documents.consumer.Consumer._send_progress", + ) + self.send_progress_mock = self.send_progress_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.send_progress_patcher.stop() + + +class DocumentConsumeDelayMixin: + def setUp(self) -> None: + self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay") + self.consume_file_mock = self.consume_file_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.consume_file_patcher.stop() + + class TestMigrations(TransactionTestCase): @property def app(self): diff --git a/src/documents/views.py b/src/documents/views.py index 854f2da2b..6a719fe70 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -477,21 +477,14 @@ class DocumentViewSet( class SearchResultSerializer(DocumentSerializer): def to_representation(self, instance): doc = Document.objects.get(id=instance["id"]) - comments = "" - if hasattr(instance.results.q, "subqueries"): - commentTerm = instance.results.q.subqueries[0] - comments = ",".join( - [ - str(c.comment) - for c in 
Comment.objects.filter(document=instance["id"]) - if commentTerm.text in c.comment - ], - ) + comments = ",".join( + [str(c.comment) for c in Comment.objects.filter(document=instance["id"])], + ) r = super().to_representation(doc) r["__search_hit__"] = { "score": instance.score, "highlights": instance.highlights("content", text=doc.content), - "comment_highlights": instance.highlights("content", text=comments) + "comment_highlights": instance.highlights("comments", text=comments) if doc else None, "rank": instance.rank, diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index cc5d4e3c8..f1ee263aa 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser): "paperHeight": "11.7", "scale": "1.0", } + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" + try: response = requests.post( url, diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index e02267970..809a1192f 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -573,8 +573,8 @@ class TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": "0.1", "marginBottom": "0.1", @@ -583,6 +583,7 @@ class TestParser(TestCase): "paperWidth": "8.27", "paperHeight": "11.7", "scale": "1.0", + "pdfFormat": "PDF/A-2b", }, mock_post.call_args.kwargs["data"], ) @@ -663,8 +664,8 @@ class 
TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": "0.1", "marginBottom": "0.1", diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 1cfb1eecb..f34ecbbab 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser): ), } headers = {} + data = {} + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" try: - response = requests.post(url, files=files, headers=headers) + response = requests.post(url, files=files, headers=headers, data=data) response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError(