diff --git a/.github/scripts/cleanup-tags.py b/.github/scripts/cleanup-tags.py
index 9b299d048..590344a2c 100644
--- a/.github/scripts/cleanup-tags.py
+++ b/.github/scripts/cleanup-tags.py
@@ -15,6 +15,8 @@ from github import ContainerPackage
from github import GithubBranchApi
from github import GithubContainerRegistryApi
+import docker
+
logger = logging.getLogger("cleanup-tags")
@@ -151,12 +153,16 @@ class RegistryTagsCleaner:
for tag in sorted(self.tags_to_keep):
full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
logger.info(f"Checking manifest for {full_name}")
+ # TODO: It would be nice to use RegistryData from docker
+ # except the ID doesn't map to anything in the manifest
try:
proc = subprocess.run(
[
shutil.which("docker"),
- "manifest",
+ "buildx",
+ "imagetools",
"inspect",
+ "--raw",
full_name,
],
capture_output=True,
@@ -241,6 +247,65 @@ class RegistryTagsCleaner:
# By default, keep anything which is tagged
self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))
+ def check_tags_pull(self):
+ """
+ This method uses the Docker Python SDK to confirm all tags which were
+ kept still pull, for all platforms.
+
+ TODO: This is much slower (although more comprehensive). Maybe a Pool?
+ """
+ logger.info("Beginning confirmation step")
+ client = docker.from_env()
+ imgs = []
+ for tag in sorted(self.tags_to_keep):
+ repository = f"ghcr.io/{self.repo_owner}/{self.package_name}"
+ for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]:
+ # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64
+ # skip others in this case
+ if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0":
+ continue
+ # Skip beta and release candidate tags
+ elif "beta" in tag:
+ continue
+
+ # Build the platform name
+ if variant is not None:
+ platform = f"linux/{arch}/{variant}"
+ else:
+ platform = f"linux/{arch}"
+
+ try:
+ logger.info(f"Pulling {repository}:{tag} for {platform}")
+ image = client.images.pull(
+ repository=repository,
+ tag=tag,
+ platform=platform,
+ )
+ imgs.append(image)
+ except docker.errors.APIError as e:
+ logger.error(
+ f"Failed to pull {repository}:{tag}: {e}",
+ )
+
+ # Prevent out of space errors by removing after a few
+ # pulls
+ if len(imgs) > 50:
+ for image in imgs:
+ try:
+ client.images.remove(image.id)
+ except docker.errors.APIError as e:
+ err_str = str(e)
+ # Ignore attempts to remove images that are partly shared
+ # Ignore images which are somehow gone already
+ if (
+ "must be forced" not in err_str
+ and "No such image" not in err_str
+ ):
+ logger.error(
+ f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}",
+ )
+ imgs = []
+
class MainImageTagsCleaner(RegistryTagsCleaner):
def decide_what_tags_to_keep(self):
@@ -397,6 +462,10 @@ def _main():
# Clean images which are untagged
cleaner.clean_untagged(args.is_manifest)
+ # Verify remaining tags still pull
+ if args.is_manifest:
+ cleaner.check_tags_pull()
+
if __name__ == "__main__":
_main()
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ac0b89611..adf03d4bb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -212,12 +212,6 @@ jobs:
name: Prepare Docker Pipeline Data
if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v'))
runs-on: ubuntu-22.04
- # If the push triggered the installer library workflow, wait for it to
- # complete here. This ensures the required versions for the final
- # image have been built, while not waiting at all if the versions haven't changed
- concurrency:
- group: build-installer-library
- cancel-in-progress: false
needs:
- documentation
- tests-backend
diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml
index 6877e55bb..5992b4442 100644
--- a/.github/workflows/cleanup-tags.yml
+++ b/.github/workflows/cleanup-tags.yml
@@ -62,9 +62,9 @@ jobs:
with:
python-version: "3.10"
-
- name: Install httpx
+ name: Install Python libraries
run: |
- python -m pip install httpx
+ python -m pip install httpx docker
#
# Clean up primary package
#
@@ -81,13 +81,3 @@ jobs:
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}"
- #
- # Verify tags which are left still pull
- #
- -
- name: Check all tags still pull
- run: |
- ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }')
- echo "Pulling all tags of ${ghcr_name}"
- docker pull --quiet --all-tags ${ghcr_name}
- docker image list
diff --git a/.github/workflows/installer-library.yml b/.github/workflows/installer-library.yml
index 32aaf85ee..56064ad86 100644
--- a/.github/workflows/installer-library.yml
+++ b/.github/workflows/installer-library.yml
@@ -169,3 +169,142 @@ jobs:
PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }}
LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }}
+
+ commit-binary-files:
+ name: Store installers
+ needs:
+ - prepare-docker-build
+ - build-qpdf-debs
+ - build-jbig2enc
+ - build-psycopg2-wheel
+ - build-pikepdf-wheel
+ runs-on: ubuntu-22.04
+ steps:
+ -
+ name: Checkout
+ uses: actions/checkout@v3
+ with:
+ ref: binary-library
+ -
+ name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.9"
+ -
+ name: Install system dependencies
+ run: |
+ sudo apt-get update -qq
+ sudo apt-get install -qq --no-install-recommends tree
+ -
+ name: Extract qpdf files
+ run: |
+ version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }}
+ tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }}
+
+ docker pull --quiet ${tag}
+ docker create --name qpdf-extract ${tag}
+
+ mkdir --parents qpdf/${version}/amd64
+ docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version}
+
+ mkdir --parents qpdf/${version}/arm64
+ docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version}
+
+ mkdir --parents qpdf/${version}/armv7
+ docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version}
+ -
+ name: Extract psycopg2 files
+ run: |
+ version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }}
+ tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }}
+
+ docker pull --quiet --platform linux/amd64 ${tag}
+ docker create --platform linux/amd64 --name psycopg2-extract ${tag}
+ mkdir --parents psycopg2/${version}/amd64
+ docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64
+ mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64
+ rm -r psycopg2/${version}/amd64/wheels/
+ docker rm psycopg2-extract
+
+ docker pull --quiet --platform linux/arm64 ${tag}
+ docker create --platform linux/arm64 --name psycopg2-extract ${tag}
+ mkdir --parents psycopg2/${version}/arm64
+ docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64
+ mv psycopg2/${version}/arm64/wheels/* psycopg2/${version}/arm64
+ rm -r psycopg2/${version}/arm64/wheels/
+ docker rm psycopg2-extract
+
+ docker pull --quiet --platform linux/arm/v7 ${tag}
+ docker create --platform linux/arm/v7 --name psycopg2-extract ${tag}
+ mkdir --parents psycopg2/${version}/armv7
+ docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7
+ mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7
+ rm -r psycopg2/${version}/armv7/wheels/
+ docker rm psycopg2-extract
+ -
+ name: Extract pikepdf files
+ run: |
+ version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
+ tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }}
+
+ docker pull --quiet --platform linux/amd64 ${tag}
+ docker create --platform linux/amd64 --name pikepdf-extract ${tag}
+ mkdir --parents pikepdf/${version}/amd64
+ docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64
+ mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64
+ rm -r pikepdf/${version}/amd64/wheels/
+ docker rm pikepdf-extract
+
+ docker pull --quiet --platform linux/arm64 ${tag}
+ docker create --platform linux/arm64 --name pikepdf-extract ${tag}
+ mkdir --parents pikepdf/${version}/arm64
+ docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64
+ mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64
+ rm -r pikepdf/${version}/arm64/wheels/
+ docker rm pikepdf-extract
+
+ docker pull --quiet --platform linux/arm/v7 ${tag}
+ docker create --platform linux/arm/v7 --name pikepdf-extract ${tag}
+ mkdir --parents pikepdf/${version}/armv7
+ docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7
+ mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7
+ rm -r pikepdf/${version}/armv7/wheels/
+ docker rm pikepdf-extract
+ -
+ name: Extract jbig2enc files
+ run: |
+ version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }}
+ tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag }}
+
+ docker pull --quiet --platform linux/amd64 ${tag}
+ docker create --platform linux/amd64 --name jbig2enc-extract ${tag}
+ mkdir --parents jbig2enc/${version}/amd64
+ docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/
+ mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/
+ docker rm jbig2enc-extract
+
+ docker pull --quiet --platform linux/arm64 ${tag}
+ docker create --platform linux/arm64 --name jbig2enc-extract ${tag}
+ mkdir --parents jbig2enc/${version}/arm64
+ docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64
+ mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/
+ docker rm jbig2enc-extract
+
+ docker pull --quiet --platform linux/arm/v7 ${tag}
+ docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag}
+ mkdir --parents jbig2enc/${version}/armv7
+ docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7
+ mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/
+ docker rm jbig2enc-extract
+ -
+ name: Show file structure
+ run: |
+ tree .
+ -
+ name: Commit files
+ run: |
+ git config --global user.name "github-actions"
+ git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
+ git add pikepdf/ qpdf/ psycopg2/ jbig2enc/
+ git commit -m "Updating installer packages" || true
+ git push origin || true
diff --git a/Dockerfile b/Dockerfile
index 9522728d9..6588802bb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,5 @@
# syntax=docker/dockerfile:1.4
-# Pull the installer images from the library
-# These are all built previously
-# They provide either a .deb or .whl
-
-ARG JBIG2ENC_VERSION
-ARG QPDF_VERSION
-ARG PIKEPDF_VERSION
-ARG PSYCOPG2_VERSION
-
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder
-FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder
-
FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend
# This stage compiles the frontend
@@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n
LABEL org.opencontainers.image.licenses="GPL-3.0-only"
ARG DEBIAN_FRONTEND=noninteractive
-# Buildx provided
+# Buildx provided, must be defined to use though
ARG TARGETARCH
ARG TARGETVARIANT
# Workflow provided
+ARG JBIG2ENC_VERSION
ARG QPDF_VERSION
+ARG PIKEPDF_VERSION
+ARG PSYCOPG2_VERSION
#
# Begin installation and configuration
# Order the steps below from least often changed to most
#
-# copy jbig2enc
-# Basically will never change again
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/
-
# Packages need for running
ARG RUNTIME_PACKAGES="\
# Python
@@ -198,19 +181,29 @@ RUN set -eux \
# Install the built packages from the installer library images
# Use mounts to avoid copying installer files into the image
# These change sometimes
-RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \
- --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \
- --mount=type=bind,from=pikepdf-builder,target=/pikepdf \
- set -eux \
+RUN set -eux \
+ && echo "Getting binaries" \
+ && mkdir paperless-ngx \
+ && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \
+ && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \
+ && cd paperless-ngx \
+    # Pinning a specific revision ensures we know exactly what was installed
+    # and ensures the build cache breaks when it changes
+ && echo "Installing jbig2enc" \
+ && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \
+ && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \
&& echo "Installing qpdf" \
- && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
- && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
+ && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
+ && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
&& echo "Installing pikepdf and dependencies" \
- && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \
+ && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \
&& python3 -m pip list \
&& echo "Installing psycopg2" \
- && python3 -m pip install --no-cache-dir /psycopg2/usr/src/wheels/psycopg2*.whl \
- && python3 -m pip list
+ && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \
+ && python3 -m pip list \
+ && echo "Cleaning up image layer" \
+ && cd ../ \
+ && rm -rf paperless-ngx
WORKDIR /usr/src/paperless/src/
diff --git a/docker-builders/Dockerfile.jbig2enc b/docker-builders/Dockerfile.jbig2enc
index 90318084f..388bdd1f7 100644
--- a/docker-builders/Dockerfile.jbig2enc
+++ b/docker-builders/Dockerfile.jbig2enc
@@ -29,7 +29,20 @@ RUN set -eux \
&& ./autogen.sh \
&& ./configure \
&& make \
+ && echo "Gathering package data" \
+ && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \
&& echo "Cleaning up image" \
&& apt-get -y purge ${BUILD_PACKAGES} \
&& apt-get -y autoremove --purge \
- && rm -rf /var/lib/apt/lists/*
+ && rm -rf /var/lib/apt/lists/* \
+ && echo "Moving files around" \
+ && mkdir build \
+ # Unlink a symlink that causes problems
+ && unlink ./src/.libs/libjbig2enc.la \
+ # Move what the link pointed to
+ && mv ./src/libjbig2enc.la ./build/ \
+ # Move the shared library .so files
+ && mv ./src/.libs/libjbig2enc* ./build/ \
+ # And move the cli binary
+ && mv ./src/jbig2 ./build/ \
+ && mv ./pkg-list.txt ./build/
diff --git a/docker-builders/Dockerfile.pikepdf b/docker-builders/Dockerfile.pikepdf
index c4d1ee1dc..e4181c538 100644
--- a/docker-builders/Dockerfile.pikepdf
+++ b/docker-builders/Dockerfile.pikepdf
@@ -7,12 +7,17 @@
# Default to pulling from the main repo registry when manually building
ARG REPO="paperless-ngx/paperless-ngx"
+# This does nothing, except provide a name for a copy below
ARG QPDF_VERSION
FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder
-# This does nothing, except provide a name for a copy below
-
-FROM python:3.9-slim-bullseye as main
+#
+# Stage: builder
+# Purpose:
+# - Build the pikepdf wheel
+# - Build any dependent wheels which can't be found
+#
+FROM python:3.9-slim-bullseye as builder
LABEL org.opencontainers.image.description="A intermediate image with pikepdf wheel built"
@@ -100,3 +105,14 @@ RUN set -eux \
&& apt-get -y purge ${BUILD_PACKAGES} \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/*
+
+#
+# Stage: package
+# Purpose: Holds the compiled .whl files in a tiny image to pull
+#
+FROM alpine:3.17 as package
+
+WORKDIR /usr/src/wheels/
+
+COPY --from=builder /usr/src/wheels/*.whl ./
+COPY --from=builder /usr/src/wheels/pkg-list.txt ./
diff --git a/docker-builders/Dockerfile.psycopg2 b/docker-builders/Dockerfile.psycopg2
index 8fcf5264b..e3f182435 100644
--- a/docker-builders/Dockerfile.psycopg2
+++ b/docker-builders/Dockerfile.psycopg2
@@ -2,7 +2,12 @@
# Inputs:
# - PSYCOPG2_VERSION - Version to build
-FROM python:3.9-slim-bullseye as main
+#
+# Stage: builder
+# Purpose:
+# - Build the psycopg2 wheel
+#
+FROM python:3.9-slim-bullseye as builder
LABEL org.opencontainers.image.description="A intermediate image with psycopg2 wheel built"
@@ -48,3 +53,14 @@ RUN set -eux \
&& apt-get -y purge ${BUILD_PACKAGES} \
&& apt-get -y autoremove --purge \
&& rm -rf /var/lib/apt/lists/*
+
+#
+# Stage: package
+# Purpose: Holds the compiled .whl files in a tiny image to pull
+#
+FROM alpine:3.17 as package
+
+WORKDIR /usr/src/wheels/
+
+COPY --from=builder /usr/src/wheels/*.whl ./
+COPY --from=builder /usr/src/wheels/pkg-list.txt ./
diff --git a/docker-builders/README.md b/docker-builders/README.md
new file mode 100644
index 000000000..6202719c6
--- /dev/null
+++ b/docker-builders/README.md
@@ -0,0 +1,57 @@
+# Installer Library
+
+This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image.
+
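+In CI, the produced artifacts are copied out of these images by the `commit-binary-files` job without ever running them. A minimal sketch of that pattern is below; the tag and paths are illustrative, the real values come from `.build-config.json` and the workflow outputs.
+
+```bash
+# Pull a builder image and copy its build output out of a stopped container
+tag="ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:11.2.0"  # illustrative tag
+docker pull --quiet "${tag}"
+docker create --name builder-extract "${tag}"
+docker cp builder-extract:/usr/src/qpdf/ ./qpdf/
+docker rm builder-extract
+```
+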
+## [jbig2enc](https://github.com/agl/jbig2enc)
+
+### Why
+
+JBIG2 is an image coding format which can achieve better compression of images for PDFs.
+
+### What
+
+The Docker image builds a shared library file and utility, which is copied into the correct location in the final image.
+
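+In the main `Dockerfile`, the extracted files are installed roughly like this sketch of the relevant `RUN` step (the variables are build arguments supplied by the workflow):
+
+```bash
+# Install the prebuilt jbig2enc binary and shared library into the final image
+cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/
+cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/
+```
+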
+### Updating
+
+1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf)
+2. Update `.build-config.json` to the given version
+3. If the Debian specific version has incremented, update `Dockerfile.qpdf`
+
+See Also:
+
+- [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html)
+
+## [psycopg2](https://www.psycopg.org/)
+
+### Why
+
+The pre-built wheels of psycopg2 are built on Debian 9, which provides a quite old version of libpq-dev. This causes issues with authentication methods.
+
+### What
+
+The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`.
+
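+The main image then installs the wheel from the versioned path, roughly as in this sketch of the corresponding `RUN` step (the variables are build arguments supplied by the workflow):
+
+```bash
+# Install the prebuilt psycopg2 wheel into the final image
+python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl
+```
+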
+See Also:
+
+- [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266)
+
+## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html)
+
+### Why
+
+qpdf and its library provide tools to read, manipulate and fix up PDFs. Version 11 is also required by `pikepdf` 6+, and Debian 9 does not provide anything above version 10.
+
+### What
+
+The Docker image cross-compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/`.
+
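+The main image installs these .deb files with `apt-get`, roughly as in this sketch of the corresponding `RUN` step (the variables are build arguments supplied by the workflow):
+
+```bash
+# Install the cross-compiled qpdf packages into the final image
+apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb
+apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb
+```
+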
+## [pikepdf](https://pikepdf.readthedocs.io/en/latest/)
+
+### Why
+
+Required by OCRMyPDF, this is a general-purpose library for PDF manipulation in Python via the qpdf libraries.
+
+### What
+
+The built wheels are placed into `/usr/src/wheels/`.
diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh
index dad49774b..af2bfe2a7 100755
--- a/docker/docker-prepare.sh
+++ b/docker/docker-prepare.sh
@@ -80,7 +80,7 @@ django_checks() {
search_index() {
- local -r index_version=1
+ local -r index_version=2
local -r index_version_file=${DATA_DIR}/.index_version
if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index 61b1c072e..9a1abcfff 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption
folder, but before any processing of the document is performed. This
script can access the following relevant environment variables set:
-- `DOCUMENT_SOURCE_PATH`
+| Environment Variable | Description |
+| ----------------------- | ------------------------------------------------------------ |
+| `DOCUMENT_SOURCE_PATH` | Original path of the consumed document |
+| `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on |
+
+!!! note
+
+    Pre-consume scripts which modify the document should only change
+    the file at `DOCUMENT_WORKING_PATH`, or a second consume task may
+    be triggered, leading to failures as two tasks work on the same
+    document path.
A simple but common example for this would be creating a simple script
like this:
@@ -130,7 +140,7 @@ like this:
```bash
#!/usr/bin/env bash
-pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH}
+pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH}
```
`/etc/paperless.conf`
@@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and
has moved it into paperless. It receives the following environment
variables:
-- `DOCUMENT_ID`
-- `DOCUMENT_FILE_NAME`
-- `DOCUMENT_CREATED`
-- `DOCUMENT_MODIFIED`
-- `DOCUMENT_ADDED`
-- `DOCUMENT_SOURCE_PATH`
-- `DOCUMENT_ARCHIVE_PATH`
-- `DOCUMENT_THUMBNAIL_PATH`
-- `DOCUMENT_DOWNLOAD_URL`
-- `DOCUMENT_THUMBNAIL_URL`
-- `DOCUMENT_CORRESPONDENT`
-- `DOCUMENT_TAGS`
-- `DOCUMENT_ORIGINAL_FILENAME`
+| Environment Variable | Description |
+| ---------------------------- | --------------------------------------------- |
+| `DOCUMENT_ID` | Database primary key of the document |
+| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths |
+| `DOCUMENT_CREATED`           | Date & time when the document was created      |
+| `DOCUMENT_MODIFIED` | Date & time when document was last modified |
+| `DOCUMENT_ADDED` | Date & time when document was added |
+| `DOCUMENT_SOURCE_PATH` | Path to the original document file |
+| `DOCUMENT_ARCHIVE_PATH`      | Path to the generated archive file (if any)    |
+| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail |
+| `DOCUMENT_DOWNLOAD_URL` | URL for document download |
+| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail |
+| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) |
+| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) |
+| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document |
-The script can be in any language, but for a simple shell script
-example, you can take a look at
-[post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh)
-in this project.
+The script can be in any language. A simple shell script example:
-The post consumption script cannot cancel the consumption process.
+```bash title="post-consumption-example"
+--8<-- "./scripts/post-consumption-example.sh"
+```
+
+!!! note
+
+ The post consumption script cannot cancel the consumption process.
+
+!!! warning
+
+    The post consumption script should not modify the document files
+    directly.
The script's stdout and stderr will be logged line by line to the
webserver log, along with the exit code of the script.
diff --git a/docs/changelog.md b/docs/changelog.md
index 0e5a6fcba..5a9371781 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -2,6 +2,9 @@
## paperless-ngx 1.12.1
+_Note: Version 1.12.x introduced searching of comments. This works for comments added after the upgrade, but a reindex of the search index is required in order to search
+older comments. The Docker image will perform this reindex automatically; bare metal installations will have to perform it manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._
+
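+For bare metal installations, the reindex is a single management command, run from the `src` directory of the install (inside the virtual environment, if one is used), as described in the linked documentation:
+
+```bash
+python3 manage.py document_index reindex
+```
+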
### Bug Fixes
- Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513))
diff --git a/mkdocs.yml b/mkdocs.yml
index 0d56abc68..1e692d68b 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -41,6 +41,7 @@ markdown_extensions:
anchor_linenums: true
- pymdownx.superfences
- pymdownx.inlinehilite
+ - pymdownx.snippets
strict: true
nav:
- index.md
diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf
index 21ac728b3..edd742d45 100644
--- a/src-ui/messages.xlf
+++ b/src-ui/messages.xlf
@@ -5,242 +5,242 @@
Close
- node_modules/src/alert/alert.ts
- 47,48
-
-
-
- Slide of
-
- node_modules/src/carousel/carousel.ts
- 178,186
-
- Currently selected slide number read by screen reader
-
-
- Previous
-
- node_modules/src/carousel/carousel.ts
- 213,215
-
-
-
- Next
-
- node_modules/src/carousel/carousel.ts
- 236
-
-
-
- Select month
-
- node_modules/src/datepicker/datepicker-navigation-select.ts
- 50,51
-
-
- node_modules/src/datepicker/datepicker-navigation-select.ts
- 50,51
-
-
-
- Select year
-
- node_modules/src/datepicker/datepicker-navigation-select.ts
- 50,51
-
-
- node_modules/src/datepicker/datepicker-navigation-select.ts
- 50,51
-
-
-
- Previous month
-
- node_modules/src/datepicker/datepicker-navigation.ts
- 60,63
-
-
- node_modules/src/datepicker/datepicker-navigation.ts
- 60,63
-
-
-
- Next month
-
- node_modules/src/datepicker/datepicker-navigation.ts
- 60,63
-
-
- node_modules/src/datepicker/datepicker-navigation.ts
- 60,63
-
-
-
- ««
-
- node_modules/src/pagination/pagination.ts
- 269,270
-
-
-
- «
-
- node_modules/src/pagination/pagination.ts
- 269,270
-
-
-
- »
-
- node_modules/src/pagination/pagination.ts
- 269,270
-
-
-
- »»
-
- node_modules/src/pagination/pagination.ts
- 269,270
-
-
-
- First
-
- node_modules/src/pagination/pagination.ts
- 269,271
-
-
-
- Previous
-
- node_modules/src/pagination/pagination.ts
- 269,271
-
-
-
- Next
-
- node_modules/src/pagination/pagination.ts
- 269,271
-
-
-
- Last
-
- node_modules/src/pagination/pagination.ts
- 269,271
-
-
-
-
-
- node_modules/src/progressbar/progressbar.ts
- 30,33
+ node_modules/src/ngb-config.ts
+ 13HH
- node_modules/src/timepicker/timepicker.ts
- 230,231
-
-
-
- Hours
-
- node_modules/src/timepicker/timepicker.ts
- 255,258
-
-
-
- MM
-
- node_modules/src/timepicker/timepicker.ts
- 280,282
-
-
-
- Minutes
-
- node_modules/src/timepicker/timepicker.ts
- 298,299
-
-
-
- Increment hours
-
- node_modules/src/timepicker/timepicker.ts
- 328,329
-
-
-
- Decrement hours
-
- node_modules/src/timepicker/timepicker.ts
- 350,356
-
-
-
- Increment minutes
-
- node_modules/src/timepicker/timepicker.ts
- 383,384
-
-
-
- Decrement minutes
-
- node_modules/src/timepicker/timepicker.ts
- 412,416
-
-
-
- SS
-
- node_modules/src/timepicker/timepicker.ts
- 429
-
-
-
- Seconds
-
- node_modules/src/timepicker/timepicker.ts
- 429
-
-
-
- Increment seconds
-
- node_modules/src/timepicker/timepicker.ts
- 429
-
-
-
- Decrement seconds
-
- node_modules/src/timepicker/timepicker.ts
- 429
-
-
-
-
-
- node_modules/src/timepicker/timepicker.ts
- 429
-
-
-
-
-
- node_modules/src/timepicker/timepicker.ts
- 429
+ node_modules/src/ngb-config.ts
+ 13Close
- node_modules/src/toast/toast.ts
- 74,75
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ ««
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Select month
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Previous month
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Slide of
+
+ node_modules/src/ngb-config.ts
+ 13
+
+ Currently selected slide number read by screen reader
+
+
+ Hours
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ «
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Previous
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ MM
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ »
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Select year
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Next month
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Next
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Minutes
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ »»
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Increment hours
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ First
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Previous
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Decrement hours
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Next
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Increment minutes
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Last
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Decrement minutes
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ SS
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Seconds
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Increment seconds
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+ Decrement seconds
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+
+
+ node_modules/src/ngb-config.ts
+ 13
+
+
+
+
+
+ node_modules/src/ngb-config.ts
+ 13
@@ -967,7 +967,7 @@
src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 37
+ 38src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html
@@ -1006,7 +1006,7 @@
src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 38
+ 39src/app/components/common/edit-dialog/storage-path-edit-dialog/storage-path-edit-dialog.component.html
@@ -1208,102 +1208,109 @@
15
+
+ Rule order
+
+ src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
+ 16
+
+ Paperless will only process mails that match all of the filters specified below.src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 18
+ 19Filter fromsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 19
+ 20Filter subjectsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 20
+ 21Filter bodysrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 21
+ 22Filter attachment filenamesrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 22
+ 23Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive.src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 22
+ 23Actionsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 25
+ 26Action is only performed when documents are consumed from the mail. Mails without attachments remain entirely untouched.src/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 25
+ 26Action parametersrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 26
+ 27Assign title fromsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 27
+ 28Assign document typesrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 29
+ 30Assign correspondent fromsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 30
+ 31Assign correspondentsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 31
+ 32Errorsrc/app/components/common/edit-dialog/mail-rule-edit-dialog/mail-rule-edit-dialog.component.html
- 36
+ 37src/app/services/toast.service.ts
@@ -1965,7 +1972,7 @@
of src/app/components/document-detail/document-detail.component.html
- 5
+ 5,6
@@ -1980,7 +1987,7 @@
src/app/components/document-list/document-card-large/document-card-large.component.html
- 58
+ 64src/app/components/document-list/document-card-small/document-card-small.component.html
@@ -2013,7 +2020,7 @@
src/app/components/document-list/document-card-large/document-card-large.component.html
- 38
+ 44
@@ -2262,7 +2269,7 @@
Confirm deletesrc/app/components/document-detail/document-detail.component.ts
- 442
+ 449src/app/components/manage/management-list/management-list.component.ts
@@ -2273,35 +2280,35 @@
Do you really want to delete document ""?src/app/components/document-detail/document-detail.component.ts
- 443
+ 450The files for this document will be deleted permanently. This operation cannot be undone.src/app/components/document-detail/document-detail.component.ts
- 444
+ 451Delete documentsrc/app/components/document-detail/document-detail.component.ts
- 446
+ 453Error deleting document: src/app/components/document-detail/document-detail.component.ts
- 462
+ 469Redo OCR confirmsrc/app/components/document-detail/document-detail.component.ts
- 482
+ 489src/app/components/document-list/bulk-editor/bulk-editor.component.ts
@@ -2312,14 +2319,14 @@
This operation will permanently redo OCR for this document.src/app/components/document-detail/document-detail.component.ts
- 483
+ 490This operation cannot be undone.src/app/components/document-detail/document-detail.component.ts
- 484
+ 491src/app/components/document-list/bulk-editor/bulk-editor.component.ts
@@ -2342,7 +2349,7 @@
Proceedsrc/app/components/document-detail/document-detail.component.ts
- 486
+ 493src/app/components/document-list/bulk-editor/bulk-editor.component.ts
@@ -2361,7 +2368,7 @@
Redo OCR operation will begin in the background. Close and re-open or reload this document after the operation has completed to see new content.src/app/components/document-detail/document-detail.component.ts
- 494
+ 501
@@ -2370,7 +2377,7 @@
)"/>
src/app/components/document-detail/document-detail.component.ts
- 505,507
+ 512,514
@@ -2701,7 +2708,7 @@
Editsrc/app/components/document-list/document-card-large/document-card-large.component.html
- 43
+ 49src/app/components/document-list/document-card-small/document-card-small.component.html
@@ -2752,14 +2759,14 @@
Viewsrc/app/components/document-list/document-card-large/document-card-large.component.html
- 50
+ 56Filter by document typesrc/app/components/document-list/document-card-large/document-card-large.component.html
- 63
+ 69src/app/components/document-list/document-list.component.html
@@ -2770,7 +2777,7 @@
Filter by storage pathsrc/app/components/document-list/document-card-large/document-card-large.component.html
- 70
+ 76src/app/components/document-list/document-list.component.html
@@ -2781,40 +2788,40 @@
Created: src/app/components/document-list/document-card-large/document-card-large.component.html
- 85
+ 91,92src/app/components/document-list/document-card-small/document-card-small.component.html
- 48
+ 48,49Added: src/app/components/document-list/document-card-large/document-card-large.component.html
- 86
+ 92,93src/app/components/document-list/document-card-small/document-card-small.component.html
- 49
+ 49,50Modified: src/app/components/document-list/document-card-large/document-card-large.component.html
- 87
+ 93,94src/app/components/document-list/document-card-small/document-card-small.component.html
- 50
+ 50,51Score:src/app/components/document-list/document-card-large/document-card-large.component.html
- 98
+ 104
@@ -2926,7 +2933,7 @@
ASNsrc/app/components/document-list/document-list.component.html
- 127
+ 128,127src/app/components/document-list/filter-editor/filter-editor.component.ts
@@ -3420,21 +3427,21 @@
Short: src/app/components/manage/settings/settings.component.html
- 56
+ 56,57Medium: src/app/components/manage/settings/settings.component.html
- 60
+ 60,61Long: src/app/components/manage/settings/settings.component.html
- 64
+ 64,65
@@ -3532,14 +3539,14 @@
Update checking works by pinging the the public Github API for the latest release to determine whether a new version is available. Actual updating of the app must still be performed manually. src/app/components/manage/settings/settings.component.html
- 139,142
+ 140,142
- No tracking data is collected by the app in any way.
+ No tracking data is collected by the app in any way.src/app/components/manage/settings/settings.component.html
- 144
+ 144,146
@@ -3549,8 +3556,8 @@
146
-
- Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release.
+
+ Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release.src/app/components/manage/settings/settings.component.html146
@@ -3658,7 +3665,7 @@
Mailsrc/app/components/manage/settings/settings.component.html
- 231
+ 232,231
@@ -4191,18 +4198,25 @@
15
+
+ Document with ASN already exists.
+
+ src/app/services/consumer-status.service.ts
+ 16
+
+ File not found.src/app/services/consumer-status.service.ts
- 16
+ 17Pre-consume script does not exist.src/app/services/consumer-status.service.ts
- 17
+ 18Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation
@@ -4210,7 +4224,7 @@
Error while executing pre-consume script.src/app/services/consumer-status.service.ts
- 18
+ 19Pre-Consume is a term that appears like that in the documentation as well and does not need a specific translation
@@ -4218,7 +4232,7 @@
Post-consume script does not exist.src/app/services/consumer-status.service.ts
- 19
+ 20Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation
@@ -4226,7 +4240,7 @@
Error while executing post-consume script.src/app/services/consumer-status.service.ts
- 20
+ 21Post-Consume is a term that appears like that in the documentation as well and does not need a specific translation
@@ -4234,49 +4248,49 @@
Received new file.src/app/services/consumer-status.service.ts
- 21
+ 22File type not supported.src/app/services/consumer-status.service.ts
- 22
+ 23Processing document...src/app/services/consumer-status.service.ts
- 23
+ 24Generating thumbnail...src/app/services/consumer-status.service.ts
- 24
+ 25Retrieving date from document...src/app/services/consumer-status.service.ts
- 25
+ 26Saving document...src/app/services/consumer-status.service.ts
- 26
+ 27Finished.src/app/services/consumer-status.service.ts
- 27
+ 28
@@ -4336,165 +4350,172 @@
145
+
+ Arabic
+
+ src/app/services/settings.service.ts
+ 151
+
+ Belarusiansrc/app/services/settings.service.ts
- 151
+ 157Czechsrc/app/services/settings.service.ts
- 157
+ 163Danishsrc/app/services/settings.service.ts
- 163
+ 169Germansrc/app/services/settings.service.ts
- 169
+ 175English (GB)src/app/services/settings.service.ts
- 175
+ 181Spanishsrc/app/services/settings.service.ts
- 181
+ 187Frenchsrc/app/services/settings.service.ts
- 187
+ 193Italiansrc/app/services/settings.service.ts
- 193
+ 199Luxembourgishsrc/app/services/settings.service.ts
- 199
+ 205Dutchsrc/app/services/settings.service.ts
- 205
+ 211Polishsrc/app/services/settings.service.ts
- 211
+ 217Portuguese (Brazil)src/app/services/settings.service.ts
- 217
+ 223Portuguesesrc/app/services/settings.service.ts
- 223
+ 229Romaniansrc/app/services/settings.service.ts
- 229
+ 235Russiansrc/app/services/settings.service.ts
- 235
+ 241Sloveniansrc/app/services/settings.service.ts
- 241
+ 247Serbiansrc/app/services/settings.service.ts
- 247
+ 253Swedishsrc/app/services/settings.service.ts
- 253
+ 259Turkishsrc/app/services/settings.service.ts
- 259
+ 265Chinese Simplifiedsrc/app/services/settings.service.ts
- 265
+ 271ISO 8601src/app/services/settings.service.ts
- 282
+ 288Successfully completed one-time migratration of settings to the database!src/app/services/settings.service.ts
- 393
+ 399Unable to migrate settings to the database, please try saving manually.src/app/services/settings.service.ts
- 394
+ 400
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index f99f547e6..19f85398b 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -204,6 +204,10 @@ export class DocumentDetailComponent
)
.subscribe({
next: (titleValue) => {
+          // In the rare case where the field changed just after the debounced event was fired,
+          // we don't want to overwrite what's actually in the text field, so just return
+ if (titleValue !== this.titleInput.value) return
+
this.title = titleValue
this.documentForm.patchValue({ title: titleValue })
},
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
index c114a2d6e..b18524e38 100644
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@@ -26,11 +26,11 @@
-
+
-
+
{{contentTrimmed}}
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
index b43187879..5d24042b9 100644
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts
@@ -70,6 +70,22 @@ export class DocumentCardLargeComponent {
}
}
+ get searchCommentHighlights() {
+ let highlights = []
+ if (
+ this.document['__search_hit__'] &&
+ this.document['__search_hit__'].comment_highlights
+ ) {
+ // only show comments with a match
+ highlights = (
+ this.document['__search_hit__'].comment_highlights as string
+ )
+ .split(',')
+ .filter((higlight) => higlight.includes('No tracking data is collected by the app in any way.
-
+
diff --git a/src-ui/src/environments/environment.prod.ts b/src-ui/src/environments/environment.prod.ts
index 832f69378..16cbe7df6 100644
--- a/src-ui/src/environments/environment.prod.ts
+++ b/src-ui/src/environments/environment.prod.ts
@@ -5,7 +5,7 @@ export const environment = {
apiBaseUrl: document.baseURI + 'api/',
apiVersion: '2',
appTitle: 'Paperless-ngx',
- version: '1.12.1',
+ version: '1.12.1-dev',
webSocketHost: window.location.host,
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
webSocketBaseUrl: base_url.pathname + 'ws/',
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 82b8afecc..6e3ecfe05 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -4,7 +4,6 @@ import shutil
import tempfile
from dataclasses import dataclass
from functools import lru_cache
-from math import ceil
from pathlib import Path
from typing import List
from typing import Optional
@@ -12,10 +11,9 @@ from typing import Optional
import magic
from django.conf import settings
from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
-from pikepdf import PasswordError
from pikepdf import Pdf
-from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -154,52 +152,15 @@ def scan_file_for_barcodes(
(page_number, barcode_text) tuples
"""
- def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
- detected_barcodes = []
- with Pdf.open(pdf_filepath) as pdf:
- for page_num, page in enumerate(pdf.pages):
- for image_key in page.images:
- pdfimage = PdfImage(page.images[image_key])
-
- # This type is known to have issues:
- # https://github.com/pikepdf/pikepdf/issues/401
- if "/CCITTFaxDecode" in pdfimage.filters:
- raise BarcodeImageFormatError(
- "Unable to decode CCITTFaxDecode images",
- )
-
- # Not all images can be transcoded to a PIL image, which
- # is what pyzbar expects to receive, so this may
- # raise an exception, triggering fallback
- pillow_img = pdfimage.as_pil_image()
-
- # Scale the image down
- # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
- # TLDR: zbar has issues with larger images
- width, height = pillow_img.size
- if width > 1024:
- scaler = ceil(width / 1024)
- new_width = int(width / scaler)
- new_height = int(height / scaler)
- pillow_img = pillow_img.resize((new_width, new_height))
-
- width, height = pillow_img.size
- if height > 2048:
- scaler = ceil(height / 2048)
- new_width = int(width / scaler)
- new_height = int(height / scaler)
- pillow_img = pillow_img.resize((new_width, new_height))
-
- for barcode_value in barcode_reader(pillow_img):
- detected_barcodes.append(Barcode(page_num, barcode_value))
-
- return detected_barcodes
-
def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
detected_barcodes = []
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
- pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
+ pages_from_path = convert_from_path(
+ pdf_filepath,
+ dpi=300,
+ output_folder=path,
+ )
for current_page_number, page in enumerate(pages_from_path):
for barcode_value in barcode_reader(page):
detected_barcodes.append(
@@ -219,27 +180,19 @@ def scan_file_for_barcodes(
# Always try pikepdf first, it's usually fine, faster and
# uses less memory
try:
- barcodes = _pikepdf_barcode_scan(pdf_filepath)
+ barcodes = _pdf2image_barcode_scan(pdf_filepath)
# Password protected files can't be checked
- except PasswordError as e:
+ # This is the exception raised for those
+ except PDFPageCountError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
- # Handle pikepdf related image decoding issues with a fallback to page
- # by page conversion to images in a temporary directory
- except Exception as e:
+ # This file is really borked, allow the consumption to continue
+ # but it may fail further on
+ except Exception as e: # pragma: no cover
logger.warning(
- f"Falling back to pdf2image because: {e}",
+ f"Exception during barcode scanning: {e}",
)
- try:
- barcodes = _pdf2image_barcode_scan(pdf_filepath)
- # This file is really borked, allow the consumption to continue
- # but it may fail further on
- except Exception as e: # pragma: no cover
- logger.warning(
- f"Exception during barcode scanning: {e}",
- )
-
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index bc344abb9..8c80304d3 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,7 +1,10 @@
import datetime
import hashlib
import os
+import shutil
+import tempfile
import uuid
+from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import Optional
@@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
def __init__(self):
super().__init__()
- self.path = None
+ self.path: Optional[Path] = None
+ self.original_path: Optional[Path] = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
@@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
- filepath_arg = os.path.normpath(self.path)
+ working_file_path = str(self.path)
+ original_file_path = str(self.original_path)
script_env = os.environ.copy()
- script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
+ script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
+ script_env["DOCUMENT_WORKING_PATH"] = working_file_path
try:
completed_proc = run(
args=[
settings.PRE_CONSUME_SCRIPT,
- filepath_arg,
+ original_file_path,
],
env=script_env,
capture_output=True,
@@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
exception=e,
)
- def run_post_consume_script(self, document):
+ def run_post_consume_script(self, document: Document):
if not settings.POST_CONSUME_SCRIPT:
return
@@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
Return the document object if it was successfully created.
"""
- self.path = path
- self.filename = override_filename or os.path.basename(path)
+ self.path = Path(path).resolve()
+ self.filename = override_filename or self.path.name
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
@@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
self.log("info", f"Consuming {self.filename}")
+ # For the actual work, copy the file into a tempdir
+ self.original_path = self.path
+ tempdir = tempfile.TemporaryDirectory(
+ prefix="paperless-ngx",
+ dir=settings.SCRATCH_DIR,
+ )
+ self.path = Path(tempdir.name) / Path(self.filename)
+ shutil.copy(self.original_path, self.path)
+
# Determine the parser class.
mime_type = magic.from_file(self.path, mime=True)
@@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}")
os.unlink(self.path)
+ self.original_path.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
- os.path.dirname(self.path),
- "._" + os.path.basename(self.path),
+ os.path.dirname(self.original_path),
+ "._" + os.path.basename(self.original_path),
)
if os.path.isfile(shadow_file):
@@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
)
finally:
document_parser.cleanup()
+ tempdir.cleanup()
self.run_post_consume_script(document)
diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png
similarity index 100%
rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png
rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png
diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png
similarity index 100%
rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png
rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png
diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py
index 7beeee288..4f7f1278a 100644
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@@ -1,9 +1,7 @@
import os
import shutil
-import tempfile
from unittest import mock
-import pikepdf
from django.conf import settings
from django.test import override_settings
from django.test import TestCase
@@ -23,13 +21,29 @@ class TestBarcode(DirectoriesMixin, TestCase):
BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
- def test_barcode_reader(self):
+ def test_barcode_reader_png(self):
+ """
+ GIVEN:
+ - PNG file with separator barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png")
img = Image.open(test_file)
- separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
+ separator_barcode = settings.CONSUMER_BARCODE_STRING
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
- def test_barcode_reader2(self):
+ def test_barcode_reader_pbm(self):
+ """
+ GIVEN:
+ - Netpbm bitmap file with separator barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pbm",
@@ -38,25 +52,49 @@ class TestBarcode(DirectoriesMixin, TestCase):
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
- def test_barcode_reader_distorsion(self):
+ def test_barcode_reader_distortion_scratchy(self):
+ """
+ GIVEN:
+ - Image containing high noise
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
- "barcode-39-PATCHT-distorsion.png",
+ "barcode-39-PATCHT-distortion.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
- def test_barcode_reader_distorsion2(self):
+ def test_barcode_reader_distortion_stretched(self):
+ """
+ GIVEN:
+ - Image with a stretched barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
- "barcode-39-PATCHT-distorsion2.png",
+ "barcode-39-PATCHT-distortion2.png",
)
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_unreadable(self):
+ """
+ GIVEN:
+ - Image with a truly unreadable barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - No barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-unreadable.png",
@@ -65,6 +103,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_qr(self):
+ """
+ GIVEN:
+ - Image file with QR separator barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"qr-code-PATCHT.png",
@@ -74,6 +120,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_128(self):
+ """
+ GIVEN:
+ - Image file with 128 style separator barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-128-PATCHT.png",
@@ -83,11 +137,27 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self):
+ """
+ GIVEN:
+ - Image file with no barcode
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - No barcode is detected
+ """
test_file = os.path.join(self.SAMPLE_DIR, "simple.png")
img = Image.open(test_file)
- self.assertEqual(barcodes.barcode_reader(img), [])
+ self.assertListEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self):
+ """
+ GIVEN:
+ - Image file with custom separator barcode value
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.png",
@@ -96,6 +166,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_qr_separator(self):
+ """
+ GIVEN:
+ - Image file with custom separator barcode value as a QR code
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.png",
@@ -104,6 +182,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])
def test_barcode_reader_custom_128_separator(self):
+ """
+ GIVEN:
+ - Image file with custom separator 128 barcode value
+ WHEN:
+ - Image is scanned for codes
+ THEN:
+ - The barcode is detected
+ """
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
@@ -111,6 +197,679 @@ class TestBarcode(DirectoriesMixin, TestCase):
img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])
+ def test_get_mime_type(self):
+ """
+        GIVEN:
+            - Files of different types (TIFF, PDF, PNG), with and without file extensions
+        WHEN:
+            - The file MIME type is determined
+        THEN:
+            - The correct MIME type is returned for each file
+ """
+ tiff_file = os.path.join(
+ self.SAMPLE_DIR,
+ "simple.tiff",
+ )
+ pdf_file = os.path.join(
+ self.SAMPLE_DIR,
+ "simple.pdf",
+ )
+ png_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-128-custom.png",
+ )
+ tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
+ pdf_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile2")
+ shutil.copy(tiff_file, tiff_file_no_extension)
+ shutil.copy(pdf_file, pdf_file_no_extension)
+
+ self.assertEqual(barcodes.get_file_mime_type(tiff_file), "image/tiff")
+ self.assertEqual(barcodes.get_file_mime_type(pdf_file), "application/pdf")
+ self.assertEqual(
+ barcodes.get_file_mime_type(tiff_file_no_extension),
+ "image/tiff",
+ )
+ self.assertEqual(
+ barcodes.get_file_mime_type(pdf_file_no_extension),
+ "application/pdf",
+ )
+ self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png")
+
+ def test_convert_from_tiff_to_pdf(self):
+ """
+        GIVEN:
+            - Simple TIFF file
+        WHEN:
+            - The TIFF file is converted to a PDF
+        THEN:
+            - A .pdf file is created
+ """
+ test_file = os.path.join(
+ os.path.dirname(__file__),
+ "samples",
+ "simple.tiff",
+ )
+ dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
+ shutil.copy(test_file, dst)
+ target_file = barcodes.convert_from_tiff_to_pdf(dst)
+ file_extension = os.path.splitext(os.path.basename(target_file))[1]
+ self.assertTrue(os.path.isfile(target_file))
+ self.assertEqual(file_extension, ".pdf")
+
+ def test_convert_error_from_pdf_to_pdf(self):
+ """
+ GIVEN:
+ -
+ WHEN:
+ -
+ THEN:
+ -
+ """
+ test_file = os.path.join(
+ self.SAMPLE_DIR,
+ "simple.pdf",
+ )
+ dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
+ shutil.copy(test_file, dst)
+ self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst))
+
+ def test_scan_file_for_separating_barcodes(self):
+ """
+ GIVEN:
+ -
+ WHEN:
+ -
+ THEN:
+ -
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [0])
+
+ def test_scan_file_for_separating_barcodes_none_present(self):
+ """
+ GIVEN:
+ -
+ WHEN:
+ -
+ THEN:
+ -
+ """
+ test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [])
+
+ def test_scan_file_for_separating_barcodes_middle_page(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 1 (zero indexed)
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 1 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [1])
+
+ def test_scan_file_for_separating_barcodes_multiple_pages(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on pages 2 and 5 (zero indexed)
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on pages 2 and 5 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "several-patcht-codes.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [2, 5])
+
+ def test_scan_file_for_separating_barcodes_upside_down(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 1 (zero indexed)
+ - The barcode is upside down
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 1 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle_reverse.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [1])
+
+ def test_scan_file_for_separating_barcodes_fax_decode(self):
+ """
+ GIVEN:
+ - A PDF containing an image encoded as CCITT Group 4 encoding
+ WHEN:
+ - Barcode processing happens with the file
+ THEN:
+ - The barcode is still detected
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-fax-image.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [1])
+
+ def test_scan_file_for_separating_qr_barcodes(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 0 (zero indexed)
+ - The barcode is a QR code
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 0 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-qr.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [0])
+
+ @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
+ def test_scan_file_for_separating_custom_barcodes(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 0 (zero indexed)
+ - The barcode separation value is customized
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 0 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-39-custom.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [0])
+
+ @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
+ def test_scan_file_for_separating_custom_qr_barcodes(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 0 (zero indexed)
+ - The barcode separation value is customized
+ - The barcode is a QR code
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 0 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-qr-custom.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [0])
+
+ @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
+ def test_scan_file_for_separating_custom_128_barcodes(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 0 (zero indexed)
+ - The barcode separation value is customized
+ - The barcode is a 128 code
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - Barcode is detected on page 0 (zero indexed)
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-128-custom.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [0])
+
+ def test_scan_file_for_separating_wrong_qr_barcodes(self):
+ """
+ GIVEN:
+ - PDF file containing a separator on page 0 (zero indexed)
+ - The barcode value is customized
+ - The separation value is NOT customized
+ WHEN:
+ - File is scanned for barcodes
+ THEN:
+ - No split pages are detected
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-39-custom.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [])
+
+ @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
+ def test_scan_file_for_separating_many_qr_barcodes(self):
+ """
+ GIVEN:
+ - Input PDF with certain QR codes that aren't detected at current size
+ WHEN:
+ - The input file is scanned for barcodes
+ THEN:
+ - QR codes are detected
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "many-qr-codes.pdf",
+ )
+
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertGreater(len(doc_barcode_info.barcodes), 0)
+ self.assertListEqual(separator_page_numbers, [1])
+
+ def test_separate_pages(self):
+ """
+ GIVEN:
+ - Input PDF with a separator barcode, producing 2 pages after separation
+ WHEN:
+ - The input file is separated at the barcode
+ THEN:
+ - Two new documents are produced
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.pdf",
+ )
+ documents = barcodes.separate_pages(test_file, [1])
+
+ self.assertEqual(len(documents), 2)
+
+ def test_separate_pages_double_code(self):
+ """
+ GIVEN:
+ - Input PDF with two patch code pages in a row
+ WHEN:
+ - The input file is split
+ THEN:
+ - Only two files are output
+ """
+ test_file = os.path.join(
+ os.path.dirname(__file__),
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-double.pdf",
+ )
+ pages = barcodes.separate_pages(test_file, [1, 2])
+
+ self.assertEqual(len(pages), 2)
+
+ def test_separate_pages_no_list(self):
+ """
+ GIVEN:
+ - Input file to separate
+ WHEN:
+ - No separation pages are provided
+ THEN:
+ - No new documents are produced
+ - A warning is logged
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.pdf",
+ )
+ with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
+ pages = barcodes.separate_pages(test_file, [])
+ self.assertEqual(pages, [])
+ self.assertEqual(
+ cm.output,
+ [
+ "WARNING:paperless.barcodes:No pages to split on!",
+ ],
+ )
+
+ def test_save_to_dir(self):
+ """
+ GIVEN:
+ - File to save to a directory
+ WHEN:
+ - The file is saved
+ THEN:
+ - The file exists
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t.pdf",
+ )
+ barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR)
+ target_file = os.path.join(settings.SCRATCH_DIR, "patch-code-t.pdf")
+ self.assertTrue(os.path.isfile(target_file))
+
+ def test_save_to_dir_not_existing(self):
+ """
+ GIVEN:
+ - File to save to a directory
+ - The directory doesn't exist
+ WHEN:
+ - The file is saved
+ THEN:
+ - The file exists
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t.pdf",
+ )
+ nonexistingdir = "/nowhere"
+ if os.path.isdir(nonexistingdir):
+ self.fail("non-existing dir exists")
+
+ with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
+ barcodes.save_to_dir(test_file, target_dir=nonexistingdir)
+ self.assertEqual(
+ cm.output,
+ [
+ f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.",
+ ],
+ )
+
+ def test_save_to_dir_given_name(self):
+ """
+ GIVEN:
+ - File to save to a directory
+ - There is a name override
+ WHEN:
+ - The file is saved
+ THEN:
+ - The file exists
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t.pdf",
+ )
+ barcodes.save_to_dir(
+ test_file,
+ newname="newname.pdf",
+ target_dir=settings.SCRATCH_DIR,
+ )
+ target_file = os.path.join(settings.SCRATCH_DIR, "newname.pdf")
+ self.assertTrue(os.path.isfile(target_file))
+
+ def test_barcode_splitter(self):
+ """
+ GIVEN:
+ - Input file containing barcodes
+ WHEN:
+ - Input file is split on barcodes
+ THEN:
+ - The correct number of files is produced
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.pdf",
+ )
+
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(test_file, doc_barcode_info.pdf_path)
+ self.assertTrue(len(separator_page_numbers) > 0)
+
+ document_list = barcodes.separate_pages(test_file, separator_page_numbers)
+ self.assertGreater(len(document_list), 0)
+
+ for document in document_list:
+ barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR)
+
+ target_file1 = os.path.join(
+ settings.SCRATCH_DIR,
+ "patch-code-t-middle_document_0.pdf",
+ )
+ target_file2 = os.path.join(
+ settings.SCRATCH_DIR,
+ "patch-code-t-middle_document_1.pdf",
+ )
+
+ self.assertTrue(os.path.isfile(target_file1))
+ self.assertTrue(os.path.isfile(target_file2))
+
+ @override_settings(CONSUMER_ENABLE_BARCODES=True)
+ def test_consume_barcode_file(self):
+ """
+ GIVEN:
+ - Input file with barcodes given to consume task
+ WHEN:
+ - Consume task returns
+ THEN:
+ - The file was split
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.pdf",
+ )
+
+ dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
+ shutil.copy(test_file, dst)
+
+ with mock.patch("documents.tasks.async_to_sync"):
+ self.assertEqual(tasks.consume_file(dst), "File successfully split")
+
+ @override_settings(
+ CONSUMER_ENABLE_BARCODES=True,
+ CONSUMER_BARCODE_TIFF_SUPPORT=True,
+ )
+ def test_consume_barcode_tiff_file(self):
+ """
+ GIVEN:
+ - TIFF image containing barcodes
+ WHEN:
+ - Consume task returns
+ THEN:
+ - The file was split
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.tiff",
+ )
+ dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
+ shutil.copy(test_file, dst)
+
+ with mock.patch("documents.tasks.async_to_sync"):
+ self.assertEqual(tasks.consume_file(dst), "File successfully split")
+
+ @override_settings(
+ CONSUMER_ENABLE_BARCODES=True,
+ CONSUMER_BARCODE_TIFF_SUPPORT=True,
+ )
+ @mock.patch("documents.consumer.Consumer.try_consume_file")
+ def test_consume_barcode_unsupported_jpg_file(self, m):
+ """
+ GIVEN:
+ - JPEG image as input
+ WHEN:
+ - Consume task returns
+ THEN:
+ - The barcode reader reported a warning
+ - Consumption continued with the original file
+ """
+ test_file = os.path.join(
+ self.SAMPLE_DIR,
+ "simple.jpg",
+ )
+ dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
+ shutil.copy(test_file, dst)
+
+ with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
+ self.assertIn("Success", tasks.consume_file(dst))
+
+ self.assertListEqual(
+ cm.output,
+ [
+ "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
+ ],
+ )
+ m.assert_called_once()
+
+ args, kwargs = m.call_args
+ self.assertIsNone(kwargs["override_filename"])
+ self.assertIsNone(kwargs["override_title"])
+ self.assertIsNone(kwargs["override_correspondent_id"])
+ self.assertIsNone(kwargs["override_document_type_id"])
+ self.assertIsNone(kwargs["override_tag_ids"])
+
+ @override_settings(
+ CONSUMER_ENABLE_BARCODES=True,
+ CONSUMER_BARCODE_TIFF_SUPPORT=True,
+ )
+ def test_consume_barcode_supported_no_extension_file(self):
+ """
+ GIVEN:
+ - TIFF image containing barcodes
+ - TIFF file is given without extension
+ WHEN:
+ - Consume task returns
+ THEN:
+ - The file was split
+ """
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "patch-code-t-middle.tiff",
+ )
+ dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")
+ shutil.copy(test_file, dst)
+
+ with mock.patch("documents.tasks.async_to_sync"):
+ self.assertEqual(tasks.consume_file(dst), "File successfully split")
+
+ def test_scan_file_for_separating_barcodes_password(self):
+ """
+ GIVEN:
+ - Password protected PDF
+ WHEN:
+ - File is scanned for barcode
+ THEN:
+ - Scanning handles the exception without crashing
+ """
+ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
+ with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ warning = cm.output[0]
+ expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
+ self.assertTrue(warning.startswith(expected_str))
+
+ separator_page_numbers = barcodes.get_separating_barcodes(
+ doc_barcode_info.barcodes,
+ )
+
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertListEqual(separator_page_numbers, [])
+
+
+class TestAsnBarcodes(DirectoriesMixin, TestCase):
+
+ SAMPLE_DIR = os.path.join(
+ os.path.dirname(__file__),
+ "samples",
+ )
+
+ BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
+
def test_barcode_reader_asn_normal(self):
"""
GIVEN:
@@ -163,528 +922,81 @@ class TestBarcode(DirectoriesMixin, TestCase):
img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"])
- def test_get_mime_type(self):
- tiff_file = os.path.join(
- self.SAMPLE_DIR,
- "simple.tiff",
- )
- pdf_file = os.path.join(
- self.SAMPLE_DIR,
- "simple.pdf",
- )
- png_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-128-custom.png",
- )
- tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
- pdf_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile2")
- shutil.copy(tiff_file, tiff_file_no_extension)
- shutil.copy(pdf_file, pdf_file_no_extension)
-
- self.assertEqual(barcodes.get_file_mime_type(tiff_file), "image/tiff")
- self.assertEqual(barcodes.get_file_mime_type(pdf_file), "application/pdf")
- self.assertEqual(
- barcodes.get_file_mime_type(tiff_file_no_extension),
- "image/tiff",
- )
- self.assertEqual(
- barcodes.get_file_mime_type(pdf_file_no_extension),
- "application/pdf",
- )
- self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png")
-
- def test_convert_from_tiff_to_pdf(self):
- test_file = os.path.join(
- os.path.dirname(__file__),
- "samples",
- "simple.tiff",
- )
- dst = os.path.join(settings.SCRATCH_DIR, "simple.tiff")
- shutil.copy(test_file, dst)
- target_file = barcodes.convert_from_tiff_to_pdf(dst)
- file_extension = os.path.splitext(os.path.basename(target_file))[1]
- self.assertTrue(os.path.isfile(target_file))
- self.assertEqual(file_extension, ".pdf")
-
- def test_convert_error_from_pdf_to_pdf(self):
- test_file = os.path.join(
- self.SAMPLE_DIR,
- "simple.pdf",
- )
- dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
- shutil.copy(test_file, dst)
- self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst))
-
- def test_scan_file_for_separating_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
-
- def test_scan_file_for_separating_barcodes_none_present(self):
- test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
-
- def test_scan_file_for_separating_barcodes3(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
-
- def test_scan_file_for_separating_barcodes4(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "several-patcht-codes.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [2, 5])
-
- def test_scan_file_for_separating_barcodes_upsidedown(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle_reverse.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
-
- def test_scan_file_for_barcodes_pillow_transcode_error(self):
+ @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
+ def test_scan_file_for_asn_custom_prefix(self):
"""
GIVEN:
- - A PDF containing an image which cannot be transcoded to a PIL image
+ - PDF containing an ASN barcode with custom prefix
+ - The ASN value is 123
WHEN:
- - The image tries to be transcoded to a PIL image, but fails
+ - File is scanned for barcodes
THEN:
- - The barcode reader is still called
+ - The ASN is located
+ - The ASN integer value is correct
"""
+ test_file = os.path.join(
+ self.BARCODE_SAMPLE_DIR,
+ "barcode-39-asn-custom-prefix.pdf",
+ )
+ doc_barcode_info = barcodes.scan_file_for_barcodes(
+ test_file,
+ )
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
- def _build_device_n_pdf(self, save_path: str):
- # Based on the pikepdf tests
- # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
- pdf = pikepdf.new()
- pdf.add_blank_page(page_size=(72, 72))
- imobj = pikepdf.Stream(
- pdf,
- bytes(range(0, 256)),
- BitsPerComponent=8,
- ColorSpace=pikepdf.Array(
- [
- pikepdf.Name.DeviceN,
- pikepdf.Array([pikepdf.Name.Black]),
- pikepdf.Name.DeviceCMYK,
- pikepdf.Stream(
- pdf,
- b"{0 0 0 4 -1 roll}", # Colorspace conversion function
- FunctionType=4,
- Domain=[0.0, 1.0],
- Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
- ),
- ],
- ),
- Width=16,
- Height=16,
- Type=pikepdf.Name.XObject,
- Subtype=pikepdf.Name.Image,
- )
- pim = pikepdf.PdfImage(imobj)
- self.assertEqual(pim.mode, "DeviceN")
- self.assertTrue(pim.is_device_n)
+ self.assertEqual(doc_barcode_info.pdf_path, test_file)
+ self.assertEqual(asn, 123)
- pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
- pdf.pages[0].Resources = pikepdf.Dictionary(
- XObject=pikepdf.Dictionary(Im0=imobj),
- )
- pdf.save(save_path)
-
- with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
- # Build an offending file
- _build_device_n_pdf(self, str(device_n_pdf.name))
- with mock.patch("documents.barcodes.barcode_reader") as reader:
- reader.return_value = list()
-
- _ = barcodes.scan_file_for_barcodes(
- str(device_n_pdf.name),
- )
-
- reader.assert_called()
-
- def test_scan_file_for_separating_barcodes_fax_decode(self):
+ def test_scan_file_for_asn_barcode_invalid(self):
"""
GIVEN:
- - A PDF containing an image encoded as CCITT Group 4 encoding
+ - PDF containing an ASN barcode
+ - The ASN value is XYZXYZ
WHEN:
- - Barcode processing happens with the file
+ - File is scanned for barcodes
THEN:
- - The barcode is still detected
+ - The ASN is located
+ - The ASN value is not used
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
- "barcode-fax-image.pdf",
+ "barcode-39-asn-invalid.pdf",
)
doc_barcode_info = barcodes.scan_file_for_barcodes(
test_file,
)
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
+
+ asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [1])
+ self.assertEqual(asn, None)
- def test_scan_file_for_separating_qr_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-qr.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
-
- @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
- def test_scan_file_for_separating_custom_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-39-custom.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
-
- @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
- def test_scan_file_for_separating_custom_qr_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-qr-custom.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
-
- @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
- def test_scan_file_for_separating_custom_128_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-128-custom.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [0])
-
- def test_scan_file_for_separating_wrong_qr_barcodes(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-39-custom.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
-
- @override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
- def test_scan_file_for_separating_qr_barcodes(self):
+ @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
+ def test_consume_barcode_file_asn_assignment(self):
"""
GIVEN:
- - Input PDF with certain QR codes that aren't detected at current size
+ - PDF containing an ASN barcode
+ - The ASN value is 123
WHEN:
- - The input file is scanned for barcodes
+ - File is scanned for barcodes
THEN:
- - QR codes are detected
+ - The ASN is located
+ - The ASN integer value is correct
+ - The ASN is provided as the override value to the consumer
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
- "many-qr-codes.pdf",
+ "barcode-39-asn-123.pdf",
)
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertGreater(len(doc_barcode_info.barcodes), 0)
- self.assertListEqual(separator_page_numbers, [1])
-
- def test_separate_pages(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
- pages = barcodes.separate_pages(test_file, [1])
-
- self.assertEqual(len(pages), 2)
-
- def test_separate_pages_double_code(self):
- """
- GIVEN:
- - Input PDF with two patch code pages in a row
- WHEN:
- - The input file is split
- THEN:
- - Only two files are output
- """
- test_file = os.path.join(
- os.path.dirname(__file__),
- "samples",
- "barcodes",
- "patch-code-t-double.pdf",
- )
- pages = barcodes.separate_pages(test_file, [1, 2])
-
- self.assertEqual(len(pages), 2)
-
- def test_separate_pages_no_list(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
- with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- pages = barcodes.separate_pages(test_file, [])
- self.assertEqual(pages, [])
- self.assertEqual(
- cm.output,
- [
- "WARNING:paperless.barcodes:No pages to split on!",
- ],
- )
-
- def test_save_to_dir(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t.pdf",
- )
- tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
- barcodes.save_to_dir(test_file, target_dir=tempdir)
- target_file = os.path.join(tempdir, "patch-code-t.pdf")
- self.assertTrue(os.path.isfile(target_file))
-
- def test_save_to_dir2(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t.pdf",
- )
- nonexistingdir = "/nowhere"
- if os.path.isdir(nonexistingdir):
- self.fail("non-existing dir exists")
- else:
- with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- barcodes.save_to_dir(test_file, target_dir=nonexistingdir)
- self.assertEqual(
- cm.output,
- [
- f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.",
- ],
- )
-
- def test_save_to_dir3(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t.pdf",
- )
- tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
- barcodes.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir)
- target_file = os.path.join(tempdir, "newname.pdf")
- self.assertTrue(os.path.isfile(target_file))
-
- def test_barcode_splitter(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
- tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
-
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(test_file, doc_barcode_info.pdf_path)
- self.assertTrue(len(separator_page_numbers) > 0)
-
- document_list = barcodes.separate_pages(test_file, separator_page_numbers)
- self.assertTrue(document_list)
- for document in document_list:
- barcodes.save_to_dir(document, target_dir=tempdir)
-
- target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
- target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
-
- self.assertTrue(os.path.isfile(target_file1))
- self.assertTrue(os.path.isfile(target_file2))
-
- @override_settings(CONSUMER_ENABLE_BARCODES=True)
- def test_consume_barcode_file(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.pdf",
- )
-
- dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
+ dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf")
shutil.copy(test_file, dst)
- with mock.patch("documents.tasks.async_to_sync"):
- self.assertEqual(tasks.consume_file(dst), "File successfully split")
+ with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
+ tasks.consume_file(dst)
- @override_settings(
- CONSUMER_ENABLE_BARCODES=True,
- CONSUMER_BARCODE_TIFF_SUPPORT=True,
- )
- def test_consume_barcode_tiff_file(self):
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.tiff",
- )
- dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
- shutil.copy(test_file, dst)
+ args, kwargs = mocked_call.call_args
- with mock.patch("documents.tasks.async_to_sync"):
- self.assertEqual(tasks.consume_file(dst), "File successfully split")
-
- @override_settings(
- CONSUMER_ENABLE_BARCODES=True,
- CONSUMER_BARCODE_TIFF_SUPPORT=True,
- )
- @mock.patch("documents.consumer.Consumer.try_consume_file")
- def test_consume_barcode_unsupported_jpg_file(self, m):
- """
- This test assumes barcode and TIFF support are enabled and
- the user uploads an unsupported image file (e.g. jpg)
-
- The function shouldn't try to scan for separating barcodes
- and continue archiving the file as is.
- """
- test_file = os.path.join(
- self.SAMPLE_DIR,
- "simple.jpg",
- )
- dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
- shutil.copy(test_file, dst)
- with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
- self.assertIn("Success", tasks.consume_file(dst))
- self.assertListEqual(
- cm.output,
- [
- "WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
- ],
- )
- m.assert_called_once()
-
- args, kwargs = m.call_args
- self.assertIsNone(kwargs["override_filename"])
- self.assertIsNone(kwargs["override_title"])
- self.assertIsNone(kwargs["override_correspondent_id"])
- self.assertIsNone(kwargs["override_document_type_id"])
- self.assertIsNone(kwargs["override_tag_ids"])
-
- @override_settings(
- CONSUMER_ENABLE_BARCODES=True,
- CONSUMER_BARCODE_TIFF_SUPPORT=True,
- )
- def test_consume_barcode_supported_no_extension_file(self):
- """
- This test assumes barcode and TIFF support are enabled and
- the user uploads a supported image file, but without extension
- """
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "patch-code-t-middle.tiff",
- )
- dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")
- shutil.copy(test_file, dst)
-
- with mock.patch("documents.tasks.async_to_sync"):
- self.assertEqual(tasks.consume_file(dst), "File successfully split")
-
- def test_scan_file_for_separating_barcodes_password(self):
- """
- GIVEN:
- - Password protected PDF
- - pikepdf based scanning
- WHEN:
- - File is scanned for barcode
- THEN:
- - Scanning handles the exception without exception
- """
- test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- separator_page_numbers = barcodes.get_separating_barcodes(
- doc_barcode_info.barcodes,
- )
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertListEqual(separator_page_numbers, [])
+ self.assertEqual(kwargs["override_asn"], 123)
def test_scan_file_for_asn_barcode(self):
"""
@@ -730,85 +1042,17 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(doc_barcode_info.pdf_path, test_file)
self.assertEqual(asn, None)
- def test_scan_file_for_asn_barcode_invalid(self):
- """
- GIVEN:
- - PDF containing an ASN barcode
- - The ASN value is XYZXYZ
- WHEN:
- - File is scanned for barcodes
- THEN:
- - The ASN is located
- - The ASN value is not used
- """
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-39-asn-invalid.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
-
- asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertEqual(asn, None)
-
- @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-")
- def test_scan_file_for_asn_custom_prefix(self):
- """
- GIVEN:
- - PDF containing an ASN barcode with custom prefix
- - The ASN value is 123
- WHEN:
- - File is scanned for barcodes
- THEN:
- - The ASN is located
- - The ASN integer value is correct
- """
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-39-asn-custom-prefix.pdf",
- )
- doc_barcode_info = barcodes.scan_file_for_barcodes(
- test_file,
- )
- asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
-
- self.assertEqual(doc_barcode_info.pdf_path, test_file)
- self.assertEqual(asn, 123)
-
- @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
- def test_consume_barcode_file_asn_assignment(self):
- """
- GIVEN:
- - PDF containing an ASN barcode
- - The ASN value is 123
- WHEN:
- - File is scanned for barcodes
- THEN:
- - The ASN is located
- - The ASN integer value is correct
- - The ASN is provided as the override value to the consumer
- """
- test_file = os.path.join(
- self.BARCODE_SAMPLE_DIR,
- "barcode-39-asn-123.pdf",
- )
-
- dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf")
- shutil.copy(test_file, dst)
-
- with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
- tasks.consume_file(dst)
-
- args, kwargs = mocked_call.call_args
-
- self.assertEqual(kwargs["override_asn"], 123)
-
@override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
def test_asn_too_large(self):
-
+ """
+ GIVEN:
+ - ASN from barcode enabled
+ - Barcode contains too large an ASN value
+ WHEN:
+ - ASN from barcode checked for correctness
+ THEN:
+ - Exception is raised regarding size limits
+ """
src = os.path.join(
os.path.dirname(__file__),
"samples",
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index dc86de331..de368018f 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
with tempfile.NamedTemporaryFile() as script:
with override_settings(PRE_CONSUME_SCRIPT=script.name):
c = Consumer()
- c.path = "path-to-file"
+ c.original_path = "path-to-file"
+ c.path = "/tmp/somewhere/path-to-file"
c.run_pre_consume_script()
m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
args, kwargs = m.call_args
command = kwargs["args"]
+ environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], "path-to-file")
+ self.assertDictContainsSubset(
+ {
+ "DOCUMENT_SOURCE_PATH": c.original_path,
+ "DOCUMENT_WORKING_PATH": c.path,
+ },
+ environment,
+ )
+
@mock.patch("documents.consumer.Consumer.log")
def test_script_with_output(self, mocked_log):
"""
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once()
- args, kwargs = m.call_args
+ _, kwargs = m.call_args
command = kwargs["args"]
+ environment = kwargs["env"]
self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])
+ self.assertDictContainsSubset(
+ {
+ "DOCUMENT_ID": str(doc.pk),
+ "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+ "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+ "DOCUMENT_CORRESPONDENT": "my_bank",
+ "DOCUMENT_TAGS": "a,b",
+ },
+ environment,
+ )
+
def test_script_exit_non_zero(self):
"""
GIVEN:
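
Aside: the new env assertions above mean pre- and post-consume scripts can read these values from the environment instead of relying only on positional arguments. A hypothetical post-consume script using the asserted variable names might look like the sketch below; a pre-consume script would similarly read DOCUMENT_SOURCE_PATH and DOCUMENT_WORKING_PATH. Everything beyond the variable names is illustrative.

#!/usr/bin/env python3
# Hypothetical post-consume script; the environment variable names come from the
# assertions above, everything else is illustrative.
import os

document_id = os.environ["DOCUMENT_ID"]
download_url = os.environ["DOCUMENT_DOWNLOAD_URL"]
thumbnail_url = os.environ["DOCUMENT_THUMBNAIL_URL"]
correspondent = os.environ.get("DOCUMENT_CORRESPONDENT", "")
tags = os.environ.get("DOCUMENT_TAGS", "")

print(f"Consumed document {document_id} (correspondent: {correspondent}, tags: {tags})")
print(f"Download: {download_url}  Thumbnail: {thumbnail_url}")
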
diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py
index c52c9be92..b2ec0d024 100644
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@@ -3,6 +3,7 @@ import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager
+from unittest import mock
from django.apps import apps
from django.db import connection
@@ -86,6 +87,30 @@ class DirectoriesMixin:
remove_dirs(self.dirs)
+class ConsumerProgressMixin:
+ def setUp(self) -> None:
+ self.send_progress_patcher = mock.patch(
+ "documents.consumer.Consumer._send_progress",
+ )
+ self.send_progress_mock = self.send_progress_patcher.start()
+ super().setUp()
+
+ def tearDown(self) -> None:
+ super().tearDown()
+ self.send_progress_patcher.stop()
+
+
+class DocumentConsumeDelayMixin:
+ def setUp(self) -> None:
+ self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
+ self.consume_file_mock = self.consume_file_patcher.start()
+ super().setUp()
+
+ def tearDown(self) -> None:
+ super().tearDown()
+ self.consume_file_patcher.stop()
+
+
class TestMigrations(TransactionTestCase):
@property
def app(self):
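
Aside: the two new mixins are meant to be composed with TestCase the same way DirectoriesMixin already is; because every setUp/tearDown calls super(), the patchers stack cleanly and each test gets fresh mocks. A hypothetical test class using them, not part of the patch, is sketched below.

# Hypothetical usage of the new mixins (illustrative only).
from django.test import TestCase

from documents.tests.utils import (
    ConsumerProgressMixin,
    DirectoriesMixin,
    DocumentConsumeDelayMixin,
)


class ExampleConsumeTest(
    DirectoriesMixin,
    ConsumerProgressMixin,
    DocumentConsumeDelayMixin,
    TestCase,
):
    def test_mocks_are_ready(self):
        # Each mixin's setUp() has already started its patcher via the super() chain,
        # so the mocks can be configured or asserted against directly.
        self.send_progress_mock.assert_not_called()
        self.consume_file_mock.assert_not_called()
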
diff --git a/src/documents/views.py b/src/documents/views.py
index 854f2da2b..6a719fe70 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -477,21 +477,14 @@ class DocumentViewSet(
class SearchResultSerializer(DocumentSerializer):
def to_representation(self, instance):
doc = Document.objects.get(id=instance["id"])
- comments = ""
- if hasattr(instance.results.q, "subqueries"):
- commentTerm = instance.results.q.subqueries[0]
- comments = ",".join(
- [
- str(c.comment)
- for c in Comment.objects.filter(document=instance["id"])
- if commentTerm.text in c.comment
- ],
- )
+ comments = ",".join(
+ [str(c.comment) for c in Comment.objects.filter(document=instance["id"])],
+ )
r = super().to_representation(doc)
r["__search_hit__"] = {
"score": instance.score,
"highlights": instance.highlights("content", text=doc.content),
- "comment_highlights": instance.highlights("content", text=comments)
+ "comment_highlights": instance.highlights("comments", text=comments)
if doc
else None,
"rank": instance.rank,
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index cc5d4e3c8..f1ee263aa 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser):
"paperHeight": "11.7",
"scale": "1.0",
}
+
+ # Set the output format of the resulting PDF
+ # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
+ if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
+ data["pdfFormat"] = "PDF/A-2b"
+ elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
+ data["pdfFormat"] = "PDF/A-1a"
+ elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
+ data["pdfFormat"] = "PDF/A-3b"
+
try:
response = requests.post(
url,
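
Aside: the OCR_OUTPUT_TYPE to pdfFormat mapping added here is duplicated verbatim in the Tika parser further down in this patch. A shared helper, sketched below with a hypothetical name and location, could replace both if/elif chains; the mapping values mirror the ones added in this diff.

# Hypothetical shared helper; the mapping values mirror the chains added in this patch.
from django.conf import settings

_PDF_FORMAT_BY_OUTPUT_TYPE = {
    "pdfa": "PDF/A-2b",
    "pdfa-2": "PDF/A-2b",
    "pdfa-1": "PDF/A-1a",
    "pdfa-3": "PDF/A-3b",
}


def gotenberg_pdf_format_data() -> dict:
    # Returns e.g. {"pdfFormat": "PDF/A-2b"}, or {} when no PDF/A output is requested
    pdf_format = _PDF_FORMAT_BY_OUTPUT_TYPE.get(settings.OCR_OUTPUT_TYPE)
    return {"pdfFormat": pdf_format} if pdf_format else {}

Both parsers could then call data.update(gotenberg_pdf_format_data()) before posting to Gotenberg.
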
diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py
index e02267970..809a1192f 100644
--- a/src/paperless_mail/tests/test_parsers.py
+++ b/src/paperless_mail/tests/test_parsers.py
@@ -573,8 +573,8 @@ class TestParser(TestCase):
self.parser.gotenberg_server + "/forms/chromium/convert/html",
mock_post.call_args.args[0],
)
- self.assertEqual({}, mock_post.call_args.kwargs["headers"])
- self.assertEqual(
+ self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
+ self.assertDictEqual(
{
"marginTop": "0.1",
"marginBottom": "0.1",
@@ -583,6 +583,7 @@ class TestParser(TestCase):
"paperWidth": "8.27",
"paperHeight": "11.7",
"scale": "1.0",
+ "pdfFormat": "PDF/A-2b",
},
mock_post.call_args.kwargs["data"],
)
@@ -663,8 +664,8 @@ class TestParser(TestCase):
self.parser.gotenberg_server + "/forms/chromium/convert/html",
mock_post.call_args.args[0],
)
- self.assertEqual({}, mock_post.call_args.kwargs["headers"])
- self.assertEqual(
+ self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
+ self.assertDictEqual(
{
"marginTop": "0.1",
"marginBottom": "0.1",
diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py
index 1cfb1eecb..f34ecbbab 100644
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser):
),
}
headers = {}
+ data = {}
+
+ # Set the output format of the resulting PDF
+ # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
+ if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
+ data["pdfFormat"] = "PDF/A-2b"
+ elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
+ data["pdfFormat"] = "PDF/A-1a"
+ elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
+ data["pdfFormat"] = "PDF/A-3b"
try:
- response = requests.post(url, files=files, headers=headers)
+ response = requests.post(url, files=files, headers=headers, data=data)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(