Merge remote-tracking branch 'origin/dev'

Trenton Holmes 2023-01-29 08:40:13 -08:00
commit 9aea8a7d7c
30 changed files with 1680 additions and 1053 deletions


@@ -15,6 +15,8 @@ from github import ContainerPackage
 from github import GithubBranchApi
 from github import GithubContainerRegistryApi
+import docker
+
 logger = logging.getLogger("cleanup-tags")
@@ -151,12 +153,16 @@ class RegistryTagsCleaner:
         for tag in sorted(self.tags_to_keep):
             full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
             logger.info(f"Checking manifest for {full_name}")
+            # TODO: It would be nice to use RegistryData from docker
+            # except the ID doesn't map to anything in the manifest
             try:
                 proc = subprocess.run(
                     [
                         shutil.which("docker"),
-                        "manifest",
+                        "buildx",
+                        "imagetools",
                         "inspect",
+                        "--raw",
                         full_name,
                     ],
                     capture_output=True,
@@ -241,6 +247,65 @@ class RegistryTagsCleaner:
         # By default, keep anything which is tagged
         self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))

+    def check_tags_pull(self):
+        """
+        This method uses the Docker Python SDK to confirm all tags which were
+        kept still pull, for all platforms.
+
+        TODO: This is much slower (although more comprehensive). Maybe a Pool?
+        """
+        logger.info("Beginning confirmation step")
+        client = docker.from_env()
+        imgs = []
+        for tag in sorted(self.tags_to_keep):
+            repository = f"ghcr.io/{self.repo_owner}/{self.package_name}"
+            for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]:
+                # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64
+                # skip others in this case
+                if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0":
+                    continue
+                # Skip beta and release candidate tags
+                elif "beta" in tag:
+                    continue
+
+                # Build the platform name
+                if variant is not None:
+                    platform = f"linux/{arch}/{variant}"
+                else:
+                    platform = f"linux/{arch}"
+
+                try:
+                    logger.info(f"Pulling {repository}:{tag} for {platform}")
+                    image = client.images.pull(
+                        repository=repository,
+                        tag=tag,
+                        platform=platform,
+                    )
+                    imgs.append(image)
+                except docker.errors.APIError as e:
+                    logger.error(
+                        f"Failed to pull {repository}:{tag}: {e}",
+                    )
+
+            # Prevent out of space errors by removing after a few
+            # pulls
+            if len(imgs) > 50:
+                for image in imgs:
+                    try:
+                        client.images.remove(image.id)
+                    except docker.errors.APIError as e:
+                        err_str = str(e)
+                        # Ignore attempts to remove images that are partly shared
+                        # Ignore images which are somehow gone already
+                        if (
+                            "must be forced" not in err_str
+                            and "No such image" not in err_str
+                        ):
+                            logger.error(
+                                f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}",
+                            )
+                imgs = []
+

 class MainImageTagsCleaner(RegistryTagsCleaner):
     def decide_what_tags_to_keep(self):

@@ -397,6 +462,10 @@ def _main():
     # Clean images which are untagged
     cleaner.clean_untagged(args.is_manifest)

+    # Verify remaining tags still pull
+    if args.is_manifest:
+        cleaner.check_tags_pull()
+

 if __name__ == "__main__":
     _main()


@@ -212,12 +212,6 @@ jobs:
     name: Prepare Docker Pipeline Data
     if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v'))
     runs-on: ubuntu-22.04
-    # If the push triggered the installer library workflow, wait for it to
-    # complete here. This ensures the required versions for the final
-    # image have been built, while not waiting at all if the versions haven't changed
-    concurrency:
-      group: build-installer-library
-      cancel-in-progress: false
     needs:
       - documentation
       - tests-backend


@@ -62,9 +62,9 @@ jobs:
         with:
           python-version: "3.10"
       -
-        name: Install httpx
+        name: Install Python libraries
         run: |
-          python -m pip install httpx
+          python -m pip install httpx docker
       #
       # Clean up primary package
       #

@@ -81,13 +81,3 @@ jobs:
         if: "${{ env.TOKEN != '' }}"
         run: |
           python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --untagged --delete "${{ matrix.cache-name }}"
-      #
-      # Verify tags which are left still pull
-      #
-      -
-        name: Check all tags still pull
-        run: |
-          ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }')
-          echo "Pulling all tags of ${ghcr_name}"
-          docker pull --quiet --all-tags ${ghcr_name}
-          docker image list


@@ -169,3 +169,142 @@ jobs:
           PIKEPDF_VERSION=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
           PILLOW_VERSION=${{ needs.prepare-docker-build.outputs.pillow-version }}
           LXML_VERSION=${{ needs.prepare-docker-build.outputs.lxml-version }}
+
+  commit-binary-files:
+    name: Store installers
+    needs:
+      - prepare-docker-build
+      - build-qpdf-debs
+      - build-jbig2enc
+      - build-psycopg2-wheel
+      - build-pikepdf-wheel
+    runs-on: ubuntu-22.04
+    steps:
+      -
+        name: Checkout
+        uses: actions/checkout@v3
+        with:
+          ref: binary-library
+      -
+        name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      -
+        name: Install system dependencies
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -qq --no-install-recommends tree
+      -
+        name: Extract qpdf files
+        run: |
+          version=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).version }}
+          tag=${{ fromJSON(needs.prepare-docker-build.outputs.qpdf-json).image_tag }}
+
+          docker pull --quiet ${tag}
+          docker create --name qpdf-extract ${tag}
+
+          mkdir --parents qpdf/${version}/amd64
+          docker cp qpdf-extract:/usr/src/qpdf/${version}/amd64 qpdf/${version}
+
+          mkdir --parents qpdf/${version}/arm64
+          docker cp qpdf-extract:/usr/src/qpdf/${version}/arm64 qpdf/${version}
+
+          mkdir --parents qpdf/${version}/armv7
+          docker cp qpdf-extract:/usr/src/qpdf/${version}/armv7 qpdf/${version}
+      -
+        name: Extract psycopg2 files
+        run: |
+          version=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).version }}
+          tag=${{ fromJSON(needs.prepare-docker-build.outputs.psycopg2-json).image_tag }}
+
+          docker pull --quiet --platform linux/amd64 ${tag}
+          docker create --platform linux/amd64 --name psycopg2-extract ${tag}
+          mkdir --parents psycopg2/${version}/amd64
+          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/amd64
+          mv psycopg2/${version}/amd64/wheels/* psycopg2/${version}/amd64
+          rm -r psycopg2/${version}/amd64/wheels/
+          docker rm psycopg2-extract
+
+          docker pull --quiet --platform linux/arm64 ${tag}
+          docker create --platform linux/arm64 --name psycopg2-extract ${tag}
+          mkdir --parents psycopg2/${version}/arm64
+          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/arm64
+          mv psycopg2/${version}/arm64/wheels/* psycopg2/${version}/arm64
+          rm -r psycopg2/${version}/arm64/wheels/
+          docker rm psycopg2-extract
+
+          docker pull --quiet --platform linux/arm/v7 ${tag}
+          docker create --platform linux/arm/v7 --name psycopg2-extract ${tag}
+          mkdir --parents psycopg2/${version}/armv7
+          docker cp psycopg2-extract:/usr/src/wheels/ psycopg2/${version}/armv7
+          mv psycopg2/${version}/armv7/wheels/* psycopg2/${version}/armv7
+          rm -r psycopg2/${version}/armv7/wheels/
+          docker rm psycopg2-extract
+      -
+        name: Extract pikepdf files
+        run: |
+          version=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).version }}
+          tag=${{ fromJSON(needs.prepare-docker-build.outputs.pikepdf-json).image_tag }}
+
+          docker pull --quiet --platform linux/amd64 ${tag}
+          docker create --platform linux/amd64 --name pikepdf-extract ${tag}
+          mkdir --parents pikepdf/${version}/amd64
+          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/amd64
+          mv pikepdf/${version}/amd64/wheels/* pikepdf/${version}/amd64
+          rm -r pikepdf/${version}/amd64/wheels/
+          docker rm pikepdf-extract
+
+          docker pull --quiet --platform linux/arm64 ${tag}
+          docker create --platform linux/arm64 --name pikepdf-extract ${tag}
+          mkdir --parents pikepdf/${version}/arm64
+          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/arm64
+          mv pikepdf/${version}/arm64/wheels/* pikepdf/${version}/arm64
+          rm -r pikepdf/${version}/arm64/wheels/
+          docker rm pikepdf-extract
+
+          docker pull --quiet --platform linux/arm/v7 ${tag}
+          docker create --platform linux/arm/v7 --name pikepdf-extract ${tag}
+          mkdir --parents pikepdf/${version}/armv7
+          docker cp pikepdf-extract:/usr/src/wheels/ pikepdf/${version}/armv7
+          mv pikepdf/${version}/armv7/wheels/* pikepdf/${version}/armv7
+          rm -r pikepdf/${version}/armv7/wheels/
+          docker rm pikepdf-extract
+      -
+        name: Extract jbig2enc files
+        run: |
+          version=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).version }}
+          tag=${{ fromJSON(needs.prepare-docker-build.outputs.jbig2enc-json).image_tag }}
+
+          docker pull --quiet --platform linux/amd64 ${tag}
+          docker create --platform linux/amd64 --name jbig2enc-extract ${tag}
+          mkdir --parents jbig2enc/${version}/amd64
+          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/amd64/
+          mv jbig2enc/${version}/amd64/build/* jbig2enc/${version}/amd64/
+          docker rm jbig2enc-extract
+
+          docker pull --quiet --platform linux/arm64 ${tag}
+          docker create --platform linux/arm64 --name jbig2enc-extract ${tag}
+          mkdir --parents jbig2enc/${version}/arm64
+          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/arm64
+          mv jbig2enc/${version}/arm64/build/* jbig2enc/${version}/arm64/
+          docker rm jbig2enc-extract
+
+          docker pull --quiet --platform linux/arm/v7 ${tag}
+          docker create --platform linux/arm/v7 --name jbig2enc-extract ${tag}
+          mkdir --parents jbig2enc/${version}/armv7
+          docker cp jbig2enc-extract:/usr/src/jbig2enc/build jbig2enc/${version}/armv7
+          mv jbig2enc/${version}/armv7/build/* jbig2enc/${version}/armv7/
+          docker rm jbig2enc-extract
+      -
+        name: Show file structure
+        run: |
+          tree .
+      -
+        name: Commit files
+        run: |
+          git config --global user.name "github-actions"
+          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add pikepdf/ qpdf/ psycopg2/ jbig2enc/
+          git commit -m "Updating installer packages" || true
+          git push origin || true


@@ -1,19 +1,5 @@
 # syntax=docker/dockerfile:1.4

-# Pull the installer images from the library
-# These are all built previously
-# They provide either a .deb or .whl
-ARG JBIG2ENC_VERSION
-ARG QPDF_VERSION
-ARG PIKEPDF_VERSION
-ARG PSYCOPG2_VERSION
-
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/jbig2enc:${JBIG2ENC_VERSION} as jbig2enc-builder
-FROM --platform=$BUILDPLATFORM ghcr.io/paperless-ngx/paperless-ngx/builder/qpdf:${QPDF_VERSION} as qpdf-builder
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION} as pikepdf-builder
-FROM ghcr.io/paperless-ngx/paperless-ngx/builder/psycopg2:${PSYCOPG2_VERSION} as psycopg2-builder

 FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend

 # This stage compiles the frontend
@@ -58,24 +44,21 @@ LABEL org.opencontainers.image.url="https://github.com/paperless-ngx/paperless-n
 LABEL org.opencontainers.image.licenses="GPL-3.0-only"

 ARG DEBIAN_FRONTEND=noninteractive
-# Buildx provided
+# Buildx provided, must be defined to use though
 ARG TARGETARCH
 ARG TARGETVARIANT

 # Workflow provided
+ARG JBIG2ENC_VERSION
 ARG QPDF_VERSION
+ARG PIKEPDF_VERSION
+ARG PSYCOPG2_VERSION

 #
 # Begin installation and configuration
 # Order the steps below from least often changed to most
 #

-# copy jbig2enc
-# Basically will never change again
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/.libs/libjbig2enc* /usr/local/lib/
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/jbig2 /usr/local/bin/
-COPY --from=jbig2enc-builder /usr/src/jbig2enc/src/*.h /usr/local/include/

 # Packages need for running
 ARG RUNTIME_PACKAGES="\
     # Python
@@ -198,19 +181,29 @@ RUN set -eux \
 # Install the built packages from the installer library images
 # Use mounts to avoid copying installer files into the image
 # These change sometimes
-RUN --mount=type=bind,from=qpdf-builder,target=/qpdf \
-    --mount=type=bind,from=psycopg2-builder,target=/psycopg2 \
-    --mount=type=bind,from=pikepdf-builder,target=/pikepdf \
-    set -eux \
-    && echo "Installing qpdf" \
-    && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
-    && apt-get install --yes --no-install-recommends /qpdf/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
-    && echo "Installing pikepdf and dependencies" \
-    && python3 -m pip install --no-cache-dir /pikepdf/usr/src/wheels/*.whl \
-    && python3 -m pip list \
-    && echo "Installing psycopg2" \
-    && python3 -m pip install --no-cache-dir /psycopg2/usr/src/wheels/psycopg2*.whl \
-    && python3 -m pip list
+RUN set -eux \
+    && echo "Getting binaries" \
+    && mkdir paperless-ngx \
+    && curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/paperless-ngx/archive/41d6e7e407af09a0882736d50c89b6e015997bff.tar.gz \
+    && tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \
+    && cd paperless-ngx \
+    # Setting a specific revision ensures we know what this installed
+    # and ensures cache breaking on changes
+    && echo "Installing jbig2enc" \
+    && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \
+    && cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \
+    && echo "Installing qpdf" \
+    && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
+    && apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
+    && echo "Installing pikepdf and dependencies" \
+    && python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \
+    && python3 -m pip list \
+    && echo "Installing psycopg2" \
+    && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \
+    && python3 -m pip list \
+    && echo "Cleaning up image layer" \
+    && cd ../ \
+    && rm -rf paperless-ngx

 WORKDIR /usr/src/paperless/src/


@@ -29,7 +29,20 @@ RUN set -eux \
     && ./autogen.sh \
     && ./configure \
     && make \
+    && echo "Gathering package data" \
+    && dpkg-query -f '${Package;-40}${Version}\n' -W > ./pkg-list.txt \
     && echo "Cleaning up image" \
     && apt-get -y purge ${BUILD_PACKAGES} \
     && apt-get -y autoremove --purge \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* \
+    && echo "Moving files around" \
+    && mkdir build \
+    # Unlink a symlink that causes problems
+    && unlink ./src/.libs/libjbig2enc.la \
+    # Move what the link pointed to
+    && mv ./src/libjbig2enc.la ./build/ \
+    # Move the shared library .so files
+    && mv ./src/.libs/libjbig2enc* ./build/ \
+    # And move the cli binary
+    && mv ./src/jbig2 ./build/ \
+    && mv ./pkg-list.txt ./build/


@@ -7,12 +7,17 @@
 # Default to pulling from the main repo registry when manually building
 ARG REPO="paperless-ngx/paperless-ngx"

+# This does nothing, except provide a name for a copy below
 ARG QPDF_VERSION
 FROM --platform=$BUILDPLATFORM ghcr.io/${REPO}/builder/qpdf:${QPDF_VERSION} as qpdf-builder

-# This does nothing, except provide a name for a copy below
-
-FROM python:3.9-slim-bullseye as main
+#
+# Stage: builder
+# Purpose:
+#  - Build the pikepdf wheel
+#  - Build any dependent wheels which can't be found
+#
+FROM python:3.9-slim-bullseye as builder

 LABEL org.opencontainers.image.description="A intermediate image with pikepdf wheel built"
@@ -100,3 +105,14 @@ RUN set -eux \
     && apt-get -y purge ${BUILD_PACKAGES} \
     && apt-get -y autoremove --purge \
     && rm -rf /var/lib/apt/lists/*
+
+#
+# Stage: package
+# Purpose: Holds the compiled .whl files in a tiny image to pull
+#
+FROM alpine:3.17 as package
+
+WORKDIR /usr/src/wheels/
+
+COPY --from=builder /usr/src/wheels/*.whl ./
+COPY --from=builder /usr/src/wheels/pkg-list.txt ./


@@ -2,7 +2,12 @@
 # Inputs:
 #  - PSYCOPG2_VERSION - Version to build

-FROM python:3.9-slim-bullseye as main
+#
+# Stage: builder
+# Purpose:
+#  - Build the psycopg2 wheel
+#
+FROM python:3.9-slim-bullseye as builder

 LABEL org.opencontainers.image.description="A intermediate image with psycopg2 wheel built"
@@ -48,3 +53,14 @@ RUN set -eux \
     && apt-get -y purge ${BUILD_PACKAGES} \
     && apt-get -y autoremove --purge \
     && rm -rf /var/lib/apt/lists/*
+
+#
+# Stage: package
+# Purpose: Holds the compiled .whl files in a tiny image to pull
+#
+FROM alpine:3.17 as package
+
+WORKDIR /usr/src/wheels/
+
+COPY --from=builder /usr/src/wheels/*.whl ./
+COPY --from=builder /usr/src/wheels/pkg-list.txt ./

docker-builders/README.md (new file)

@@ -0,0 +1,57 @@
# Installer Library

This folder contains the Dockerfiles for building certain installers or libraries, which are then pulled into the main image.

## [jbig2enc](https://github.com/agl/jbig2enc)

### Why

JBIG is an image coding standard which can achieve better compression of images for PDFs.

### What

The Docker image builds a shared library file and utility, which is copied into the correct location in the final image.

### Updating

1. Ensure the given qpdf version is present in [Debian bookworm](https://packages.debian.org/bookworm/qpdf)
2. Update `.build-config.json` to the given version
3. If the Debian specific version has incremented, update `Dockerfile.qpdf`

See Also:

- [OCRMyPDF Documentation](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html)

## [psycopg2](https://www.psycopg.org/)

### Why

The pre-built wheels of psycopg2 are built on Debian 9, which provides a quite old version of libpq-dev. This causes issues with authentication methods.

### What

The image builds psycopg2 wheels on Debian 10 and places the produced wheels into `/usr/src/wheels/`.

See Also:

- [Issue 266](https://github.com/paperless-ngx/paperless-ngx/issues/266)

## [qpdf](https://qpdf.readthedocs.io/en/stable/index.html)

### Why

qpdf and its library provide tools to read, manipulate and fix up PDFs. Version 11 is also required by `pikepdf` 6+, and Debian 9 does not provide anything above version 10.

### What

The Docker image cross compiles .deb installers for each supported architecture of the main image. The installers are placed in `/usr/src/qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/`

## [pikepdf](https://pikepdf.readthedocs.io/en/latest/)

### Why

Required by OCRMyPDF, this is a general purpose library for PDF manipulation in Python via the qpdf libraries.

### What

The built wheels are placed into `/usr/src/wheels/`
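For local experimentation, the extraction pattern used by the `commit-binary-files` workflow job above can be reproduced by hand. A minimal sketch, assuming the builder image naming used elsewhere in this commit; `PIKEPDF_VERSION` is a placeholder the workflow normally reads from `.build-config.json`:

```bash
# Sketch: pull a builder "package" image and copy its wheels out.
# The tag format mirrors the FROM lines in Dockerfile.pikepdf above.
tag="ghcr.io/paperless-ngx/paperless-ngx/builder/pikepdf:${PIKEPDF_VERSION}"

docker pull --quiet --platform linux/amd64 "${tag}"
docker create --platform linux/amd64 --name pikepdf-extract "${tag}"

# The package stage stores the built wheels in /usr/src/wheels/
mkdir --parents wheels
docker cp pikepdf-extract:/usr/src/wheels/ wheels
docker rm pikepdf-extract
```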


@@ -80,7 +80,7 @@ django_checks() {
 search_index() {
-    local -r index_version=1
+    local -r index_version=2
     local -r index_version_file=${DATA_DIR}/.index_version

     if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then


@@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption
 folder, but before any processing of the document is performed. This
 script can access the following relevant environment variables set:

-- `DOCUMENT_SOURCE_PATH`
+| Environment Variable    | Description                                                   |
+| ----------------------- | ------------------------------------------------------------- |
+| `DOCUMENT_SOURCE_PATH`  | Original path of the consumed document                        |
+| `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on  |
+
+!!! note
+
+    Pre-consume scripts which modify the document should only change
+    the `DOCUMENT_WORKING_PATH` file or a second consume task may
+    be triggered, leading to failures as two tasks work on the
+    same document path

 A simple but common example for this would be creating a simple script
 like this:
@@ -130,7 +140,7 @@ like this:

 ```bash
 #!/usr/bin/env bash

-pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH}
+pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH}
 ```

 `/etc/paperless.conf`
@@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and
 has moved it into paperless. It receives the following environment
 variables:

-- `DOCUMENT_ID`
-- `DOCUMENT_FILE_NAME`
-- `DOCUMENT_CREATED`
-- `DOCUMENT_MODIFIED`
-- `DOCUMENT_ADDED`
-- `DOCUMENT_SOURCE_PATH`
-- `DOCUMENT_ARCHIVE_PATH`
-- `DOCUMENT_THUMBNAIL_PATH`
-- `DOCUMENT_DOWNLOAD_URL`
-- `DOCUMENT_THUMBNAIL_URL`
-- `DOCUMENT_CORRESPONDENT`
-- `DOCUMENT_TAGS`
-- `DOCUMENT_ORIGINAL_FILENAME`
+| Environment Variable         | Description                                   |
+| ---------------------------- | --------------------------------------------- |
+| `DOCUMENT_ID`                | Database primary key of the document          |
+| `DOCUMENT_FILE_NAME`         | Formatted filename, not including paths       |
+| `DOCUMENT_CREATED`           | Date & time when document created             |
+| `DOCUMENT_MODIFIED`          | Date & time when document was last modified   |
+| `DOCUMENT_ADDED`             | Date & time when document was added           |
+| `DOCUMENT_SOURCE_PATH`       | Path to the original document file            |
+| `DOCUMENT_ARCHIVE_PATH`      | Path to the generated archive file (if any)   |
+| `DOCUMENT_THUMBNAIL_PATH`    | Path to the generated thumbnail               |
+| `DOCUMENT_DOWNLOAD_URL`      | URL for document download                     |
+| `DOCUMENT_THUMBNAIL_URL`     | URL for the document thumbnail                |
+| `DOCUMENT_CORRESPONDENT`     | Assigned correspondent (if any)               |
+| `DOCUMENT_TAGS`              | Comma separated list of tags applied (if any) |
+| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document                 |

-The script can be in any language, but for a simple shell script
-example, you can take a look at
-[post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh)
-in this project.
+The script can be in any language. A simple shell script example:

-The post consumption script cannot cancel the consumption process.
+```bash title="post-consumption-example"
+--8<-- "./scripts/post-consumption-example.sh"
+```
+
+!!! note
+
+    The post consumption script cannot cancel the consumption process.
+
+!!! warning
+
+    The post consumption script should not modify the document files
+    directly

 The script's stdout and stderr will be logged line by line to the
 webserver log, along with the exit code of the script.
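For illustration, a minimal post-consume script using only the documented variables from the table above might look like this. This is a sketch, not the bundled `scripts/post-consumption-example.sh` that the snippet include above renders:

```bash
#!/usr/bin/env bash
# Sketch of a post-consume script: it only reads the documented
# environment variables and does not modify the document files.
echo "Consumed ${DOCUMENT_FILE_NAME} (id ${DOCUMENT_ID})"
echo "Correspondent: ${DOCUMENT_CORRESPONDENT}"
echo "Tags: ${DOCUMENT_TAGS}"
echo "Download URL: ${DOCUMENT_DOWNLOAD_URL}"
```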


@@ -2,6 +2,9 @@
 ## paperless-ngx 1.12.1

+_Note: Version 1.12.x introduced searching of comments which will work for comments added after the upgrade but a reindex of the search index is required in order to be able to search
+older comments. The Docker image will automatically perform this reindex, bare metal installations will have to perform this manually, see [the docs](https://docs.paperless-ngx.com/administration/#index)._
+
 ### Bug Fixes

 - Fix: comments not showing in search until after manual reindex in v1.12 [@shamoon](https://github.com/shamoon) ([#2513](https://github.com/paperless-ngx/paperless-ngx/pull/2513))
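For bare metal installations, the manual reindex referenced in the note above is done with the `document_index` management command described in the linked administration docs. A sketch; the working directory and virtualenv depend on the install:

```bash
# Sketch: rebuild the search index on a bare metal install.
# Run from the paperless-ngx src/ directory, inside its virtualenv.
python3 manage.py document_index reindex
```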


@@ -41,6 +41,7 @@ markdown_extensions:
       anchor_linenums: true
   - pymdownx.superfences
   - pymdownx.inlinehilite
+  - pymdownx.snippets
 strict: true
 nav:
   - index.md

(File diff suppressed because it is too large)


@@ -204,6 +204,10 @@ export class DocumentDetailComponent
         )
         .subscribe({
           next: (titleValue) => {
+            // In the rare case when the field changed just after debounced event was fired.
+            // We dont want to overwrite whats actually in the text field, so just return
+            if (titleValue !== this.titleInput.value) return
+
             this.title = titleValue
             this.documentForm.patchValue({ title: titleValue })
           },


@@ -26,11 +26,11 @@
       </div>
       <p class="card-text">
         <span *ngIf="document.__search_hit__ && document.__search_hit__.highlights" [innerHtml]="document.__search_hit__.highlights"></span>
-        <span *ngIf="document.__search_hit__ && document.__search_hit__.comment_highlights" class="d-block">
+        <span *ngFor="let highlight of searchCommentHighlights" class="d-block">
           <svg width="1em" height="1em" fill="currentColor" class="me-2">
             <use xlink:href="assets/bootstrap-icons.svg#chat-left-text"/>
           </svg>
-          <span [innerHtml]="document.__search_hit__.comment_highlights"></span>
+          <span [innerHtml]="highlight"></span>
         </span>
         <span *ngIf="!document.__search_hit__" class="result-content">{{contentTrimmed}}</span>
       </p>


@@ -70,6 +70,22 @@ export class DocumentCardLargeComponent {
     }
   }

+  get searchCommentHighlights() {
+    let highlights = []
+    if (
+      this.document['__search_hit__'] &&
+      this.document['__search_hit__'].comment_highlights
+    ) {
+      // only show comments with a match
+      highlights = (
+        this.document['__search_hit__'].comment_highlights as string
+      )
+        .split(',')
+        .filter((higlight) => higlight.includes('<span'))
+    }
+    return highlights
+  }
+
   getIsThumbInverted() {
     return this.settingsService.get(SETTINGS_KEYS.DARK_MODE_THUMB_INVERTED)
   }


@@ -143,7 +143,7 @@
         <p i18n>
           <em>No tracking data is collected by the app in any way.</em>
         </p>
-        <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of thirdy-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check>
+        <app-input-check i18n-title title="Enable update checking" formControlName="updateCheckingEnabled" i18n-hint hint="Note that for users of third-party containers e.g. linuxserver.io this notification may be 'ahead' of the current third-party release."></app-input-check>
       </div>
     </div>


@@ -5,7 +5,7 @@ export const environment = {
   apiBaseUrl: document.baseURI + 'api/',
   apiVersion: '2',
   appTitle: 'Paperless-ngx',
-  version: '1.12.1',
+  version: '1.12.1-dev',
   webSocketHost: window.location.host,
   webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
   webSocketBaseUrl: base_url.pathname + 'ws/',


@@ -4,7 +4,6 @@ import shutil
 import tempfile
 from dataclasses import dataclass
 from functools import lru_cache
-from math import ceil
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -12,10 +11,9 @@ from typing import Optional
 import magic
 from django.conf import settings
 from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFPageCountError
 from pikepdf import Page
-from pikepdf import PasswordError
 from pikepdf import Pdf
-from pikepdf import PdfImage
 from PIL import Image
 from PIL import ImageSequence
 from pyzbar import pyzbar
@@ -154,52 +152,15 @@ def scan_file_for_barcodes(
         (page_number, barcode_text) tuples
     """

-    def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]:
-        detected_barcodes = []
-        with Pdf.open(pdf_filepath) as pdf:
-            for page_num, page in enumerate(pdf.pages):
-                for image_key in page.images:
-                    pdfimage = PdfImage(page.images[image_key])
-
-                    # This type is known to have issues:
-                    # https://github.com/pikepdf/pikepdf/issues/401
-                    if "/CCITTFaxDecode" in pdfimage.filters:
-                        raise BarcodeImageFormatError(
-                            "Unable to decode CCITTFaxDecode images",
-                        )
-
-                    # Not all images can be transcoded to a PIL image, which
-                    # is what pyzbar expects to receive, so this may
-                    # raise an exception, triggering fallback
-                    pillow_img = pdfimage.as_pil_image()
-
-                    # Scale the image down
-                    # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385
-                    # TLDR: zbar has issues with larger images
-                    width, height = pillow_img.size
-                    if width > 1024:
-                        scaler = ceil(width / 1024)
-                        new_width = int(width / scaler)
-                        new_height = int(height / scaler)
-                        pillow_img = pillow_img.resize((new_width, new_height))
-
-                    width, height = pillow_img.size
-                    if height > 2048:
-                        scaler = ceil(height / 2048)
-                        new_width = int(width / scaler)
-                        new_height = int(height / scaler)
-                        pillow_img = pillow_img.resize((new_width, new_height))
-
-                    for barcode_value in barcode_reader(pillow_img):
-                        detected_barcodes.append(Barcode(page_num, barcode_value))
-
-        return detected_barcodes
-
     def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]:
         detected_barcodes = []
         # use a temporary directory in case the file is too big to handle in memory
         with tempfile.TemporaryDirectory() as path:
-            pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
+            pages_from_path = convert_from_path(
+                pdf_filepath,
+                dpi=300,
+                output_folder=path,
+            )
             for current_page_number, page in enumerate(pages_from_path):
                 for barcode_value in barcode_reader(page):
                     detected_barcodes.append(
@@ -219,27 +180,19 @@ def scan_file_for_barcodes(
         # Always try pikepdf first, it's usually fine, faster and
         # uses less memory
         try:
-            barcodes = _pikepdf_barcode_scan(pdf_filepath)
+            barcodes = _pdf2image_barcode_scan(pdf_filepath)
         # Password protected files can't be checked
-        except PasswordError as e:
+        # This is the exception raised for those
+        except PDFPageCountError as e:
             logger.warning(
                 f"File is likely password protected, not checking for barcodes: {e}",
             )
-        # Handle pikepdf related image decoding issues with a fallback to page
-        # by page conversion to images in a temporary directory
-        except Exception as e:
+        # This file is really borked, allow the consumption to continue
+        # but it may fail further on
+        except Exception as e:  # pragma: no cover
             logger.warning(
-                f"Falling back to pdf2image because: {e}",
+                f"Exception during barcode scanning: {e}",
             )
-            try:
-                barcodes = _pdf2image_barcode_scan(pdf_filepath)
-            # This file is really borked, allow the consumption to continue
-            # but it may fail further on
-            except Exception as e:  # pragma: no cover
-                logger.warning(
-                    f"Exception during barcode scanning: {e}",
-                )
     else:
         logger.warning(
             f"Unsupported file format for barcode reader: {str(mime_type)}",


@@ -1,7 +1,10 @@
 import datetime
 import hashlib
 import os
+import shutil
+import tempfile
 import uuid
+from pathlib import Path
 from subprocess import CompletedProcess
 from subprocess import run
 from typing import Optional
@@ -94,7 +97,8 @@ class Consumer(LoggingMixin):
     def __init__(self):
         super().__init__()
-        self.path = None
+        self.path: Optional[Path] = None
+        self.original_path: Optional[Path] = None
         self.filename = None
         self.override_title = None
         self.override_correspondent_id = None
@@ -167,16 +171,18 @@ class Consumer(LoggingMixin):
         self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")

-        filepath_arg = os.path.normpath(self.path)
+        working_file_path = str(self.path)
+        original_file_path = str(self.original_path)

         script_env = os.environ.copy()
-        script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
+        script_env["DOCUMENT_SOURCE_PATH"] = original_file_path
+        script_env["DOCUMENT_WORKING_PATH"] = working_file_path

         try:
             completed_proc = run(
                 args=[
                     settings.PRE_CONSUME_SCRIPT,
-                    filepath_arg,
+                    original_file_path,
                 ],
                 env=script_env,
                 capture_output=True,
@@ -195,7 +201,7 @@ class Consumer(LoggingMixin):
                 exception=e,
             )

-    def run_post_consume_script(self, document):
+    def run_post_consume_script(self, document: Document):
         if not settings.POST_CONSUME_SCRIPT:
             return
@@ -285,8 +291,8 @@ class Consumer(LoggingMixin):
         Return the document object if it was successfully created.
         """

-        self.path = path
-        self.filename = override_filename or os.path.basename(path)
+        self.path = Path(path).resolve()
+        self.filename = override_filename or self.path.name
         self.override_title = override_title
         self.override_correspondent_id = override_correspondent_id
         self.override_document_type_id = override_document_type_id
@@ -311,6 +317,15 @@ class Consumer(LoggingMixin):
         self.log("info", f"Consuming {self.filename}")

+        # For the actual work, copy the file into a tempdir
+        self.original_path = self.path
+        tempdir = tempfile.TemporaryDirectory(
+            prefix="paperless-ngx",
+            dir=settings.SCRATCH_DIR,
+        )
+        self.path = Path(tempdir.name) / Path(self.filename)
+        shutil.copy(self.original_path, self.path)
+
         # Determine the parser class.
         mime_type = magic.from_file(self.path, mime=True)
@@ -453,11 +468,12 @@ class Consumer(LoggingMixin):
                 # Delete the file only if it was successfully consumed
                 self.log("debug", f"Deleting file {self.path}")
                 os.unlink(self.path)
+                self.original_path.unlink()

                 # https://github.com/jonaswinkler/paperless-ng/discussions/1037
                 shadow_file = os.path.join(
-                    os.path.dirname(self.path),
-                    "._" + os.path.basename(self.path),
+                    os.path.dirname(self.original_path),
+                    "._" + os.path.basename(self.original_path),
                 )

                 if os.path.isfile(shadow_file):
@@ -474,6 +490,7 @@ class Consumer(LoggingMixin):
                 )
         finally:
             document_parser.cleanup()
+            tempdir.cleanup()

         self.run_post_consume_script(document)

(File diff suppressed because it is too large)


@@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase):
             with tempfile.NamedTemporaryFile() as script:
                 with override_settings(PRE_CONSUME_SCRIPT=script.name):
                     c = Consumer()
-                    c.path = "path-to-file"
+                    c.original_path = "path-to-file"
+                    c.path = "/tmp/somewhere/path-to-file"
                     c.run_pre_consume_script()

                     m.assert_called_once()
@@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase):
                     args, kwargs = m.call_args

                     command = kwargs["args"]
+                    environment = kwargs["env"]

                     self.assertEqual(command[0], script.name)
                     self.assertEqual(command[1], "path-to-file")

+                    self.assertDictContainsSubset(
+                        {
+                            "DOCUMENT_SOURCE_PATH": c.original_path,
+                            "DOCUMENT_WORKING_PATH": c.path,
+                        },
+                        environment,
+                    )
+
     @mock.patch("documents.consumer.Consumer.log")
     def test_script_with_output(self, mocked_log):
         """
@@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase):
             m.assert_called_once()

-            args, kwargs = m.call_args
+            _, kwargs = m.call_args

             command = kwargs["args"]
+            environment = kwargs["env"]

             self.assertEqual(command[0], script.name)
             self.assertEqual(command[1], str(doc.pk))
@@ -972,6 +983,17 @@ class PostConsumeTestCase(TestCase):
             self.assertEqual(command[7], "my_bank")
             self.assertCountEqual(command[8].split(","), ["a", "b"])

+            self.assertDictContainsSubset(
+                {
+                    "DOCUMENT_ID": str(doc.pk),
+                    "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+                    "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+                    "DOCUMENT_CORRESPONDENT": "my_bank",
+                    "DOCUMENT_TAGS": "a,b",
+                },
+                environment,
+            )
+
     def test_script_exit_non_zero(self):
         """
         GIVEN:


@@ -3,6 +3,7 @@ import shutil
 import tempfile
 from collections import namedtuple
 from contextlib import contextmanager
+from unittest import mock

 from django.apps import apps
 from django.db import connection
@@ -86,6 +87,30 @@ class DirectoriesMixin:
         remove_dirs(self.dirs)

+class ConsumerProgressMixin:
+    def setUp(self) -> None:
+        self.send_progress_patcher = mock.patch(
+            "documents.consumer.Consumer._send_progress",
+        )
+        self.send_progress_mock = self.send_progress_patcher.start()
+        super().setUp()
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        self.send_progress_patcher.stop()
+
+
+class DocumentConsumeDelayMixin:
+    def setUp(self) -> None:
+        self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
+        self.consume_file_mock = self.consume_file_patcher.start()
+        super().setUp()
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        self.consume_file_patcher.stop()
+
+
 class TestMigrations(TransactionTestCase):
     @property
     def app(self):


@@ -477,21 +477,14 @@ class DocumentViewSet(
 class SearchResultSerializer(DocumentSerializer):
     def to_representation(self, instance):
         doc = Document.objects.get(id=instance["id"])
-        comments = ""
-        if hasattr(instance.results.q, "subqueries"):
-            commentTerm = instance.results.q.subqueries[0]
-            comments = ",".join(
-                [
-                    str(c.comment)
-                    for c in Comment.objects.filter(document=instance["id"])
-                    if commentTerm.text in c.comment
-                ],
-            )
+        comments = ",".join(
+            [str(c.comment) for c in Comment.objects.filter(document=instance["id"])],
+        )
         r = super().to_representation(doc)
         r["__search_hit__"] = {
             "score": instance.score,
             "highlights": instance.highlights("content", text=doc.content),
-            "comment_highlights": instance.highlights("content", text=comments)
+            "comment_highlights": instance.highlights("comments", text=comments)
             if doc
             else None,
             "rank": instance.rank,


@@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser):
             "paperHeight": "11.7",
             "scale": "1.0",
         }

+        # Set the output format of the resulting PDF
+        # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
+        if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
+            data["pdfFormat"] = "PDF/A-2b"
+        elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
+            data["pdfFormat"] = "PDF/A-1a"
+        elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
+            data["pdfFormat"] = "PDF/A-3b"
+
         try:
             response = requests.post(
                 url,


@@ -573,8 +573,8 @@ class TestParser(TestCase):
             self.parser.gotenberg_server + "/forms/chromium/convert/html",
             mock_post.call_args.args[0],
         )
-        self.assertEqual({}, mock_post.call_args.kwargs["headers"])
-        self.assertEqual(
+        self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
+        self.assertDictEqual(
             {
                 "marginTop": "0.1",
                 "marginBottom": "0.1",
@@ -583,6 +583,7 @@ class TestParser(TestCase):
                 "paperWidth": "8.27",
                 "paperHeight": "11.7",
                 "scale": "1.0",
+                "pdfFormat": "PDF/A-2b",
             },
             mock_post.call_args.kwargs["data"],
         )
@@ -663,8 +664,8 @@ class TestParser(TestCase):
             self.parser.gotenberg_server + "/forms/chromium/convert/html",
             mock_post.call_args.args[0],
         )
-        self.assertEqual({}, mock_post.call_args.kwargs["headers"])
-        self.assertEqual(
+        self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
+        self.assertDictEqual(
             {
                 "marginTop": "0.1",
                 "marginBottom": "0.1",


@@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser):
             ),
         }
         headers = {}
+        data = {}
+
+        # Set the output format of the resulting PDF
+        # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
+        if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
+            data["pdfFormat"] = "PDF/A-2b"
+        elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
+            data["pdfFormat"] = "PDF/A-1a"
+        elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
+            data["pdfFormat"] = "PDF/A-3b"

         try:
-            response = requests.post(url, files=files, headers=headers)
+            response = requests.post(url, files=files, headers=headers, data=data)
             response.raise_for_status()  # ensure we notice bad responses
         except Exception as err:
             raise ParseError(