Compare commits


8 Commits

Author | SHA1 | Message | Date
shamoon | 770fb2d60e | Update build-and-release.yml | 2025-09-24 15:22:15 -07:00
shamoon | c8ef9e663a | Yikes, try split ci workflow | 2025-09-24 15:03:41 -07:00
shamoon | 2195e4af45 | Ok, lets try manual Codecov comments | 2025-09-24 14:48:06 -07:00
shamoon | c6716905a4 | Revert "Chore: Enable SonarQube scanning (#10904)" (reverts commit 8d1f23e9d6) | 2025-09-24 14:38:22 -07:00
shamoon | 850ee5a415 | Revert "Chore: remove conditional from pre-commit job in CI (#10916)" (reverts commit 53b393dab5) | 2025-09-24 14:38:19 -07:00
shamoon | b25b5abdb0 | Revert "Development: try separating sonar scan" (reverts commit 68e0559053) | 2025-09-24 14:38:13 -07:00
shamoon | 68e0559053 | Development: try separating sonar scan | 2025-09-24 14:26:05 -07:00
DerRockWolf | 4ff09c4cf4 | Enhancement: support workflow path matching of barcode-split documents (#10723) | 2025-09-24 21:03:03 +00:00
23 changed files with 672 additions and 862 deletions

.github/workflows/build-and-release.yml (vendored, new file)
View File

@@ -0,0 +1,430 @@
name: 'Build and Release'
on:
  workflow_run:
    workflows:
      - ci
    types:
      - completed
permissions:
  contents: write
  packages: write
  pull-requests: write
env:
  DEFAULT_UV_VERSION: "0.8.x"
  DEFAULT_PYTHON_VERSION: "3.11"
  NLTK_DATA: "/usr/share/nltk_data"
jobs:
  prepare:
    if: >-
      github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'push'
    name: Prepare build context
    runs-on: ubuntu-24.04
    outputs:
      should-build: ${{ steps.determine.outputs.should-build }}
      ref: ${{ steps.determine.outputs.ref }}
      ref-name: ${{ steps.determine.outputs.ref-name }}
      sha: ${{ steps.determine.outputs.sha }}
      is-tag: ${{ steps.determine.outputs.is-tag }}
      is-release-target: ${{ steps.determine.outputs.is-release-target }}
      is-beta-rc: ${{ steps.determine.outputs.is-beta-rc }}
    steps:
      - name: Determine ref information
        id: determine
        uses: actions/github-script@v7
        with:
          script: |
            const run = context.payload.workflow_run;
            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const sha = run.head_sha;
            const branch = run.head_branch;
            let ref = undefined;
            let refName = undefined;
            if (branch) {
              ref = `refs/heads/${branch}`;
              refName = branch;
            } else {
              const iterator = github.paginate.iterator(
                github.rest.repos.listTags,
                {
                  owner,
                  repo,
                  per_page: 100,
                },
              );
              for await (const { data } of iterator) {
                const match = data.find((tag) => tag.commit?.sha === sha);
                if (match) {
                  ref = `refs/tags/${match.name}`;
                  refName = match.name;
                  break;
                }
              }
            }
            const outputs = {
              shouldBuild: false,
              ref: ref ?? '',
              refName: refName ?? '',
              sha,
              isTag: ref?.startsWith('refs/tags/') ?? false,
              isReleaseTarget: false,
              isBetaRc: false,
            };
            if (!ref || !refName) {
              core.info('No matching ref found for workflow run; skipping post-CI workflow.');
            } else {
              const allowed =
                ref.startsWith('refs/heads/feature-') ||
                ref.startsWith('refs/heads/fix-') ||
                ref.startsWith('refs/heads/l10n_') ||
                ref === 'refs/heads/dev' ||
                ref === 'refs/heads/beta' ||
                ref.includes('beta.rc') ||
                ref.startsWith('refs/tags/v');
              const isBetaRc = refName.includes('beta.rc');
              const isReleaseTarget = outputs.isTag && (refName.startsWith('v') || isBetaRc);
              outputs.shouldBuild = allowed;
              outputs.isReleaseTarget = isReleaseTarget;
              outputs.isBetaRc = isBetaRc;
            }
            core.setOutput('should-build', outputs.shouldBuild ? 'true' : 'false');
            core.setOutput('ref', outputs.ref);
            core.setOutput('ref-name', outputs.refName);
            core.setOutput('sha', outputs.sha);
            core.setOutput('is-tag', outputs.isTag ? 'true' : 'false');
            core.setOutput('is-release-target', outputs.isReleaseTarget ? 'true' : 'false');
            core.setOutput('is-beta-rc', outputs.isBetaRc ? 'true' : 'false');
  build-docker-image:
    needs: prepare
    if: needs.prepare.outputs.should-build == 'true'
    name: Build Docker image for ${{ needs.prepare.outputs.ref-name }}
    runs-on: ubuntu-24.04
    concurrency:
      group: ${{ github.workflow }}-build-docker-image-${{ needs.prepare.outputs.ref-name || needs.prepare.outputs.sha }}
      cancel-in-progress: true
    env:
      REF: ${{ needs.prepare.outputs.ref }}
      REF_NAME: ${{ needs.prepare.outputs.ref-name }}
      SHA: ${{ needs.prepare.outputs.sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          ref: ${{ env.SHA }}
      - name: Check pushing to Docker Hub
        id: push-other-places
        env:
          REPOSITORY_OWNER: ${{ github.repository_owner }}
          REF_NAME: ${{ env.REF_NAME }}
          REF: ${{ env.REF }}
        run: |
          if [[ "$REPOSITORY_OWNER" == "paperless-ngx" ]] && \
             ([[ "$REF_NAME" == "dev" ]] || [[ "$REF_NAME" == "beta" ]] || [[ "$REF" == refs/tags/v* ]]); then
            echo "Enabling DockerHub image push"
            echo "enable=true" >> "$GITHUB_OUTPUT"
          else
            echo "Not pushing to DockerHub"
            echo "enable=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Set ghcr repository name
        id: set-ghcr-repository
        run: |
          ghcr_name=$(echo "${{ github.repository }}" | awk '{ print tolower($0) }')
          echo "Name is ${ghcr_name}"
          echo "ghcr-repository=${ghcr_name}" >> "$GITHUB_OUTPUT"
      - name: Gather Docker metadata
        id: docker-meta
        uses: docker/metadata-action@v5
        env:
          GITHUB_REF: ${{ env.REF }}
          GITHUB_REF_NAME: ${{ env.REF_NAME }}
          GITHUB_SHA: ${{ env.SHA }}
        with:
          images: |
            ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}
            name=paperlessngx/paperless-ngx,enable=${{ steps.push-other-places.outputs.enable }}
            name=quay.io/paperlessngx/paperless-ngx,enable=${{ steps.push-other-places.outputs.enable }}
          tags: |
            type=ref,event=branch
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to Docker Hub
        if: steps.push-other-places.outputs.enable == 'true'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Login to Quay.io
        if: steps.push-other-places.outputs.enable == 'true'
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.docker-meta.outputs.tags }}
          labels: ${{ steps.docker-meta.outputs.labels }}
          build-args: |
            PNGX_TAG_VERSION=${{ steps.docker-meta.outputs.version }}
          cache-from: |
            type=registry,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:${{ env.REF_NAME }}
            type=registry,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:dev
          cache-to: |
            type=registry,mode=max,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:${{ env.REF_NAME }}
      - name: Inspect image
        run: |
          docker buildx imagetools inspect ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
      - name: Export frontend artifact from docker
        run: |
          docker create --name frontend-extract ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
          docker cp frontend-extract:/usr/src/paperless/src/documents/static/frontend src/documents/static/frontend/
      - name: Upload frontend artifact
        uses: actions/upload-artifact@v4
        with:
          name: frontend-compiled
          path: src/documents/static/frontend/
          retention-days: 7
  build-release:
    needs:
      - prepare
      - build-docker-image
    if: needs.prepare.outputs.should-build == 'true'
    name: Build release bundle
    runs-on: ubuntu-24.04
    env:
      REF_NAME: ${{ needs.prepare.outputs.ref-name }}
      SHA: ${{ needs.prepare.outputs.sha }}
      CI_RUN_ID: ${{ github.event.workflow_run.id }}
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          ref: ${{ env.SHA }}
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          version: ${{ env.DEFAULT_UV_VERSION }}
          enable-cache: true
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install Python dependencies
        run: |
          uv sync --python ${{ steps.setup-python.outputs.python-version }} --dev --frozen
      - name: Install system dependencies
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq --no-install-recommends gettext liblept5
      - name: Download frontend artifact
        uses: actions/download-artifact@v5
        with:
          name: frontend-compiled
          path: src/documents/static/frontend/
      - name: Download documentation artifact
        uses: actions/download-artifact@v5
        with:
          name: documentation
          path: docs/_build/html/
          run-id: ${{ env.CI_RUN_ID }}
      - name: Generate requirements file
        run: |
          uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt
      - name: Compile messages
        run: |
          cd src/
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            manage.py compilemessages
      - name: Collect static files
        run: |
          cd src/
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            manage.py collectstatic --no-input
      - name: Move files
        run: |
          echo "Making dist folders"
          for directory in dist \
            dist/paperless-ngx \
            dist/paperless-ngx/scripts;
          do
            mkdir --verbose --parents ${directory}
          done
          echo "Copying basic files"
          for file_name in .dockerignore \
            .env \
            Dockerfile \
            pyproject.toml \
            uv.lock \
            requirements.txt \
            LICENSE \
            README.md \
            paperless.conf.example
          do
            cp --verbose ${file_name} dist/paperless-ngx/
          done
          mv --verbose dist/paperless-ngx/paperless.conf.example dist/paperless-ngx/paperless.conf
          echo "Copying Docker related files"
          cp --recursive docker/ dist/paperless-ngx/docker
          echo "Copying startup scripts"
          cp --verbose scripts/*.service scripts/*.sh scripts/*.socket dist/paperless-ngx/scripts/
          echo "Copying source files"
          cp --recursive src/ dist/paperless-ngx/src
          echo "Copying documentation"
          cp --recursive docs/_build/html/ dist/paperless-ngx/docs
          mv --verbose static dist/paperless-ngx
      - name: Make release package
        run: |
          echo "Creating release archive"
          cd dist
          sudo chown -R 1000:1000 paperless-ngx/
          tar -cJf paperless-ngx.tar.xz paperless-ngx/
      - name: Upload release artifact
        uses: actions/upload-artifact@v4
        with:
          name: release
          path: dist/paperless-ngx.tar.xz
          retention-days: 7
  publish-release:
    needs:
      - prepare
      - build-release
    if: needs.prepare.outputs.is-release-target == 'true'
    name: Publish release
    runs-on: ubuntu-24.04
    outputs:
      prerelease: ${{ steps.get_version.outputs.prerelease }}
      changelog: ${{ steps.create-release.outputs.body }}
      version: ${{ steps.get_version.outputs.version }}
    steps:
      - name: Download release artifact
        uses: actions/download-artifact@v5
        with:
          name: release
          path: ./
      - name: Get version
        id: get_version
        run: |
          echo "version=${{ needs.prepare.outputs.ref-name }}" >> "$GITHUB_OUTPUT"
          if [[ ${{ needs.prepare.outputs.is-beta-rc }} == 'true' ]]; then
            echo "prerelease=true" >> "$GITHUB_OUTPUT"
          else
            echo "prerelease=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Create Release and Changelog
        id: create-release
        uses: release-drafter/release-drafter@v6
        with:
          name: Paperless-ngx ${{ steps.get_version.outputs.version }}
          tag: ${{ steps.get_version.outputs.version }}
          version: ${{ steps.get_version.outputs.version }}
          prerelease: ${{ steps.get_version.outputs.prerelease }}
          publish: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Upload release archive
        id: upload-release-asset
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          upload_url: ${{ steps.create-release.outputs.upload_url }}
          asset_path: ./paperless-ngx.tar.xz
          asset_name: paperless-ngx-${{ steps.get_version.outputs.version }}.tar.xz
          asset_content_type: application/x-xz
  append-changelog:
    needs:
      - publish-release
    if: needs.publish-release.outputs.prerelease == 'false'
    name: Append changelog to docs
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          ref: main
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          version: ${{ env.DEFAULT_UV_VERSION }}
          enable-cache: true
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Append Changelog to docs
        id: append-Changelog
        working-directory: docs
        run: |
          git branch ${{ needs.publish-release.outputs.version }}-changelog
          git checkout ${{ needs.publish-release.outputs.version }}-changelog
          echo -e "# Changelog\n\n${{ needs.publish-release.outputs.changelog }}\n" > changelog-new.md
          echo "Manually linking usernames"
          sed -i -r 's|@([a-zA-Z0-9_]+) \(\[#|[@\1](https://github.com/\1) ([#|g' changelog-new.md
          echo "Removing unneeded comment tags"
          sed -i -r 's|@<!---->|@|g' changelog-new.md
          CURRENT_CHANGELOG=`tail --lines +2 changelog.md`
          echo -e "$CURRENT_CHANGELOG" >> changelog-new.md
          mv changelog-new.md changelog.md
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            --dev \
            pre-commit run --files changelog.md || true
          git config --global user.name "github-actions"
          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git commit -am "Changelog ${{ needs.publish-release.outputs.version }} - GHA"
          git push origin ${{ needs.publish-release.outputs.version }}-changelog
      - name: Create Pull Request
        uses: actions/github-script@v7
        with:
          script: |
            const { repo, owner } = context.repo;
            const result = await github.rest.pulls.create({
              title: 'Documentation: Add ${{ needs.publish-release.outputs.version }} changelog',
              owner,
              repo,
              head: '${{ needs.publish-release.outputs.version }}-changelog',
              base: 'main',
              body: 'This PR is auto-generated by CI.'
            });
            github.rest.issues.addLabels({
              owner,
              repo,
              issue_number: result.data.number,
              labels: ['documentation', 'skip-changelog']
            });
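A note on the trigger: `workflow_run` only fires for a workflow whose `name:` matches the listed `ci`, and only once that workflow file exists on the default branch. A minimal illustrative stub of the paired CI workflow (not part of this diff; the job contents are placeholders):

# Illustrative stub only: the companion workflow must be named exactly "ci"
# for the workflow_run trigger in build-and-release.yml to fire.
name: ci
on:
  push:
  pull_request:
jobs:
  tests:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v5
      # ... linting, backend/frontend tests, documentation build ...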

View File

@@ -17,52 +17,11 @@ env:
DEFAULT_PYTHON_VERSION: "3.11" DEFAULT_PYTHON_VERSION: "3.11"
NLTK_DATA: "/usr/share/nltk_data" NLTK_DATA: "/usr/share/nltk_data"
jobs: jobs:
detect-duplicate:
name: Detect Duplicate Run
runs-on: ubuntu-24.04
outputs:
should_run: ${{ steps.check.outputs.should_run }}
steps:
- name: Check if workflow should run
id: check
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
if (context.eventName !== 'push') {
core.info('Not a push event; running workflow.');
core.setOutput('should_run', 'true');
return;
}
const ref = context.ref || '';
if (!ref.startsWith('refs/heads/')) {
core.info('Push is not to a branch; running workflow.');
core.setOutput('should_run', 'true');
return;
}
const branch = ref.substring('refs/heads/'.length);
const { owner, repo } = context.repo;
const prs = await github.paginate(github.rest.pulls.list, {
owner,
repo,
state: 'open',
head: `${owner}:${branch}`,
per_page: 100,
});
if (prs.length === 0) {
core.info(`No open PR found for ${branch}; running workflow.`);
core.setOutput('should_run', 'true');
} else {
core.info(`Found ${prs.length} open PR(s) for ${branch}; skipping duplicate push run.`);
core.setOutput('should_run', 'false');
}
pre-commit: pre-commit:
needs: # We want to run on external PRs, but not on our own internal PRs as they'll be run
- detect-duplicate # by the push to the branch. Without this if check, checks are duplicated since
if: needs.detect-duplicate.outputs.should_run == 'true' # internal PRs match both the push and pull_request events.
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
name: Linting Checks name: Linting Checks
runs-on: ubuntu-24.04 runs-on: ubuntu-24.04
steps: steps:
@@ -192,18 +151,6 @@ jobs:
          token: ${{ secrets.CODECOV_TOKEN }}
          flags: backend-python-${{ matrix.python-version }}
          files: coverage.xml
      - name: Upload coverage artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: backend-coverage-${{ matrix.python-version }}
          path: |
            .coverage
            coverage.xml
            junit.xml
          retention-days: 1
          include-hidden-files: true
          if-no-files-found: error
      - name: Stop containers
        if: always()
        run: |
@@ -286,17 +233,6 @@ jobs:
          token: ${{ secrets.CODECOV_TOKEN }}
          flags: frontend-node-${{ matrix.node-version }}
          directory: src-ui/coverage/
      - name: Upload coverage artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: frontend-coverage-${{ matrix.shard-index }}
          path: |
            src-ui/coverage/lcov.info
            src-ui/coverage/coverage-final.json
            src-ui/junit.xml
          retention-days: 1
          if-no-files-found: error
  tests-frontend-e2e:
    name: "Frontend E2E Tests (Node ${{ matrix.node-version }} - ${{ matrix.shard-index }}/${{ matrix.shard-count }})"
    runs-on: ubuntu-24.04
@@ -377,392 +313,3 @@ jobs:
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: cd src-ui && pnpm run build --configuration=production
  sonarqube-analysis:
    name: "SonarQube Analysis"
    runs-on: ubuntu-24.04
    needs:
      - tests-backend
      - tests-frontend
    if: github.repository_owner == 'paperless-ngx'
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          fetch-depth: 0
      - name: Download all backend coverage
        uses: actions/download-artifact@v5.0.0
        with:
          pattern: backend-coverage-*
          path: ./coverage/
      - name: Download all frontend coverage
        uses: actions/download-artifact@v5.0.0
        with:
          pattern: frontend-coverage-*
          path: ./coverage/
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install coverage tools
        run: |
          pip install coverage
          npm install -g nyc
      # Merge backend coverage from all Python versions
      - name: Merge backend coverage
        run: |
          coverage combine coverage/backend-coverage-*/.coverage
          coverage xml -o merged-backend-coverage.xml
      # Merge frontend coverage from all shards
      - name: Merge frontend coverage
        run: |
          # Find all coverage-final.json files from the shards, exit with error if none found
          shopt -s nullglob
          files=(coverage/frontend-coverage-*/coverage/coverage-final.json)
          if [ ${#files[@]} -eq 0 ]; then
            echo "No frontend coverage JSON found under coverage/" >&2
            exit 1
          fi
          # Create .nyc_output directory and copy each shard's coverage JSON into it with a unique name
          mkdir -p .nyc_output
          for coverage_json in "${files[@]}"; do
            shard=$(basename "$(dirname "$(dirname "$coverage_json")")")
            cp "$coverage_json" ".nyc_output/${shard}.json"
          done
          npx nyc merge .nyc_output .nyc_output/out.json
          npx nyc report --reporter=lcovonly --report-dir coverage
      - name: Upload coverage artifacts
        uses: actions/upload-artifact@v4.6.2
        with:
          name: merged-coverage
          path: |
            merged-backend-coverage.xml
            .nyc_output/*
            coverage/lcov.info
          retention-days: 7
          if-no-files-found: error
          include-hidden-files: true
      - name: SonarQube Analysis
        uses: SonarSource/sonarqube-scan-action@v5
        env:
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
  build-docker-image:
    name: Build Docker image for ${{ github.ref_name }}
    runs-on: ubuntu-24.04
    if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || startsWith(github.ref, 'refs/heads/fix-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/l10n_'))
    concurrency:
      group: ${{ github.workflow }}-build-docker-image-${{ github.ref_name }}
      cancel-in-progress: true
    needs:
      - tests-backend
      - tests-frontend
      - tests-frontend-e2e
    steps:
      - name: Check pushing to Docker Hub
        id: push-other-places
        # Only push to Dockerhub from the main repo AND the ref is either:
        #  main
        #  dev
        #  beta
        #  a tag
        # Otherwise forks would require a Docker Hub account and secrets setup
        run: |
          if [[ ${{ github.repository_owner }} == "paperless-ngx" && ( ${{ github.ref_name }} == "dev" || ${{ github.ref_name }} == "beta" || ${{ startsWith(github.ref, 'refs/tags/v') }} == "true" ) ]] ; then
            echo "Enabling DockerHub image push"
            echo "enable=true" >> $GITHUB_OUTPUT
          else
            echo "Not pushing to DockerHub"
            echo "enable=false" >> $GITHUB_OUTPUT
          fi
      - name: Set ghcr repository name
        id: set-ghcr-repository
        run: |
          ghcr_name=$(echo "${{ github.repository }}" | awk '{ print tolower($0) }')
          echo "Name is ${ghcr_name}"
          echo "ghcr-repository=${ghcr_name}" >> $GITHUB_OUTPUT
      - name: Gather Docker metadata
        id: docker-meta
        uses: docker/metadata-action@v5
        with:
          images: |
            ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}
            name=paperlessngx/paperless-ngx,enable=${{ steps.push-other-places.outputs.enable }}
            name=quay.io/paperlessngx/paperless-ngx,enable=${{ steps.push-other-places.outputs.enable }}
          tags: |
            # Tag branches with branch name
            type=ref,event=branch
            # Process semver tags
            # For a tag x.y.z or vX.Y.Z, output an x.y.z and x.y image tag
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
      - name: Checkout
        uses: actions/checkout@v5
      # If https://github.com/docker/buildx/issues/1044 is resolved,
      # the append input with a native arm64 arch could be used to
      # significantly speed up building
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        # Don't attempt to login if not pushing to Docker Hub
        if: steps.push-other-places.outputs.enable == 'true'
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Login to Quay.io
        uses: docker/login-action@v3
        # Don't attempt to login if not pushing to Quay.io
        if: steps.push-other-places.outputs.enable == 'true'
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          platforms: linux/amd64,linux/arm64
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.docker-meta.outputs.tags }}
          labels: ${{ steps.docker-meta.outputs.labels }}
          build-args: |
            PNGX_TAG_VERSION=${{ steps.docker-meta.outputs.version }}
          # Get cache layers from this branch, then dev
          # This allows new branches to get at least some cache benefits, generally from dev
          cache-from: |
            type=registry,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:${{ github.ref_name }}
            type=registry,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:dev
          cache-to: |
            type=registry,mode=max,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:${{ github.ref_name }}
      - name: Inspect image
        run: |
          docker buildx imagetools inspect ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
      - name: Export frontend artifact from docker
        run: |
          docker create --name frontend-extract ${{ fromJSON(steps.docker-meta.outputs.json).tags[0] }}
          docker cp frontend-extract:/usr/src/paperless/src/documents/static/frontend src/documents/static/frontend/
      - name: Upload frontend artifact
        uses: actions/upload-artifact@v4
        with:
          name: frontend-compiled
          path: src/documents/static/frontend/
          retention-days: 7
  build-release:
    name: "Build Release"
    needs:
      - build-docker-image
      - documentation
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout
        uses: actions/checkout@v5
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          version: ${{ env.DEFAULT_UV_VERSION }}
          enable-cache: true
          python-version: ${{ steps.setup-python.outputs.python-version }}
      - name: Install Python dependencies
        run: |
          uv sync --python ${{ steps.setup-python.outputs.python-version }} --dev --frozen
      - name: Install system dependencies
        run: |
          sudo apt-get update -qq
          sudo apt-get install -qq --no-install-recommends gettext liblept5
      - name: Download frontend artifact
        uses: actions/download-artifact@v5
        with:
          name: frontend-compiled
          path: src/documents/static/frontend/
      - name: Download documentation artifact
        uses: actions/download-artifact@v5
        with:
          name: documentation
          path: docs/_build/html/
      - name: Generate requirements file
        run: |
          uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt
      - name: Compile messages
        run: |
          cd src/
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            manage.py compilemessages
      - name: Collect static files
        run: |
          cd src/
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            manage.py collectstatic --no-input
      - name: Move files
        run: |
          echo "Making dist folders"
          for directory in dist \
            dist/paperless-ngx \
            dist/paperless-ngx/scripts;
          do
            mkdir --verbose --parents ${directory}
          done
          echo "Copying basic files"
          for file_name in .dockerignore \
            .env \
            Dockerfile \
            pyproject.toml \
            uv.lock \
            requirements.txt \
            LICENSE \
            README.md \
            paperless.conf.example
          do
            cp --verbose ${file_name} dist/paperless-ngx/
          done
          mv --verbose dist/paperless-ngx/paperless.conf.example dist/paperless-ngx/paperless.conf
          echo "Copying Docker related files"
          cp --recursive docker/ dist/paperless-ngx/docker
          echo "Copying startup scripts"
          cp --verbose scripts/*.service scripts/*.sh scripts/*.socket dist/paperless-ngx/scripts/
          echo "Copying source files"
          cp --recursive src/ dist/paperless-ngx/src
          echo "Copying documentation"
          cp --recursive docs/_build/html/ dist/paperless-ngx/docs
          mv --verbose static dist/paperless-ngx
      - name: Make release package
        run: |
          echo "Creating release archive"
          cd dist
          sudo chown -R 1000:1000 paperless-ngx/
          tar -cJf paperless-ngx.tar.xz paperless-ngx/
      - name: Upload release artifact
        uses: actions/upload-artifact@v4
        with:
          name: release
          path: dist/paperless-ngx.tar.xz
          retention-days: 7
  publish-release:
    name: "Publish Release"
    runs-on: ubuntu-24.04
    outputs:
      prerelease: ${{ steps.get_version.outputs.prerelease }}
      changelog: ${{ steps.create-release.outputs.body }}
      version: ${{ steps.get_version.outputs.version }}
    needs:
      - build-release
    if: github.ref_type == 'tag' && (startsWith(github.ref_name, 'v') || contains(github.ref_name, '-beta.rc'))
    steps:
      - name: Download release artifact
        uses: actions/download-artifact@v5
        with:
          name: release
          path: ./
      - name: Get version
        id: get_version
        run: |
          echo "version=${{ github.ref_name }}" >> $GITHUB_OUTPUT
          if [[ ${{ contains(github.ref_name, '-beta.rc') }} == 'true' ]]; then
            echo "prerelease=true" >> $GITHUB_OUTPUT
          else
            echo "prerelease=false" >> $GITHUB_OUTPUT
          fi
      - name: Create Release and Changelog
        id: create-release
        uses: release-drafter/release-drafter@v6
        with:
          name: Paperless-ngx ${{ steps.get_version.outputs.version }}
          tag: ${{ steps.get_version.outputs.version }}
          version: ${{ steps.get_version.outputs.version }}
          prerelease: ${{ steps.get_version.outputs.prerelease }}
          publish: true # ensures release is not marked as draft
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Upload release archive
        id: upload-release-asset
        uses: shogo82148/actions-upload-release-asset@v1
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          upload_url: ${{ steps.create-release.outputs.upload_url }}
          asset_path: ./paperless-ngx.tar.xz
          asset_name: paperless-ngx-${{ steps.get_version.outputs.version }}.tar.xz
          asset_content_type: application/x-xz
  append-changelog:
    name: "Append Changelog"
    runs-on: ubuntu-24.04
    needs:
      - publish-release
    if: needs.publish-release.outputs.prerelease == 'false'
    steps:
      - name: Checkout
        uses: actions/checkout@v5
        with:
          ref: main
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          version: ${{ env.DEFAULT_UV_VERSION }}
          enable-cache: true
          python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
      - name: Append Changelog to docs
        id: append-Changelog
        working-directory: docs
        run: |
          git branch ${{ needs.publish-release.outputs.version }}-changelog
          git checkout ${{ needs.publish-release.outputs.version }}-changelog
          echo -e "# Changelog\n\n${{ needs.publish-release.outputs.changelog }}\n" > changelog-new.md
          echo "Manually linking usernames"
          sed -i -r 's|@([a-zA-Z0-9_]+) \(\[#|[@\1](https://github.com/\1) ([#|g' changelog-new.md
          echo "Removing unneeded comment tags"
          sed -i -r 's|@<!---->|@|g' changelog-new.md
          CURRENT_CHANGELOG=`tail --lines +2 changelog.md`
          echo -e "$CURRENT_CHANGELOG" >> changelog-new.md
          mv changelog-new.md changelog.md
          uv run \
            --python ${{ steps.setup-python.outputs.python-version }} \
            --dev \
            pre-commit run --files changelog.md || true
          git config --global user.name "github-actions"
          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git commit -am "Changelog ${{ needs.publish-release.outputs.version }} - GHA"
          git push origin ${{ needs.publish-release.outputs.version }}-changelog
      - name: Create Pull Request
        uses: actions/github-script@v7
        with:
          script: |
            const { repo, owner } = context.repo;
            const result = await github.rest.pulls.create({
              title: 'Documentation: Add ${{ needs.publish-release.outputs.version }} changelog',
              owner,
              repo,
              head: '${{ needs.publish-release.outputs.version }}-changelog',
              base: 'main',
              body: 'This PR is auto-generated by CI.'
            });
            github.rest.issues.addLabels({
              owner,
              repo,
              issue_number: result.data.number,
              labels: ['documentation', 'skip-changelog']
            });

.github/workflows/codecov-comment.yml (vendored, new file)
View File

@@ -0,0 +1,220 @@
name: Codecov PR Comment
on:
  workflow_run:
    workflows:
      - ci
    types:
      - completed
permissions:
  contents: read
  pull-requests: write
jobs:
  comment:
    if: >-
      github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-24.04
    steps:
      - name: Gather pull request context
        id: pr
        uses: actions/github-script@v7
        with:
          script: |
            const run = context.payload.workflow_run;
            if (!run.pull_requests || run.pull_requests.length === 0) {
              core.info('No associated pull request. Skipping.');
              return { shouldRun: false };
            }
            const pr = run.pull_requests[0];
            return {
              shouldRun: true,
              prNumber: pr.number,
              headSha: run.head_sha,
            };
      - name: Fetch Codecov coverage
        id: coverage
        if: steps.pr.outputs.shouldRun == 'true'
        uses: actions/github-script@v7
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
          COMMIT_SHA: ${{ steps.pr.outputs.headSha }}
        with:
          script: |
            const token = process.env.CODECOV_TOKEN;
            if (!token) {
              core.warning('CODECOV_TOKEN secret is not available; skipping comment.');
              core.setOutput('shouldComment', 'false');
              return;
            }
            const commitSha = process.env.COMMIT_SHA;
            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const url = `https://codecov.io/api/v2/github/${owner}/repos/${repo}/commits/${commitSha}/report`;
            const maxAttempts = 10;
            const waitMs = 15000;
            const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
            let data;
            for (let attempt = 1; attempt <= maxAttempts; attempt++) {
              core.info(`Fetching Codecov report (attempt ${attempt}/${maxAttempts})`);
              const response = await fetch(url, {
                headers: {
                  Authorization: `Bearer ${token}`,
                  'Content-Type': 'application/json',
                  Accept: 'application/json',
                },
              });
              if (response.status === 404) {
                core.info('Report not ready yet (404). Waiting before retrying.');
                await sleep(waitMs);
                continue;
              }
              if (!response.ok) {
                const text = await response.text();
                throw new Error(`Codecov API returned ${response.status}: ${text}`);
              }
              data = await response.json();
              if (data && Object.keys(data).length > 0) {
                break;
              }
              core.info('Report payload empty. Waiting before retrying.');
              await sleep(waitMs);
            }
            if (!data) {
              core.warning('Unable to retrieve Codecov report after multiple attempts.');
              core.setOutput('shouldComment', 'false');
              return;
            }
            const totals = data.report?.totals ?? data.commit?.totals ?? data.totals;
            if (!totals) {
              core.warning('Codecov response does not contain coverage totals.');
              core.setOutput('shouldComment', 'false');
              return;
            }
            const compareTotals = data.report?.compare?.totals ?? data.compare?.totals;
            const flagsRaw = data.report?.totals_by_flag ?? data.report?.components ?? [];
            const toNumber = (value) => {
              if (value === null || value === undefined || value === '') {
                return undefined;
              }
              const num = Number(value);
              return Number.isFinite(num) ? num : undefined;
            };
            const coverage = toNumber(totals.coverage);
            const baseCoverage = toNumber(compareTotals?.base_coverage ?? compareTotals?.base);
            const delta = toNumber(
              compareTotals?.coverage_change ??
                compareTotals?.coverage_diff ??
                totals.delta ??
                totals.diff ??
                totals.change,
            );
            const formatPercent = (value) => {
              if (value === undefined) return '—';
              return `${value.toFixed(2)}%`;
            };
            const formatDelta = (value) => {
              if (value === undefined) return '—';
              const sign = value >= 0 ? '+' : '';
              return `${sign}${value.toFixed(2)}%`;
            };
            const shortSha = commitSha.slice(0, 7);
            const lines = [
              '<!-- codecov-coverage-comment -->',
              '**Codecov Coverage**',
              '',
              `- Head \`${shortSha}\`: ${formatPercent(coverage)}`,
            ];
            if (baseCoverage !== undefined) {
              lines.push(`- Base: ${formatPercent(baseCoverage)}`);
            }
            if (delta !== undefined) {
              lines.push(`- Change: ${formatDelta(delta)}`);
            }
            const flagEntries = Array.isArray(flagsRaw)
              ? flagsRaw
              : Object.entries(flagsRaw).map(([name, totals]) => ({ name, totals }));
            const flagRows = [];
            for (const entry of flagEntries) {
              const label = entry.flag ?? entry.name ?? entry.component ?? entry.id;
              const entryTotals = entry.totals ?? entry;
              const entryCoverage = toNumber(entryTotals?.coverage);
              const entryDelta = toNumber(
                entryTotals?.coverage_change ??
                  entryTotals?.coverage_diff ??
                  entryTotals?.delta ??
                  entryTotals?.diff,
              );
              if (!label || entryCoverage === undefined) {
                continue;
              }
              flagRows.push(`| ${label} | ${formatPercent(entryCoverage)} | ${formatDelta(entryDelta)} |`);
            }
            if (flagRows.length) {
              lines.push('');
              lines.push('| Flag | Coverage | Change |');
              lines.push('| --- | --- | --- |');
              lines.push(...flagRows);
            }
            const commentBody = lines.join('\n');
            const shouldComment = coverage !== undefined;
            core.setOutput('shouldComment', shouldComment ? 'true' : 'false');
            if (shouldComment) {
              core.setOutput('commentBody', commentBody);
            }
      - name: Upsert coverage comment
        if: steps.pr.outputs.shouldRun == 'true' && steps.coverage.outputs.shouldComment == 'true'
        uses: actions/github-script@v7
        env:
          PR_NUMBER: ${{ steps.pr.outputs.prNumber }}
          COMMENT_BODY: ${{ steps.coverage.outputs.commentBody }}
        with:
          script: |
            const prNumber = Number(process.env.PR_NUMBER);
            const body = process.env.COMMENT_BODY;
            const marker = '<!-- codecov-coverage-comment -->';
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              per_page: 100,
            });
            const existing = comments.find((comment) => comment.body?.includes(marker));
            if (existing) {
              core.info(`Updating existing coverage comment (id: ${existing.id}).`);
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              core.info('Creating new coverage comment.');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body,
              });
            }
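For illustration, the comment body assembled by the script above renders roughly like the following (all numbers invented; flag names follow the `backend-python-*` / `frontend-node-*` flags used in ci.yml):

<!-- codecov-coverage-comment -->
**Codecov Coverage**

- Head `770fb2d`: 84.12%
- Base: 83.90%
- Change: +0.22%

| Flag | Coverage | Change |
| --- | --- | --- |
| backend-python-3.11 | 86.01% | +0.15% |
| frontend-node-20 | 79.44% | +0.31% |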

View File

@@ -1805,23 +1805,3 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}

: Defaults to false.

## Remote OCR

#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}

: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
    Defaults to None, which disables remote OCR.

#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}

: The API key to use for the remote OCR engine.
    Defaults to None.

#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}

: The endpoint to use for the remote OCR engine. This is required for Azure AI.
    Defaults to None.
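All three variables had to be set together for the engine to pass the configuration check (see checks.py further down in this diff); an illustrative sketch with placeholder values, not part of the diff itself:

# Illustrative only - the key and resource name are placeholders
PAPERLESS_REMOTE_OCR_ENGINE=azureai
PAPERLESS_REMOTE_OCR_API_KEY=<your-azure-api-key>
PAPERLESS_REMOTE_OCR_ENDPOINT=https://<resource>.cognitiveservices.azure.com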

View File

@@ -25,10 +25,9 @@ physical documents into a searchable online archive so you can keep, well, _less
## Features

- **Organize and index** your scanned documents with tags, correspondents, types, and more.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- _New!_ Supports remote OCR with Azure AI (opt-in).
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.

View File

@@ -882,21 +882,6 @@ how regularly you intend to scan documents and use paperless.
    performed the task associated with the document, move it to the
    inbox.

## Remote OCR

!!! important

    This feature is disabled by default and will always remain strictly "opt-in".

Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.

Additionally, when using a commercial service with this feature, consider both potential costs as well as any associated file size
or page limitations (e.g. with a free tier).

## Architecture

Paperless-ngx consists of the following components:
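In deployment terms, the removed feature was driven entirely by environment variables on the application container; an illustrative docker-compose sketch (service name and values are placeholders, not part of this diff):

# docker-compose.override.yml - illustrative sketch only
services:
  webserver:
    environment:
      PAPERLESS_REMOTE_OCR_ENGINE: azureai
      PAPERLESS_REMOTE_OCR_API_KEY: <your-azure-api-key>
      PAPERLESS_REMOTE_OCR_ENDPOINT: https://<resource>.cognitiveservices.azure.com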

View File

@@ -15,7 +15,6 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
    "azure-ai-documentintelligence>=1.0.2",
    "babel>=2.17",
    "bleach~=6.2.0",
    "celery[redis]~=5.5.1",
@@ -234,7 +233,6 @@ testpaths = [
"src/paperless_tesseract/tests/", "src/paperless_tesseract/tests/",
"src/paperless_tika/tests", "src/paperless_tika/tests",
"src/paperless_text/tests/", "src/paperless_text/tests/",
"src/paperless_remote/tests/",
] ]
addopts = [ addopts = [
"--pythonwarnings=all", "--pythonwarnings=all",
@@ -257,7 +255,6 @@ PAPERLESS_DISABLE_DBHANDLER = "true"
PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache"
[tool.coverage.run]
relative_files = true
source = [
    "src/",
]

View File

@@ -1,24 +0,0 @@
sonar.projectKey=paperless-ngx_paperless-ngx
sonar.organization=paperless-ngx
sonar.projectName=Paperless-ngx
sonar.projectVersion=1.0
# Source and test directories
sonar.sources=src/,src-ui/
sonar.test.inclusions=**/test_*.py,**/tests.py,**/*.spec.ts,**/*.test.ts
# Language specific settings
sonar.python.version=3.10,3.11,3.12,3.13
# Coverage reports
sonar.python.coverage.reportPaths=merged-backend-coverage.xml
sonar.javascript.lcov.reportPaths=coverage/lcov.info
# Test execution reports
sonar.junit.reportPaths=**/junit.xml,**/test-results.xml
# Encoding
sonar.sourceEncoding=UTF-8
# Exclusions
sonar.exclusions=**/migrations/**,**/node_modules/**,**/static/**,**/venv/**,**/.venv/**,**/dist/**

View File

@@ -164,6 +164,9 @@ class BarcodePlugin(ConsumeTaskPlugin):
                mailrule_id=self.input_doc.mailrule_id,
                # Can't use same folder or the consume might grab it again
                original_file=(tmp_dir / new_document.name).resolve(),
                # Adding optional original_path for later use in
                # workflow matching
                original_path=self.input_doc.original_file,
            ),
            # All the same metadata
            self.metadata,

View File

@@ -156,6 +156,7 @@ class ConsumableDocument:
    source: DocumentSource
    original_file: Path
    original_path: Path | None = None
    mailrule_id: int | None = None
    mime_type: str = dataclasses.field(init=False, default=None)

View File

@@ -314,11 +314,19 @@ def consumable_document_matches_workflow(
    trigger_matched = False

    # Document path vs trigger path
    # Use the original_path if set, else use the original_file
    match_against = (
        document.original_path
        if document.original_path is not None
        else document.original_file
    )

    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
        and not fnmatch(
            document.original_file,
            match_against,
            trigger.filter_path,
        )
    ):
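The practical effect for barcode-split documents: the split pieces are consumed from a temporary directory, so a path-based workflow trigger would never match them; matching against the stored consume-folder path fixes that. A small illustrative sketch of the fallback (paths are hypothetical):

# Illustrative sketch, not part of the diff; paths are hypothetical
from fnmatch import fnmatch
from pathlib import Path

original_file = Path("/tmp/paperless-split/page-1.pdf")  # temp file created by the barcode split
original_path = Path("/consume/invoices/scan.pdf")       # where the source document was dropped

match_against = original_path if original_path is not None else original_file
print(fnmatch(str(match_against), "/consume/invoices/*"))  # True, thanks to original_path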

View File

@@ -614,14 +614,16 @@ class TestBarcodeNewConsume(
        self.assertIsNotFile(temp_copy)
        # Check the split files exist
        # Check the original_path is set
        # Check the source is unchanged
        # Check the overrides are unchanged
        for (
            new_input_doc,
            new_doc_overrides,
        ) in self.get_all_consume_delay_call_args():
            self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
            self.assertIsFile(new_input_doc.original_file)
            self.assertEqual(new_input_doc.original_path, temp_copy)
            self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
            self.assertEqual(overrides, new_doc_overrides)

View File

@@ -322,7 +322,6 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig", "paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig", "paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin", "django.contrib.admin",
"rest_framework", "rest_framework",
"rest_framework.authtoken", "rest_framework.authtoken",
@@ -1390,10 +1389,3 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS", "PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
"true", "true",
) )
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -1,4 +0,0 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured

__all__ = ["check_remote_parser_configured"]

View File

@@ -1,14 +0,0 @@
from django.apps import AppConfig

from paperless_remote.signals import remote_consumer_declaration


class PaperlessRemoteParserConfig(AppConfig):
    name = "paperless_remote"

    def ready(self):
        from documents.signals import document_consumer_declaration

        document_consumer_declaration.connect(remote_consumer_declaration)
        AppConfig.ready(self)

View File

@@ -1,17 +0,0 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register


@register()
def check_remote_parser_configured(app_configs, **kwargs):
    if settings.REMOTE_OCR_ENGINE == "azureai" and not (
        settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
    ):
        return [
            Error(
                "Azure AI remote parser requires endpoint and API key to be configured.",
            ),
        ]
    return []

View File

@@ -1,113 +0,0 @@
from pathlib import Path

from django.conf import settings

from paperless_tesseract.parsers import RasterisedDocumentParser


class RemoteEngineConfig:
    def __init__(
        self,
        engine: str,
        api_key: str | None = None,
        endpoint: str | None = None,
    ):
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self):
        valid = self.engine in ["azureai"] and self.api_key is not None
        if self.engine == "azureai":
            valid = valid and self.endpoint is not None
        return valid


class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
    as this is the only service that provides a remote OCR API with text-embedded PDF output.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Returns the configuration for the remote OCR engine, loaded from Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        if self.settings.engine_is_valid():
            return {
                "application/pdf": ".pdf",
                "image/png": ".png",
                "image/jpeg": ".jpg",
                "image/tiff": ".tiff",
                "image/bmp": ".bmp",
                "image/gif": ".gif",
                "image/webp": ".webp",
            }
        else:
            return {}

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Uses Azure AI Vision to parse the document and return the text content.
        It requests a searchable PDF output with embedded text.
        The PDF is saved to the archive_path attribute.
        Returns the text content extracted from the document.
        If the parsing fails, it returns None.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )
        with file.open("rb") as f:
            analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output_content_format=DocumentContentFormat.TEXT,
                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                content_type="application/json",
            )
        poller.wait()
        result_id = poller.details["operation_id"]
        result = poller.result()
        # Download the PDF with embedded text
        self.archive_path = self.tempdir / "archive.pdf"
        with self.archive_path.open("wb") as f:
            for chunk in client.get_analyze_result_pdf(
                model_id="prebuilt-read",
                result_id=result_id,
            ):
                f.write(chunk)
        client.close()
        return result.content

    def parse(self, document_path: Path, mime_type, file_name=None):
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
        elif self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -1,18 +0,0 @@
def get_parser(*args, **kwargs):
    from paperless_remote.parsers import RemoteDocumentParser

    return RemoteDocumentParser(*args, **kwargs)


def get_supported_mime_types():
    from paperless_remote.parsers import RemoteDocumentParser

    return RemoteDocumentParser(None).supported_mime_types()


def remote_consumer_declaration(sender, **kwargs):
    return {
        "parser": get_parser,
        "weight": 5,
        "mime_types": get_supported_mime_types(),
    }

View File

@@ -1,24 +0,0 @@
from unittest import TestCase

from django.test import override_settings

from paperless_remote import check_remote_parser_configured


class TestChecks(TestCase):
    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

    @override_settings(REMOTE_OCR_ENGINE="azureai")
    @override_settings(REMOTE_OCR_API_KEY="somekey")
    @override_settings(REMOTE_OCR_ENDPOINT=None)
    def test_azure_no_endpoint(self):
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 1)
        self.assertTrue(
            msgs[0].msg.startswith(
                "Azure AI remote parser requires endpoint and API key to be configured.",
            ),
        )

View File

@@ -1,101 +0,0 @@
import uuid
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import override_settings

from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser


class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content: str, strings: list[str]):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    @mock.patch("paperless_tesseract.parsers.run_subprocess")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client
        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller
        mock_poller.result.return_value.content = "This is a test document."
        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs):
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")

        mock_subprocess.side_effect = fake_run
        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )
            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="key",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_supported_mime_types_valid_config(self):
        parser = RemoteDocumentParser(uuid.uuid4())
        expected_types = {
            "application/pdf": ".pdf",
            "image/png": ".png",
            "image/jpeg": ".jpg",
            "image/tiff": ".tiff",
            "image/bmp": ".bmp",
            "image/gif": ".gif",
            "image/webp": ".webp",
        }
        self.assertEqual(parser.supported_mime_types(), expected_types)

    def test_supported_mime_types_invalid_config(self):
        parser = get_parser(uuid.uuid4())
        self.assertEqual(parser.supported_mime_types(), {})

    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_parse_with_invalid_config(self):
        parser = get_parser(uuid.uuid4())
        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
        self.assertEqual(parser.text, "")

uv.lock (generated)
View File

@@ -95,34 +95,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" }, { url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
] ]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]] [[package]]
name = "babel" name = "babel"
version = "2.17.0" version = "2.17.0"
@@ -1440,15 +1412,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" }, { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
] ]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]] [[package]]
name = "jinja2" name = "jinja2"
version = "3.1.6" version = "3.1.6"
@@ -2069,7 +2032,6 @@ name = "paperless-ngx"
version = "2.18.4" version = "2.18.4"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2207,7 +2169,6 @@ typing = [
[package.metadata]
requires-dist = [
    { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
    { name = "babel", specifier = ">=2.17" },
    { name = "bleach", specifier = "~=6.2.0" },
    { name = "celery", extras = ["redis"], specifier = "~=5.5.1" },