Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-30 03:56:23 -05:00)
Compare commits

38 commits: fix-codeco ... 476556379b
| SHA1 |
|---|
| 476556379b |
| e5cafff043 |
| 8e0d574e99 |
| 8a5820328e |
| 809d62a2f4 |
| 0d87f94b9b |
| 315b90f8e5 |
| 47b2d2964b |
| e05639ae4e |
| f400a8cb2f |
| 26abcf5612 |
| afde52430d |
| 716f2da652 |
| c54073b7c2 |
| 247e6f39dc |
| 1e6dfc4481 |
| 7cc0750066 |
| bd6585d3b4 |
| 717e828a1d |
| 07381d48e6 |
| dd0ffaf312 |
| 264504affc |
| 4feedf2add |
| 2f76cf9831 |
| 1002d37f6b |
| d260a94740 |
| 88c69b83ea |
| 2557ee2014 |
| 3c75deed80 |
| d05343c927 |
| e7972b7eaf |
| 75a091cc0d |
| dca74803fd |
| 3cf3d868d0 |
| bf4fc6604a |
| e8c1eb86fa |
| c3dad3cf69 |
| 811bd66088 |
							
								
								
									
.github/workflows/ci.yml (vendored, 540 lines changed)
							| @@ -192,6 +192,18 @@ jobs: | ||||
|           token: ${{ secrets.CODECOV_TOKEN }} | ||||
|           flags: backend-python-${{ matrix.python-version }} | ||||
|           files: coverage.xml | ||||
|       - name: Upload coverage artifacts | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         if: always() | ||||
|         with: | ||||
|           name: backend-coverage-${{ matrix.python-version }} | ||||
|           path: | | ||||
|             .coverage | ||||
|             coverage.xml | ||||
|             junit.xml | ||||
|           retention-days: 1 | ||||
|           include-hidden-files: true | ||||
|           if-no-files-found: error | ||||
|       - name: Stop containers | ||||
|         if: always() | ||||
|         run: | | ||||
| @@ -274,6 +286,17 @@ jobs: | ||||
|           token: ${{ secrets.CODECOV_TOKEN }} | ||||
|           flags: frontend-node-${{ matrix.node-version }} | ||||
|           directory: src-ui/coverage/ | ||||
|       - name: Upload coverage artifacts | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         if: always() | ||||
|         with: | ||||
|           name: frontend-coverage-${{ matrix.shard-index }} | ||||
|           path: | | ||||
|             src-ui/coverage/lcov.info | ||||
|             src-ui/coverage/coverage-final.json | ||||
|             src-ui/junit.xml | ||||
|           retention-days: 1 | ||||
|           if-no-files-found: error | ||||
|   tests-frontend-e2e: | ||||
|     name: "Frontend E2E Tests (Node ${{ matrix.node-version }} - ${{ matrix.shard-index }}/${{ matrix.shard-count }})" | ||||
|     runs-on: ubuntu-24.04 | ||||
| @@ -322,455 +345,6 @@ jobs: | ||||
|         run: cd src-ui && pnpm exec playwright install | ||||
|       - name: Run Playwright e2e tests | ||||
|         run: cd src-ui && pnpm exec playwright test --shard ${{ matrix.shard-index }}/${{ matrix.shard-count }} | ||||
|   codecov-comment: | ||||
|     name: "Codecov PR Comment" | ||||
|     runs-on: ubuntu-24.04 | ||||
|     needs: | ||||
|       - tests-backend | ||||
|       - tests-frontend | ||||
|       - tests-frontend-e2e | ||||
|     if: github.event_name == 'pull_request' | ||||
|     permissions: | ||||
|       contents: read | ||||
|       pull-requests: write | ||||
|     steps: | ||||
|       - name: Gather pull request context | ||||
|         id: pr | ||||
|         uses: actions/github-script@v7 | ||||
|         with: | ||||
|           script: | | ||||
|             const pr = context.payload.pull_request; | ||||
|             if (!pr) { | ||||
|               core.info('No associated pull request. Skipping.'); | ||||
|               core.setOutput('shouldRun', 'false'); | ||||
|               return; | ||||
|             } | ||||
|  | ||||
|             core.setOutput('shouldRun', 'true'); | ||||
|             core.setOutput('prNumber', pr.number.toString()); | ||||
|             core.setOutput('headSha', pr.head.sha); | ||||
|       - name: Fetch Codecov coverage | ||||
|         id: coverage | ||||
|         if: steps.pr.outputs.shouldRun == 'true' | ||||
|         uses: actions/github-script@v7 | ||||
|         env: | ||||
|           COMMIT_SHA: ${{ steps.pr.outputs.headSha }} | ||||
|           PR_NUMBER: ${{ steps.pr.outputs.prNumber }} | ||||
|         with: | ||||
|           script: | | ||||
|             const commitSha = process.env.COMMIT_SHA; | ||||
|             const prNumber = process.env.PR_NUMBER; | ||||
|             const owner = context.repo.owner; | ||||
|             const repo = context.repo.repo; | ||||
|             const service = 'gh'; | ||||
|             const baseUrl = `https://api.codecov.io/api/v2/${service}/${owner}/repos/${repo}`; | ||||
|             const commitUrl = `${baseUrl}/commits/${commitSha}`; | ||||
|             const maxAttempts = 20; | ||||
|             const waitMs = 15000; | ||||
|             const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); | ||||
|  | ||||
|             let data; | ||||
|             for (let attempt = 1; attempt <= maxAttempts; attempt++) { | ||||
|               core.info(`Fetching Codecov report (attempt ${attempt}/${maxAttempts})`); | ||||
|               let response; | ||||
|               try { | ||||
|                 response = await fetch(commitUrl, { | ||||
|                   headers: { | ||||
|                     'Content-Type': 'application/json', | ||||
|                     Accept: 'application/json', | ||||
|                   }, | ||||
|                 }); | ||||
|               } catch (error) { | ||||
|                 core.warning(`Codecov fetch failed: ${error}. Waiting before retrying.`); | ||||
|                 await sleep(waitMs); | ||||
|                 continue; | ||||
|               } | ||||
|  | ||||
|               if (response.status === 404) { | ||||
|                 core.info('Report not ready yet (404). Waiting before retrying.'); | ||||
|                 await sleep(waitMs); | ||||
|                 continue; | ||||
|               } | ||||
|  | ||||
|               if ([429, 500, 502, 503, 504].includes(response.status)) { | ||||
|                 const text = await response.text().catch(() => ''); | ||||
|                 core.info(`Codecov API transient error ${response.status}: ${text}. Waiting before retrying.`); | ||||
|                 await sleep(waitMs); | ||||
|                 continue; | ||||
|               } | ||||
|  | ||||
|               if (!response.ok) { | ||||
|                 const text = await response.text().catch(() => ''); | ||||
|                 core.warning(`Codecov API returned ${response.status}: ${text}. Skipping comment.`); | ||||
|                 core.setOutput('shouldComment', 'false'); | ||||
|                 return; | ||||
|               } | ||||
|  | ||||
|               data = await response.json().catch((error) => { | ||||
|                 core.warning(`Failed to parse Codecov response: ${error}.`); | ||||
|                 return undefined; | ||||
|               }); | ||||
|               if (data && Object.keys(data).length > 0) { | ||||
|                 break; | ||||
|               } | ||||
|  | ||||
|               core.info('Report payload empty. Waiting before retrying.'); | ||||
|               await sleep(waitMs); | ||||
|             } | ||||
|  | ||||
|             if (!data && prNumber) { | ||||
|               core.info('Attempting to retrieve coverage from PR endpoint.'); | ||||
|               const prUrl = `${baseUrl}/pulls/${prNumber}`; | ||||
|               let prResponse; | ||||
|               try { | ||||
|                 prResponse = await fetch(prUrl, { | ||||
|                   headers: { | ||||
|                     'Content-Type': 'application/json', | ||||
|                     Accept: 'application/json', | ||||
|                   }, | ||||
|                 }); | ||||
|               } catch (error) { | ||||
|                 core.warning(`Codecov PR fetch failed: ${error}.`); | ||||
|               } | ||||
|  | ||||
|               if (prResponse) { | ||||
|                 if ([429, 500, 502, 503, 504].includes(prResponse.status)) { | ||||
|                   const text = await prResponse.text().catch(() => ''); | ||||
|                   core.info(`Codecov PR endpoint transient error ${prResponse.status}: ${text}.`); | ||||
|                 } else if (!prResponse.ok) { | ||||
|                   const text = await prResponse.text().catch(() => ''); | ||||
|                   core.warning(`Codecov PR endpoint returned ${prResponse.status}: ${text}.`); | ||||
|                 } else { | ||||
|                   const prData = await prResponse.json().catch((error) => { | ||||
|                     core.warning(`Failed to parse Codecov PR response: ${error}.`); | ||||
|                     return undefined; | ||||
|                   }); | ||||
|  | ||||
|                   if (prData?.latest_report) { | ||||
|                     data = { report: prData.latest_report }; | ||||
|                   } else if (prData?.head_totals) { | ||||
|                     const headTotals = prData.head_totals; | ||||
|                     const baseTotals = prData.base_totals; | ||||
|                     let compareTotals; | ||||
|                     if (baseTotals && headTotals) { | ||||
|                       const headCoverage = Number(headTotals.coverage); | ||||
|                       const baseCoverage = Number(baseTotals.coverage); | ||||
|                       if (Number.isFinite(headCoverage) && Number.isFinite(baseCoverage)) { | ||||
|                         compareTotals = { | ||||
|                           base_coverage: baseCoverage, | ||||
|                           coverage_change: headCoverage - baseCoverage, | ||||
|                         }; | ||||
|                       } | ||||
|                     } | ||||
|  | ||||
|                     data = { | ||||
|                       report: { | ||||
|                         totals: headTotals, | ||||
|                         compare: compareTotals ? { totals: compareTotals } : undefined, | ||||
|                         totals_by_flag: [], | ||||
|                       }, | ||||
|                       head_totals: headTotals, | ||||
|                       base_totals: baseTotals, | ||||
|                     }; | ||||
|                   } else { | ||||
|                     data = prData; | ||||
|                   } | ||||
|                 } | ||||
|               } | ||||
|             } | ||||
|  | ||||
|             if (!data) { | ||||
|               core.warning('Unable to retrieve Codecov report after multiple attempts.'); | ||||
|               core.setOutput('shouldComment', 'false'); | ||||
|               return; | ||||
|             } | ||||
|  | ||||
|             const toNumber = (value) => { | ||||
|               if (value === null || value === undefined || value === '') { | ||||
|                 return undefined; | ||||
|               } | ||||
|               const num = Number(value); | ||||
|               return Number.isFinite(num) ? num : undefined; | ||||
|             }; | ||||
|  | ||||
|             const reportData = data.report || data; | ||||
|             const totals = reportData.totals ?? data.head_totals ?? data.totals; | ||||
|             if (!totals) { | ||||
|               core.warning('Codecov response does not contain coverage totals.'); | ||||
|               core.setOutput('shouldComment', 'false'); | ||||
|               return; | ||||
|             } | ||||
|  | ||||
|             let compareTotals = reportData.compare?.totals ?? data.compare?.totals; | ||||
|             if (!compareTotals && data.base_totals) { | ||||
|               const baseCoverageValue = toNumber(data.base_totals.coverage); | ||||
|               if (baseCoverageValue !== undefined) { | ||||
|                 const headCoverageValue = toNumber((data.head_totals ?? {}).coverage); | ||||
|                 compareTotals = { | ||||
|                   base_coverage: baseCoverageValue, | ||||
|                   coverage_change: | ||||
|                     headCoverageValue !== undefined ? headCoverageValue - baseCoverageValue : undefined, | ||||
|                 }; | ||||
|               } | ||||
|             } | ||||
|  | ||||
|             const coverage = toNumber(totals.coverage); | ||||
|             const baseCoverage = toNumber(compareTotals?.base_coverage ?? compareTotals?.base); | ||||
|             let delta = toNumber( | ||||
|               compareTotals?.coverage_change ?? | ||||
|               compareTotals?.coverage_diff ?? | ||||
|               totals.delta ?? | ||||
|               totals.diff ?? | ||||
|               totals.change, | ||||
|             ); | ||||
|  | ||||
|             if (delta === undefined && coverage !== undefined && baseCoverage !== undefined) { | ||||
|               delta = coverage - baseCoverage; | ||||
|             } | ||||
|  | ||||
|             const formatPercent = (value) => { | ||||
|               if (value === undefined) return '—'; | ||||
|               return `${value.toFixed(2)}%`; | ||||
|             }; | ||||
|  | ||||
|             const formatDelta = (value) => { | ||||
|               if (value === undefined) return '—'; | ||||
|               const sign = value >= 0 ? '+' : ''; | ||||
|               return `${sign}${value.toFixed(2)}%`; | ||||
|             }; | ||||
|  | ||||
|             const shortSha = commitSha.slice(0, 7); | ||||
|             const reportBaseUrl = `https://app.codecov.io/gh/${owner}/${repo}`; | ||||
|             const commitReportUrl = `${reportBaseUrl}/commit/${commitSha}?src=pr&el=comment`; | ||||
|             const prReportUrl = prNumber | ||||
|               ? `${reportBaseUrl}/pull/${prNumber}?src=pr&el=comment` | ||||
|               : commitReportUrl; | ||||
|  | ||||
|             const findBaseCommitSha = () => | ||||
|               data?.report?.compare?.base_commitid ?? | ||||
|               data?.report?.compare?.base?.commitid ?? | ||||
|               data?.report?.base_commitid ?? | ||||
|               data?.compare?.base_commitid ?? | ||||
|               data?.compare?.base?.commitid ?? | ||||
|               data?.base_commitid ?? | ||||
|               data?.base?.commitid; | ||||
|  | ||||
|             const baseCommitSha = findBaseCommitSha(); | ||||
|             const baseCommitUrl = baseCommitSha | ||||
|               ? `${reportBaseUrl}/commit/${baseCommitSha}?src=pr&el=comment` | ||||
|               : undefined; | ||||
|             const baseShortSha = baseCommitSha ? baseCommitSha.slice(0, 7) : undefined; | ||||
|  | ||||
|             const lines = ['<!-- codecov-coverage-comment -->']; | ||||
|             lines.push(`## [Codecov](${prReportUrl}) Report`); | ||||
|             lines.push(''); | ||||
|  | ||||
|             if (coverage !== undefined) { | ||||
|               lines.push(`:white_check_mark: Project coverage for \`${shortSha}\` is ${formatPercent(coverage)}.`); | ||||
|             } else { | ||||
|               lines.push(':warning: Coverage for the head commit is unavailable.'); | ||||
|             } | ||||
|  | ||||
|             if (baseCoverage !== undefined) { | ||||
|               const changeEmoji = delta === undefined ? ':grey_question:' : delta >= 0 ? ':white_check_mark:' : ':small_red_triangle_down:'; | ||||
|               const baseCoverageText = `Base${baseShortSha ? ` \`${baseShortSha}\`` : ''} ${formatPercent(baseCoverage)}`; | ||||
|               const baseLink = baseCommitUrl ? `[${baseCoverageText}](${baseCommitUrl})` : baseCoverageText; | ||||
|               const changeText = | ||||
|                 delta !== undefined | ||||
|                   ? `${baseLink} (${formatDelta(delta)})` | ||||
|                   : `${baseLink} (change unknown)`; | ||||
|               lines.push(`${changeEmoji} ${changeText}.`); | ||||
|             } | ||||
|  | ||||
|             lines.push(`:clipboard: [View full report on Codecov](${commitReportUrl}).`); | ||||
|  | ||||
|             const normalizeTotals = (value) => { | ||||
|               if (!value) return undefined; | ||||
|               if (value.totals && typeof value.totals === 'object') return value.totals; | ||||
|               return value; | ||||
|             }; | ||||
|  | ||||
|             const headTotals = normalizeTotals(totals) ?? {}; | ||||
|             const baseTotals = | ||||
|               normalizeTotals(data.base_totals) ?? | ||||
|               normalizeTotals(reportData.base_totals) ?? | ||||
|               normalizeTotals(reportData.compare?.base_totals) ?? | ||||
|               normalizeTotals(reportData.compare?.base); | ||||
|  | ||||
|             const formatInteger = (value) => { | ||||
|               if (value === undefined) return '—'; | ||||
|               return value.toLocaleString('en-US'); | ||||
|             }; | ||||
|  | ||||
|             const formatIntegerDelta = (value) => { | ||||
|               if (value === undefined) return '—'; | ||||
|               const sign = value >= 0 ? '+' : ''; | ||||
|               return `${sign}${value.toLocaleString('en-US')}`; | ||||
|             }; | ||||
|  | ||||
|             const getInteger = (value) => { | ||||
|               const num = toNumber(value); | ||||
|               return Number.isFinite(num) ? Math.round(num) : undefined; | ||||
|             }; | ||||
|  | ||||
|             const metrics = []; | ||||
|             metrics.push({ | ||||
|               label: 'Coverage', | ||||
|               base: baseCoverage, | ||||
|               head: coverage, | ||||
|               diff: delta, | ||||
|               format: formatPercent, | ||||
|               formatDiff: formatDelta, | ||||
|             }); | ||||
|  | ||||
|             const pushIntegerMetric = (label, headValueRaw, baseValueRaw) => { | ||||
|               const headValue = getInteger(headValueRaw); | ||||
|               const baseValue = getInteger(baseValueRaw); | ||||
|               if (headValue === undefined && baseValue === undefined) { | ||||
|                 return; | ||||
|               } | ||||
|               const diff = headValue !== undefined && baseValue !== undefined ? headValue - baseValue : undefined; | ||||
|               metrics.push({ | ||||
|                 label, | ||||
|                 base: baseValue, | ||||
|                 head: headValue, | ||||
|                 diff, | ||||
|                 format: formatInteger, | ||||
|                 formatDiff: formatIntegerDelta, | ||||
|               }); | ||||
|             }; | ||||
|  | ||||
|             pushIntegerMetric('Files', headTotals.files, baseTotals?.files); | ||||
|             pushIntegerMetric('Lines', headTotals.lines, baseTotals?.lines); | ||||
|             pushIntegerMetric('Branches', headTotals.branches, baseTotals?.branches); | ||||
|             pushIntegerMetric('Hits', headTotals.hits, baseTotals?.hits); | ||||
|             pushIntegerMetric('Misses', headTotals.misses, baseTotals?.misses); | ||||
|  | ||||
|             const hasMetricData = metrics.some((metric) => metric.base !== undefined || metric.head !== undefined); | ||||
|             if (hasMetricData) { | ||||
|               lines.push(''); | ||||
|               lines.push('<details><summary>Coverage summary</summary>'); | ||||
|               lines.push(''); | ||||
|               lines.push('| Metric | Base | Head | Δ |'); | ||||
|               lines.push('| --- | --- | --- | --- |'); | ||||
|               for (const metric of metrics) { | ||||
|                 const baseValue = metric.base !== undefined ? metric.format(metric.base) : '—'; | ||||
|                 const headValue = metric.head !== undefined ? metric.format(metric.head) : '—'; | ||||
|                 const diffValue = metric.diff !== undefined ? metric.formatDiff(metric.diff) : '—'; | ||||
|                 lines.push(`| ${metric.label} | ${baseValue} | ${headValue} | ${diffValue} |`); | ||||
|               } | ||||
|               lines.push(''); | ||||
|               lines.push('</details>'); | ||||
|             } | ||||
|  | ||||
|             const normalizeEntries = (raw) => { | ||||
|               if (!raw) return []; | ||||
|               if (Array.isArray(raw)) return raw; | ||||
|               if (typeof raw === 'object') { | ||||
|                 return Object.entries(raw).map(([name, totals]) => ({ name, ...(typeof totals === 'object' ? totals : { coverage: totals }) })); | ||||
|               } | ||||
|               return []; | ||||
|             }; | ||||
|  | ||||
|             const buildTableRows = (entries) => { | ||||
|               const rows = []; | ||||
|               for (const entry of entries) { | ||||
|                 const label = entry.flag ?? entry.name ?? entry.component ?? entry.id; | ||||
|                 const entryTotals = entry.totals ?? entry; | ||||
|                 const entryCoverage = toNumber(entryTotals?.coverage); | ||||
|                 if (!label || entryCoverage === undefined) { | ||||
|                   continue; | ||||
|                 } | ||||
|                 const entryDelta = toNumber( | ||||
|                   entryTotals?.coverage_change ?? | ||||
|                   entryTotals?.coverage_diff ?? | ||||
|                   entryTotals?.delta ?? | ||||
|                   entryTotals?.diff ?? | ||||
|                   entryTotals?.change, | ||||
|                 ); | ||||
|                 const coverageText = entryCoverage !== undefined ? `\`${formatPercent(entryCoverage)}\`` : '—'; | ||||
|                 const deltaText = entryDelta !== undefined ? `\`${formatDelta(entryDelta)}\`` : '—'; | ||||
|                 rows.push(`| ${label} | ${coverageText} | ${deltaText} |`); | ||||
|               } | ||||
|               return rows; | ||||
|             }; | ||||
|  | ||||
|             const componentEntries = normalizeEntries(reportData.components ?? data.components); | ||||
|             const flagEntries = normalizeEntries(reportData.totals_by_flag ?? data.totals_by_flag); | ||||
|  | ||||
|             if (componentEntries.length) { | ||||
|               const componentsLink = prNumber | ||||
|                 ? `${reportBaseUrl}/pull/${prNumber}/components?src=pr&el=components` | ||||
|                 : `${commitReportUrl}`; | ||||
|               const componentRows = buildTableRows(componentEntries); | ||||
|               if (componentRows.length) { | ||||
|                 lines.push(''); | ||||
|                 lines.push(`[Components report](${componentsLink})`); | ||||
|                 lines.push(''); | ||||
|                 lines.push('| Component | Coverage | Δ |'); | ||||
|                 lines.push('| --- | --- | --- |'); | ||||
|                 lines.push(...componentRows); | ||||
|               } | ||||
|             } | ||||
|  | ||||
|             if (flagEntries.length) { | ||||
|               const flagsLink = prNumber | ||||
|                 ? `${reportBaseUrl}/pull/${prNumber}/flags?src=pr&el=flags` | ||||
|                 : `${commitReportUrl}`; | ||||
|               const flagRows = buildTableRows(flagEntries); | ||||
|               if (flagRows.length) { | ||||
|                 lines.push(''); | ||||
|                 lines.push(`[Flags report](${flagsLink})`); | ||||
|                 lines.push(''); | ||||
|                 lines.push('| Flag | Coverage | Δ |'); | ||||
|                 lines.push('| --- | --- | --- |'); | ||||
|                 lines.push(...flagRows); | ||||
|               } | ||||
|             } | ||||
|  | ||||
|             const commentBody = lines.join('\n'); | ||||
|             const shouldComment = coverage !== undefined; | ||||
|             core.setOutput('shouldComment', shouldComment ? 'true' : 'false'); | ||||
|             if (shouldComment) { | ||||
|               core.setOutput('commentBody', commentBody); | ||||
|             } | ||||
|       - name: Upsert coverage comment | ||||
|         if: steps.pr.outputs.shouldRun == 'true' && steps.coverage.outputs.shouldComment == 'true' | ||||
|         uses: actions/github-script@v7 | ||||
|         env: | ||||
|           PR_NUMBER: ${{ steps.pr.outputs.prNumber }} | ||||
|           COMMENT_BODY: ${{ steps.coverage.outputs.commentBody }} | ||||
|         with: | ||||
|           script: | | ||||
|             const prNumber = Number(process.env.PR_NUMBER); | ||||
|             const body = process.env.COMMENT_BODY; | ||||
|             const marker = '<!-- codecov-coverage-comment -->'; | ||||
|  | ||||
|             const { data: comments } = await github.rest.issues.listComments({ | ||||
|               owner: context.repo.owner, | ||||
|               repo: context.repo.repo, | ||||
|               issue_number: prNumber, | ||||
|               per_page: 100, | ||||
|             }); | ||||
|  | ||||
|             const existing = comments.find((comment) => comment.body?.includes(marker)); | ||||
|             if (existing) { | ||||
|               core.info(`Updating existing coverage comment (id: ${existing.id}).`); | ||||
|               await github.rest.issues.updateComment({ | ||||
|                 owner: context.repo.owner, | ||||
|                 repo: context.repo.repo, | ||||
|                 comment_id: existing.id, | ||||
|                 body, | ||||
|               }); | ||||
|             } else { | ||||
|               core.info('Creating new coverage comment.'); | ||||
|               await github.rest.issues.createComment({ | ||||
|                 owner: context.repo.owner, | ||||
|                 repo: context.repo.repo, | ||||
|                 issue_number: prNumber, | ||||
|                 body, | ||||
|               }); | ||||
|             } | ||||
|   frontend-bundle-analysis: | ||||
|     name: "Frontend Bundle Analysis" | ||||
|     runs-on: ubuntu-24.04 | ||||
| @@ -803,6 +377,74 @@ jobs: | ||||
|         env: | ||||
|           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} | ||||
|         run: cd src-ui && pnpm run build --configuration=production | ||||
|   sonarqube-analysis: | ||||
|     name: "SonarQube Analysis" | ||||
|     runs-on: ubuntu-24.04 | ||||
|     needs: | ||||
|       - tests-backend | ||||
|       - tests-frontend | ||||
|     if: github.repository_owner == 'paperless-ngx' | ||||
|     steps: | ||||
|       - name: Checkout | ||||
|         uses: actions/checkout@v5 | ||||
|         with: | ||||
|           fetch-depth: 0 | ||||
|       - name: Download all backend coverage | ||||
|         uses: actions/download-artifact@v5.0.0 | ||||
|         with: | ||||
|           pattern: backend-coverage-* | ||||
|           path: ./coverage/ | ||||
|       - name: Download all frontend coverage | ||||
|         uses: actions/download-artifact@v5.0.0 | ||||
|         with: | ||||
|           pattern: frontend-coverage-* | ||||
|           path: ./coverage/ | ||||
|       - name: Set up Python | ||||
|         uses: actions/setup-python@v5 | ||||
|         with: | ||||
|           python-version: ${{ env.DEFAULT_PYTHON_VERSION }} | ||||
|       - name: Install coverage tools | ||||
|         run: | | ||||
|           pip install coverage | ||||
|           npm install -g nyc | ||||
|       # Merge backend coverage from all Python versions | ||||
|       - name: Merge backend coverage | ||||
|         run: | | ||||
|           coverage combine coverage/backend-coverage-*/.coverage | ||||
|           coverage xml -o merged-backend-coverage.xml | ||||
|       # Merge frontend coverage from all shards | ||||
|       - name: Merge frontend coverage | ||||
|         run: | | ||||
|           # Find all coverage-final.json files from the shards, exit with error if none found | ||||
|           shopt -s nullglob | ||||
|           files=(coverage/frontend-coverage-*/coverage/coverage-final.json) | ||||
|           if [ ${#files[@]} -eq 0 ]; then | ||||
|             echo "No frontend coverage JSON found under coverage/" >&2 | ||||
|             exit 1 | ||||
|           fi | ||||
|           # Create .nyc_output directory and copy each shard's coverage JSON into it with a unique name | ||||
|           mkdir -p .nyc_output | ||||
|           for coverage_json in "${files[@]}"; do | ||||
|             shard=$(basename "$(dirname "$(dirname "$coverage_json")")") | ||||
|             cp "$coverage_json" ".nyc_output/${shard}.json" | ||||
|           done | ||||
|           npx nyc merge .nyc_output .nyc_output/out.json | ||||
|           npx nyc report --reporter=lcovonly --report-dir coverage | ||||
|       - name: Upload coverage artifacts | ||||
|         uses: actions/upload-artifact@v4.6.2 | ||||
|         with: | ||||
|           name: merged-coverage | ||||
|           path: | | ||||
|             merged-backend-coverage.xml | ||||
|             .nyc_output/* | ||||
|             coverage/lcov.info | ||||
|           retention-days: 7 | ||||
|           if-no-files-found: error | ||||
|           include-hidden-files: true | ||||
|       - name: SonarQube Analysis | ||||
|         uses: SonarSource/sonarqube-scan-action@v5 | ||||
|         env: | ||||
|           SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} | ||||
|   build-docker-image: | ||||
|     name: Build Docker image for ${{ github.ref_name }} | ||||
|     runs-on: ubuntu-24.04 | ||||
|   | ||||
| @@ -1805,3 +1805,23 @@ password. All of these options come from their similarly-named [Django settings] | ||||
| #### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL} | ||||
|  | ||||
| : Defaults to false. | ||||
|  | ||||
| ## Remote OCR | ||||
|  | ||||
| #### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE} | ||||
|  | ||||
| : The remote OCR engine to use. Currently, only Azure AI Document Intelligence is supported; set this to "azureai". | ||||
|  | ||||
|     Defaults to None, which disables remote OCR. | ||||
|  | ||||
| #### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY} | ||||
|  | ||||
| : The API key to use for the remote OCR engine. | ||||
|  | ||||
|     Defaults to None. | ||||
|  | ||||
| #### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT} | ||||
|  | ||||
| : The endpoint to use for the remote OCR engine. This is required for Azure AI. | ||||
|  | ||||
|     Defaults to None. | ||||
|   | ||||
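For reference, a minimal configuration enabling the new engine could look like the following (the endpoint host is a placeholder; use the value from your own Azure resource):

```
PAPERLESS_REMOTE_OCR_ENGINE=azureai
PAPERLESS_REMOTE_OCR_API_KEY=<your-azure-api-key>
PAPERLESS_REMOTE_OCR_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com
```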
| @@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less | ||||
| ## Features | ||||
|  | ||||
| -   **Organize and index** your scanned documents with tags, correspondents, types, and more. | ||||
| -   _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way. | ||||
| -   _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so. | ||||
| -   Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images. | ||||
| -   Utilizes the open-source Tesseract engine to recognize more than 100 languages. | ||||
|     -   Utilizes the open-source Tesseract engine to recognize more than 100 languages. | ||||
|     -   _New!_ Supports remote OCR with Azure AI (opt-in). | ||||
| -   Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals. | ||||
| -   Uses machine-learning to automatically add tags, correspondents and document types to your documents. | ||||
| -   Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more. | ||||
|   | ||||
| @@ -882,6 +882,21 @@ how regularly you intend to scan documents and use paperless. | ||||
|     performed the task associated with the document, move it to the | ||||
|     inbox. | ||||
|  | ||||
| ## Remote OCR | ||||
|  | ||||
| !!! important | ||||
|  | ||||
|     This feature is disabled by default and will always remain strictly "opt-in". | ||||
|  | ||||
| Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to | ||||
| [Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence). | ||||
| This is a paid service (with a free tier) that requires an Azure account and subscription. Azure AI is not affiliated with | ||||
| Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing | ||||
| the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details. | ||||
|  | ||||
| Additionally, when using a commercial service with this feature, consider both the potential costs and any associated file size | ||||
| or page limitations (e.g. with a free tier). | ||||
|  | ||||
| ## Architecture | ||||
|  | ||||
| Paperless-ngx consists of the following components: | ||||
|   | ||||
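Condensed from the `RemoteDocumentParser` added later in this diff (`src/paperless_remote/parsers.py`), the Azure round trip amounts to roughly the following sketch; the function name and output path here are illustrative, not part of the change:

```python
from pathlib import Path

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.core.credentials import AzureKeyCredential


def remote_ocr(file: Path, endpoint: str, api_key: str, out_pdf: Path) -> str:
    """Analyze a document with Azure's prebuilt "read" model, saving a searchable PDF."""
    client = DocumentIntelligenceClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )
    poller = client.begin_analyze_document(
        model_id="prebuilt-read",
        body=AnalyzeDocumentRequest(bytes_source=file.read_bytes()),
        output=[AnalyzeOutputOption.PDF],  # also produce a text-embedded PDF
    )
    poller.wait()
    result = poller.result()
    # The searchable PDF is fetched in a second call, keyed by the operation id.
    result_id = poller.details["operation_id"]
    with out_pdf.open("wb") as f:
        for chunk in client.get_analyze_result_pdf(
            model_id="prebuilt-read",
            result_id=result_id,
        ):
            f.write(chunk)
    client.close()
    return result.content
```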
| @@ -15,6 +15,7 @@ classifiers = [ | ||||
| # This will allow testing to not install a webserver, mysql, etc | ||||
|  | ||||
| dependencies = [ | ||||
|   "azure-ai-documentintelligence>=1.0.2", | ||||
|   "babel>=2.17", | ||||
|   "bleach~=6.2.0", | ||||
|   "celery[redis]~=5.5.1", | ||||
| @@ -233,6 +234,7 @@ testpaths = [ | ||||
|   "src/paperless_tesseract/tests/", | ||||
|   "src/paperless_tika/tests", | ||||
|   "src/paperless_text/tests/", | ||||
|   "src/paperless_remote/tests/", | ||||
| ] | ||||
| addopts = [ | ||||
|   "--pythonwarnings=all", | ||||
| @@ -255,6 +257,7 @@ PAPERLESS_DISABLE_DBHANDLER = "true" | ||||
| PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache" | ||||
|  | ||||
| [tool.coverage.run] | ||||
| relative_files = true | ||||
| source = [ | ||||
|   "src/", | ||||
| ] | ||||
|   | ||||
							
								
								
									
sonar-project.properties (new file, 24 lines)
							| @@ -0,0 +1,24 @@ | ||||
| sonar.projectKey=paperless-ngx_paperless-ngx | ||||
| sonar.organization=paperless-ngx | ||||
| sonar.projectName=Paperless-ngx | ||||
| sonar.projectVersion=1.0 | ||||
|  | ||||
| # Source and test directories | ||||
| sonar.sources=src/,src-ui/ | ||||
| sonar.test.inclusions=**/test_*.py,**/tests.py,**/*.spec.ts,**/*.test.ts | ||||
|  | ||||
| # Language specific settings | ||||
| sonar.python.version=3.10,3.11,3.12,3.13 | ||||
|  | ||||
| # Coverage reports | ||||
| sonar.python.coverage.reportPaths=merged-backend-coverage.xml | ||||
| sonar.javascript.lcov.reportPaths=coverage/lcov.info | ||||
|  | ||||
| # Test execution reports | ||||
| sonar.junit.reportPaths=**/junit.xml,**/test-results.xml | ||||
|  | ||||
| # Encoding | ||||
| sonar.sourceEncoding=UTF-8 | ||||
|  | ||||
| # Exclusions | ||||
| sonar.exclusions=**/migrations/**,**/node_modules/**,**/static/**,**/venv/**,**/.venv/**,**/dist/** | ||||
| @@ -177,16 +177,10 @@ export class CustomFieldEditDialogComponent | ||||
|   } | ||||
|  | ||||
|   public removeSelectOption(index: number) { | ||||
|     const globalIndex = | ||||
|       index + (this.selectOptionsPage - 1) * SELECT_OPTION_PAGE_SIZE | ||||
|     this._allSelectOptions.splice(globalIndex, 1) | ||||
|  | ||||
|     const totalPages = Math.max( | ||||
|       1, | ||||
|       Math.ceil(this._allSelectOptions.length / SELECT_OPTION_PAGE_SIZE) | ||||
|     ) | ||||
|     const targetPage = Math.min(this.selectOptionsPage, totalPages) | ||||
|  | ||||
|     this.selectOptionsPage = targetPage | ||||
|     this.selectOptions.removeAt(index) | ||||
|     this._allSelectOptions.splice( | ||||
|       index + (this.selectOptionsPage - 1) * SELECT_OPTION_PAGE_SIZE, | ||||
|       1 | ||||
|     ) | ||||
|   } | ||||
| } | ||||
|   | ||||
| @@ -164,9 +164,6 @@ class BarcodePlugin(ConsumeTaskPlugin): | ||||
|                         mailrule_id=self.input_doc.mailrule_id, | ||||
|                         # Can't use same folder or the consume might grab it again | ||||
|                         original_file=(tmp_dir / new_document.name).resolve(), | ||||
|                         # Adding optional original_path for later uses in | ||||
|                         # workflow matching | ||||
|                         original_path=self.input_doc.original_file, | ||||
|                     ), | ||||
|                     # All the same metadata | ||||
|                     self.metadata, | ||||
|   | ||||
| @@ -156,7 +156,6 @@ class ConsumableDocument: | ||||
|  | ||||
|     source: DocumentSource | ||||
|     original_file: Path | ||||
|     original_path: Path | None = None | ||||
|     mailrule_id: int | None = None | ||||
|     mime_type: str = dataclasses.field(init=False, default=None) | ||||
|  | ||||
|   | ||||
| @@ -314,19 +314,11 @@ def consumable_document_matches_workflow( | ||||
|         trigger_matched = False | ||||
|  | ||||
|     # Document path vs trigger path | ||||
|  | ||||
|     # Use the original_path if set, else us the original_file | ||||
|     match_against = ( | ||||
|         document.original_path | ||||
|         if document.original_path is not None | ||||
|         else document.original_file | ||||
|     ) | ||||
|  | ||||
|     if ( | ||||
|         trigger.filter_path is not None | ||||
|         and len(trigger.filter_path) > 0 | ||||
|         and not fnmatch( | ||||
|             match_against, | ||||
|             document.original_file, | ||||
|             trigger.filter_path, | ||||
|         ) | ||||
|     ): | ||||
|   | ||||
| @@ -614,16 +614,14 @@ class TestBarcodeNewConsume( | ||||
|             self.assertIsNotFile(temp_copy) | ||||
|  | ||||
|             # Check the split files exist | ||||
|             # Check the original_path is set | ||||
|             # Check the source is unchanged | ||||
|             # Check the overrides are unchanged | ||||
|             for ( | ||||
|                 new_input_doc, | ||||
|                 new_doc_overrides, | ||||
|             ) in self.get_all_consume_delay_call_args(): | ||||
|                 self.assertIsFile(new_input_doc.original_file) | ||||
|                 self.assertEqual(new_input_doc.original_path, temp_copy) | ||||
|                 self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder) | ||||
|                 self.assertIsFile(new_input_doc.original_file) | ||||
|                 self.assertEqual(overrides, new_doc_overrides) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -322,6 +322,7 @@ INSTALLED_APPS = [ | ||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||
|     "paperless_text.apps.PaperlessTextConfig", | ||||
|     "paperless_mail.apps.PaperlessMailConfig", | ||||
|     "paperless_remote.apps.PaperlessRemoteParserConfig", | ||||
|     "django.contrib.admin", | ||||
|     "rest_framework", | ||||
|     "rest_framework.authtoken", | ||||
| @@ -1389,3 +1390,10 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean( | ||||
|     "PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS", | ||||
|     "true", | ||||
| ) | ||||
|  | ||||
| ############################################################################### | ||||
| # Remote Parser                                                               # | ||||
| ############################################################################### | ||||
| REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE") | ||||
| REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY") | ||||
| REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT") | ||||
|   | ||||
							
								
								
									
src/paperless_remote/__init__.py (new file, 4 lines)
							| @@ -0,0 +1,4 @@ | ||||
| # this is here so that django finds the checks. | ||||
| from paperless_remote.checks import check_remote_parser_configured | ||||
|  | ||||
| __all__ = ["check_remote_parser_configured"] | ||||
							
								
								
									
src/paperless_remote/apps.py (new file, 14 lines)
							| @@ -0,0 +1,14 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
| from paperless_remote.signals import remote_consumer_declaration | ||||
|  | ||||
|  | ||||
| class PaperlessRemoteParserConfig(AppConfig): | ||||
|     name = "paperless_remote" | ||||
|  | ||||
|     def ready(self): | ||||
|         from documents.signals import document_consumer_declaration | ||||
|  | ||||
|         document_consumer_declaration.connect(remote_consumer_declaration) | ||||
|  | ||||
|         AppConfig.ready(self) | ||||
							
								
								
									
src/paperless_remote/checks.py (new file, 17 lines)
							| @@ -0,0 +1,17 @@ | ||||
| from django.conf import settings | ||||
| from django.core.checks import Error | ||||
| from django.core.checks import register | ||||
|  | ||||
|  | ||||
| @register() | ||||
| def check_remote_parser_configured(app_configs, **kwargs): | ||||
|     if settings.REMOTE_OCR_ENGINE == "azureai" and not ( | ||||
|         settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY | ||||
|     ): | ||||
|         return [ | ||||
|             Error( | ||||
|                 "Azure AI remote parser requires endpoint and API key to be configured.", | ||||
|             ), | ||||
|         ] | ||||
|  | ||||
|     return [] | ||||
							
								
								
									
src/paperless_remote/parsers.py (new file, 113 lines)
							| @@ -0,0 +1,113 @@ | ||||
| from pathlib import Path | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class RemoteEngineConfig: | ||||
|     def __init__( | ||||
|         self, | ||||
|         engine: str, | ||||
|         api_key: str | None = None, | ||||
|         endpoint: str | None = None, | ||||
|     ): | ||||
|         self.engine = engine | ||||
|         self.api_key = api_key | ||||
|         self.endpoint = endpoint | ||||
|  | ||||
|     def engine_is_valid(self): | ||||
|         valid = self.engine in ["azureai"] and self.api_key is not None | ||||
|         if self.engine == "azureai": | ||||
|             valid = valid and self.endpoint is not None | ||||
|         return valid | ||||
|  | ||||
|  | ||||
| class RemoteDocumentParser(RasterisedDocumentParser): | ||||
|     """ | ||||
|     This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision | ||||
|     as this is the only service that provides a remote OCR API with text-embedded PDF output. | ||||
|     """ | ||||
|  | ||||
|     logging_name = "paperless.parsing.remote" | ||||
|  | ||||
|     def get_settings(self) -> RemoteEngineConfig: | ||||
|         """ | ||||
|         Returns the configuration for the remote OCR engine, loaded from Django settings. | ||||
|         """ | ||||
|         return RemoteEngineConfig( | ||||
|             engine=settings.REMOTE_OCR_ENGINE, | ||||
|             api_key=settings.REMOTE_OCR_API_KEY, | ||||
|             endpoint=settings.REMOTE_OCR_ENDPOINT, | ||||
|         ) | ||||
|  | ||||
|     def supported_mime_types(self): | ||||
|         if self.settings.engine_is_valid(): | ||||
|             return { | ||||
|                 "application/pdf": ".pdf", | ||||
|                 "image/png": ".png", | ||||
|                 "image/jpeg": ".jpg", | ||||
|                 "image/tiff": ".tiff", | ||||
|                 "image/bmp": ".bmp", | ||||
|                 "image/gif": ".gif", | ||||
|                 "image/webp": ".webp", | ||||
|             } | ||||
|         else: | ||||
|             return {} | ||||
|  | ||||
|     def azure_ai_vision_parse( | ||||
|         self, | ||||
|         file: Path, | ||||
|     ) -> str | None: | ||||
|         """ | ||||
|         Uses Azure AI Vision to parse the document and return the text content. | ||||
|         It requests a searchable PDF output with embedded text. | ||||
|         The PDF is saved to the archive_path attribute. | ||||
|         Returns the text content extracted from the document. | ||||
|         If the parsing fails, it returns None. | ||||
|         """ | ||||
|         from azure.ai.documentintelligence import DocumentIntelligenceClient | ||||
|         from azure.ai.documentintelligence.models import AnalyzeDocumentRequest | ||||
|         from azure.ai.documentintelligence.models import AnalyzeOutputOption | ||||
|         from azure.ai.documentintelligence.models import DocumentContentFormat | ||||
|         from azure.core.credentials import AzureKeyCredential | ||||
|  | ||||
|         client = DocumentIntelligenceClient( | ||||
|             endpoint=self.settings.endpoint, | ||||
|             credential=AzureKeyCredential(self.settings.api_key), | ||||
|         ) | ||||
|  | ||||
|         with file.open("rb") as f: | ||||
|             analyze_request = AnalyzeDocumentRequest(bytes_source=f.read()) | ||||
|             poller = client.begin_analyze_document( | ||||
|                 model_id="prebuilt-read", | ||||
|                 body=analyze_request, | ||||
|                 output_content_format=DocumentContentFormat.TEXT, | ||||
|                 output=[AnalyzeOutputOption.PDF],  # request searchable PDF output | ||||
|                 content_type="application/json", | ||||
|             ) | ||||
|  | ||||
|         poller.wait() | ||||
|         result_id = poller.details["operation_id"] | ||||
|         result = poller.result() | ||||
|  | ||||
|         # Download the PDF with embedded text | ||||
|         self.archive_path = self.tempdir / "archive.pdf" | ||||
|         with self.archive_path.open("wb") as f: | ||||
|             for chunk in client.get_analyze_result_pdf( | ||||
|                 model_id="prebuilt-read", | ||||
|                 result_id=result_id, | ||||
|             ): | ||||
|                 f.write(chunk) | ||||
|  | ||||
|         client.close() | ||||
|         return result.content | ||||
|  | ||||
|     def parse(self, document_path: Path, mime_type, file_name=None): | ||||
|         if not self.settings.engine_is_valid(): | ||||
|             self.log.warning( | ||||
|                 "No valid remote parser engine is configured, content will be empty.", | ||||
|             ) | ||||
|             self.text = "" | ||||
|         elif self.settings.engine == "azureai": | ||||
|             self.text = self.azure_ai_vision_parse(document_path) | ||||
							
								
								
									
src/paperless_remote/signals.py (new file, 18 lines)
							| @@ -0,0 +1,18 @@ | ||||
| def get_parser(*args, **kwargs): | ||||
|     from paperless_remote.parsers import RemoteDocumentParser | ||||
|  | ||||
|     return RemoteDocumentParser(*args, **kwargs) | ||||
|  | ||||
|  | ||||
| def get_supported_mime_types(): | ||||
|     from paperless_remote.parsers import RemoteDocumentParser | ||||
|  | ||||
|     return RemoteDocumentParser(None).supported_mime_types() | ||||
|  | ||||
|  | ||||
| def remote_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": get_parser, | ||||
|         "weight": 5, | ||||
|         "mime_types": get_supported_mime_types(), | ||||
|     } | ||||
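The `weight` field in this declaration matters because several parsers can claim the same mime type; paperless prefers the highest-weight candidate, so weight 5 lets the remote parser take precedence for the types it supports when it is configured. A minimal sketch of that selection logic (a hypothetical helper, not paperless's actual implementation):

```python
def pick_parser(declarations: list[dict], mime_type: str):
    # Keep only the parsers that claim this mime type, then prefer the one
    # with the highest weight (the remote parser declares weight 5, so it
    # outranks lower-weight parsers for the types it supports).
    candidates = [d for d in declarations if mime_type in d["mime_types"]]
    if not candidates:
        return None
    return max(candidates, key=lambda d: d["weight"])["parser"]
```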
							
								
								
									
src/paperless_remote/tests/__init__.py (new, empty file)
								
								
									
										
src/paperless_remote/tests/samples/simple-digital.pdf (new binary file, not shown)
							
								
								
									
src/paperless_remote/tests/test_checks.py (new file, 24 lines)
							| @@ -0,0 +1,24 @@ | ||||
| from unittest import TestCase | ||||
|  | ||||
| from django.test import override_settings | ||||
|  | ||||
| from paperless_remote import check_remote_parser_configured | ||||
|  | ||||
|  | ||||
| class TestChecks(TestCase): | ||||
|     @override_settings(REMOTE_OCR_ENGINE=None) | ||||
|     def test_no_engine(self): | ||||
|         msgs = check_remote_parser_configured(None) | ||||
|         self.assertEqual(len(msgs), 0) | ||||
|  | ||||
|     @override_settings(REMOTE_OCR_ENGINE="azureai") | ||||
|     @override_settings(REMOTE_OCR_API_KEY="somekey") | ||||
|     @override_settings(REMOTE_OCR_ENDPOINT=None) | ||||
|     def test_azure_no_endpoint(self): | ||||
|         msgs = check_remote_parser_configured(None) | ||||
|         self.assertEqual(len(msgs), 1) | ||||
|         self.assertTrue( | ||||
|             msgs[0].msg.startswith( | ||||
|                 "Azure AI remote parser requires endpoint and API key to be configured.", | ||||
|             ), | ||||
|         ) | ||||
							
								
								
									
src/paperless_remote/tests/test_parser.py (new file, 101 lines)
							| @@ -0,0 +1,101 @@ | ||||
| import uuid | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import override_settings | ||||
|  | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
| from documents.tests.utils import FileSystemAssertsMixin | ||||
| from paperless_remote.parsers import RemoteDocumentParser | ||||
| from paperless_remote.signals import get_parser | ||||
|  | ||||
|  | ||||
| class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): | ||||
|     SAMPLE_FILES = Path(__file__).resolve().parent / "samples" | ||||
|  | ||||
|     def assertContainsStrings(self, content: str, strings: list[str]): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [] | ||||
|         for s in strings: | ||||
|             if s in content: | ||||
|                 indices.append(content.index(s)) | ||||
|             else: | ||||
|                 self.fail(f"'{s}' is not in '{content}'") | ||||
|         self.assertListEqual(indices, sorted(indices)) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.run_subprocess") | ||||
|     @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient") | ||||
|     def test_get_text_with_azure(self, mock_client_cls, mock_subprocess): | ||||
|         # Arrange mock Azure client | ||||
|         mock_client = mock.Mock() | ||||
|         mock_client_cls.return_value = mock_client | ||||
|  | ||||
|         # Simulate poller result and its `.details` | ||||
|         mock_poller = mock.Mock() | ||||
|         mock_poller.wait.return_value = None | ||||
|         mock_poller.details = {"operation_id": "fake-op-id"} | ||||
|         mock_client.begin_analyze_document.return_value = mock_poller | ||||
|         mock_poller.result.return_value.content = "This is a test document." | ||||
|  | ||||
|         # Return dummy PDF bytes | ||||
|         mock_client.get_analyze_result_pdf.return_value = [ | ||||
|             b"%PDF-", | ||||
|             b"1.7 ", | ||||
|             b"FAKEPDF", | ||||
|         ] | ||||
|  | ||||
|         # Simulate pdftotext by writing dummy text to sidecar file | ||||
|         def fake_run(cmd, *args, **kwargs): | ||||
|             with Path(cmd[-1]).open("w", encoding="utf-8") as f: | ||||
|                 f.write("This is a test document.") | ||||
|  | ||||
|         mock_subprocess.side_effect = fake_run | ||||
|  | ||||
|         with override_settings( | ||||
|             REMOTE_OCR_ENGINE="azureai", | ||||
|             REMOTE_OCR_API_KEY="somekey", | ||||
|             REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com", | ||||
|         ): | ||||
|             parser = get_parser(uuid.uuid4()) | ||||
|             parser.parse( | ||||
|                 self.SAMPLE_FILES / "simple-digital.pdf", | ||||
|                 "application/pdf", | ||||
|             ) | ||||
|  | ||||
|             self.assertContainsStrings( | ||||
|                 parser.text.strip(), | ||||
|                 ["This is a test document."], | ||||
|             ) | ||||
|  | ||||
|     @override_settings( | ||||
|         REMOTE_OCR_ENGINE="azureai", | ||||
|         REMOTE_OCR_API_KEY="key", | ||||
|         REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com", | ||||
|     ) | ||||
|     def test_supported_mime_types_valid_config(self): | ||||
|         parser = RemoteDocumentParser(uuid.uuid4()) | ||||
|         expected_types = { | ||||
|             "application/pdf": ".pdf", | ||||
|             "image/png": ".png", | ||||
|             "image/jpeg": ".jpg", | ||||
|             "image/tiff": ".tiff", | ||||
|             "image/bmp": ".bmp", | ||||
|             "image/gif": ".gif", | ||||
|             "image/webp": ".webp", | ||||
|         } | ||||
|         self.assertEqual(parser.supported_mime_types(), expected_types) | ||||
|  | ||||
|     def test_supported_mime_types_invalid_config(self): | ||||
|         parser = get_parser(uuid.uuid4()) | ||||
|         self.assertEqual(parser.supported_mime_types(), {}) | ||||
|  | ||||
|     @override_settings( | ||||
|         REMOTE_OCR_ENGINE=None, | ||||
|         REMOTE_OCR_API_KEY=None, | ||||
|         REMOTE_OCR_ENDPOINT=None, | ||||
|     ) | ||||
|     def test_parse_with_invalid_config(self): | ||||
|         parser = get_parser(uuid.uuid4()) | ||||
|         parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf") | ||||
|         self.assertEqual(parser.text, "") | ||||
							
								
								
									
uv.lock (generated, 39 lines changed)
							| @@ -95,6 +95,34 @@ wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "azure-ai-documentintelligence" | ||||
| version = "1.0.2" | ||||
| source = { registry = "https://pypi.org/simple" } | ||||
| dependencies = [ | ||||
|     { name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
| ] | ||||
| sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 } | ||||
| wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "azure-core" | ||||
| version = "1.33.0" | ||||
| source = { registry = "https://pypi.org/simple" } | ||||
| dependencies = [ | ||||
|     { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
| ] | ||||
| sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 } | ||||
| wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "babel" | ||||
| version = "2.17.0" | ||||
| @@ -1412,6 +1440,15 @@ wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "isodate" | ||||
| version = "0.7.2" | ||||
| source = { registry = "https://pypi.org/simple" } | ||||
| sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 } | ||||
| wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 }, | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "jinja2" | ||||
| version = "3.1.6" | ||||
| @@ -2032,6 +2069,7 @@ name = "paperless-ngx" | ||||
| version = "2.18.4" | ||||
| source = { virtual = "." } | ||||
| dependencies = [ | ||||
|     { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
|     { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||
| @@ -2169,6 +2207,7 @@ typing = [ | ||||
|  | ||||
| [package.metadata] | ||||
| requires-dist = [ | ||||
|     { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" }, | ||||
|     { name = "babel", specifier = ">=2.17" }, | ||||
|     { name = "bleach", specifier = "~=6.2.0" }, | ||||
|     { name = "celery", extras = ["redis"], specifier = "~=5.5.1" }, | ||||
|   | ||||