From 6119c215e7d8782f4a8a4a1a889462ddd2297b63 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Mon, 22 Sep 2025 23:30:24 -0700 Subject: [PATCH 1/2] Fix: skip fuzzy matching for empty document content (#10914) --- .../commands/document_fuzzy_match.py | 3 +++ src/documents/tests/test_management_fuzzy.py | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py index 5eebeb172..4ecdf6d01 100644 --- a/src/documents/management/commands/document_fuzzy_match.py +++ b/src/documents/management/commands/document_fuzzy_match.py @@ -92,6 +92,9 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): # doc to doc is obviously not useful if first_doc.pk == second_doc.pk: continue + # Skip empty documents (e.g. password-protected) + if first_doc.content.strip() == "" or second_doc.content.strip() == "": + continue # Skip matching which have already been matched together # doc 1 to doc 2 is the same as doc 2 to doc 1 doc_1_to_doc_2 = (first_doc.pk, second_doc.pk) diff --git a/src/documents/tests/test_management_fuzzy.py b/src/documents/tests/test_management_fuzzy.py index 2d7d3735a..453a86082 100644 --- a/src/documents/tests/test_management_fuzzy.py +++ b/src/documents/tests/test_management_fuzzy.py @@ -206,3 +206,29 @@ class TestFuzzyMatchCommand(TestCase): self.assertEqual(Document.objects.count(), 2) self.assertIsNotNone(Document.objects.get(pk=1)) self.assertIsNotNone(Document.objects.get(pk=2)) + + def test_empty_content(self): + """ + GIVEN: + - 2 documents exist, content is empty (pw-protected) + WHEN: + - Command is called + THEN: + - No matches are found + """ + Document.objects.create( + checksum="BEEFCAFE", + title="A", + content="", + mime_type="application/pdf", + filename="test.pdf", + ) + Document.objects.create( + checksum="DEADBEAF", + title="A", + content="", + mime_type="application/pdf", + filename="other_test.pdf", + ) + stdout, _ = self.call_command() + self.assertIn("No matches found", stdout) From 53b393dab556d61a07aea0df281c69a0b178174b Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 24 Sep 2025 13:43:09 -0700 Subject: [PATCH 2/2] Chore: remove conditional from pre-commit job in CI (#10916) --- .github/workflows/ci.yml | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index edb6a5641..44596b4a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,11 +17,52 @@ env: DEFAULT_PYTHON_VERSION: "3.11" NLTK_DATA: "/usr/share/nltk_data" jobs: + detect-duplicate: + name: Detect Duplicate Run + runs-on: ubuntu-24.04 + outputs: + should_run: ${{ steps.check.outputs.should_run }} + steps: + - name: Check if workflow should run + id: check + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + if (context.eventName !== 'push') { + core.info('Not a push event; running workflow.'); + core.setOutput('should_run', 'true'); + return; + } + + const ref = context.ref || ''; + if (!ref.startsWith('refs/heads/')) { + core.info('Push is not to a branch; running workflow.'); + core.setOutput('should_run', 'true'); + return; + } + + const branch = ref.substring('refs/heads/'.length); + const { owner, repo } = context.repo; + const prs = await github.paginate(github.rest.pulls.list, { + owner, + repo, + state: 'open', + head: `${owner}:${branch}`, + per_page: 100, + }); + + if (prs.length === 0) { + core.info(`No open PR found for ${branch}; running workflow.`); + core.setOutput('should_run', 'true'); + } else { + core.info(`Found ${prs.length} open PR(s) for ${branch}; skipping duplicate push run.`); + core.setOutput('should_run', 'false'); + } pre-commit: - # We want to run on external PRs, but not on our own internal PRs as they'll be run - # by the push to the branch. Without this if check, checks are duplicated since - # internal PRs match both the push and pull_request events. - if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository + needs: + - detect-duplicate + if: needs.detect-duplicate.outputs.should_run == 'true' name: Linting Checks runs-on: ubuntu-24.04 steps: