diff --git a/.github/scripts/cleanup-tags.py b/.github/scripts/cleanup-tags.py index 904f44346..3b65ec52d 100644 --- a/.github/scripts/cleanup-tags.py +++ b/.github/scripts/cleanup-tags.py @@ -1,4 +1,12 @@ #!/usr/bin/env python3 +""" +This script cleans up the untagged images of the main image. It checks for "feature-" +branches, correlates them to the images, and removes images which have no branch +related to them. + +After removing the image, it looks at untagged images, removing those which are +not pointed to by a manifest. +""" import json import logging import os @@ -8,6 +16,7 @@ from argparse import ArgumentParser from typing import Dict from typing import Final from typing import List +from typing import Optional from common import get_log_level from github import ContainerPackage @@ -26,7 +35,7 @@ class DockerManifest2: def __init__(self, data: Dict) -> None: self._data = data - # This is the sha256: digest string. Corresponds to Github API name + # This is the sha256: digest string. Corresponds to GitHub API name # if the package is an untagged package self.digest = self._data["digest"] platform_data_os = self._data["platform"]["os"] @@ -38,6 +47,236 @@ class DockerManifest2: self.platform = f"{platform_data_os}/{platform_arch}{platform_variant}" +class RegistryTagsCleaner: + def __init__( + self, + package_name: str, + repo_owner: str, + repo_name: str, + package_api: GithubContainerRegistryApi, + branch_api: Optional[GithubBranchApi], + ): + self.actually_delete = False + self.package_api = package_api + self.branch_api = branch_api + self.package_name = package_name + self.repo_owner = repo_owner + self.repo_name = repo_name + self.tags_to_delete: List[str] = [] + self.tags_to_keep: List[str] = [] + + # Get the information about all versions of the given package + # These are active, not deleted, the default returned from the API + self.all_package_versions = self.package_api.get_active_package_versions( + self.package_name, + ) + + # Get a mapping from a tag like "1.7.0" or "feature-xyz" to the ContainerPackage + # tagged with it. It makes certain lookups easy + self.all_pkgs_tags_to_version: Dict[str, ContainerPackage] = {} + for pkg in self.all_package_versions: + for tag in pkg.tags: + self.all_pkgs_tags_to_version[tag] = pkg + logger.info( + f"Located {len(self.all_package_versions)} versions of package {self.package_name}", + ) + + self.decide_what_tags_to_keep() + + def clean(self): + for tag_to_delete in self.tags_to_delete: + package_version_info = self.all_pkgs_tags_to_version[tag_to_delete] + + if self.actually_delete: + logger.info( + f"Deleting {tag_to_delete} (id {package_version_info.id})", + ) + self.package_api.delete_package_version( + package_version_info, + ) + + else: + logger.info( + f"Would delete {tag_to_delete} (id {package_version_info.id})", + ) + else: + logger.info("No tags to delete") + + def clean_untagged(self, is_manifest_image: bool): + def _clean_untagged_manifest(): + """ + + Handles the deletion of untagged images, but where the package is a manifest, ie a multi + arch image, which means some "untagged" images need to exist still. + + Ok, bear with me, these are annoying. + + Our images are multi-arch, so the manifest is more like a pointer to a sha256 digest. + These images are untagged, but pointed to, and so should not be removed (or every pull fails). + + So for each image getting kept, parse the manifest to find the digest(s) it points to. Then + remove those from the list of untagged images. The final result is the untagged, not pointed to + version which should be safe to remove. + + Example: + Tag: ghcr.io/paperless-ngx/paperless-ngx:1.7.1 refers to + amd64: sha256:b9ed4f8753bbf5146547671052d7e91f68cdfc9ef049d06690b2bc866fec2690 + armv7: sha256:81605222df4ba4605a2ba4893276e5d08c511231ead1d5da061410e1bbec05c3 + arm64: sha256:374cd68db40734b844705bfc38faae84cc4182371de4bebd533a9a365d5e8f3b + each of which appears as untagged image, but isn't really. + + So from the list of untagged packages, remove those digests. Once all tags which + are being kept are checked, the remaining untagged packages are actually untagged + with no referrals in a manifest to them. + """ + # Simplify the untagged data, mapping name (which is a digest) to the version + # At the moment, these are the images which APPEAR untagged. + untagged_versions = {} + for x in self.all_package_versions: + if x.untagged: + untagged_versions[x.name] = x + + skips = 0 + + # Parse manifests to locate digests pointed to + for tag in sorted(self.tags_to_keep): + full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}" + logger.info(f"Checking manifest for {full_name}") + try: + proc = subprocess.run( + [ + shutil.which("docker"), + "manifest", + "inspect", + full_name, + ], + capture_output=True, + ) + + manifest_list = json.loads(proc.stdout) + for manifest_data in manifest_list["manifests"]: + manifest = DockerManifest2(manifest_data) + + if manifest.digest in untagged_versions: + logger.info( + f"Skipping deletion of {manifest.digest}," + f" referred to by {full_name}" + f" for {manifest.platform}", + ) + del untagged_versions[manifest.digest] + skips += 1 + + except Exception as err: + self.actually_delete = False + logger.exception(err) + return + + logger.info( + f"Skipping deletion of {skips} packages referred to by a manifest", + ) + + # Delete the untagged and not pointed at packages + logger.info(f"Deleting untagged packages of {self.package_name}") + for to_delete_name in untagged_versions: + to_delete_version = untagged_versions[to_delete_name] + + if self.actually_delete: + logger.info( + f"Deleting id {to_delete_version.id} named {to_delete_version.name}", + ) + self.package_api.delete_package_version( + to_delete_version, + ) + else: + logger.info( + f"Would delete {to_delete_name} (id {to_delete_version.id})", + ) + + def _clean_untagged_non_manifest(): + # If the package is not a multi-arch manifest, images without tags are safe to delete. + # They are not referred to by anything. This will leave all with at least 1 tag + + for package in self.all_package_versions: + if package.untagged: + if self.actually_delete: + logger.info( + f"Deleting id {package.id} named {package.name}", + ) + self.package_api.delete_package_version( + package, + ) + else: + logger.info( + f"Would delete {package.name} (id {package.id})", + ) + else: + logger.info( + f"Not deleting tag {package.tags[0]} of package {self.package_name}", + ) + + logger.info("Beginning untagged image cleaning") + + if is_manifest_image: + _clean_untagged_manifest() + else: + _clean_untagged_non_manifest() + + def decide_what_tags_to_keep(self): + # By default, keep anything which is tagged + self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys())) + + +class MainImageTagsCleaner(RegistryTagsCleaner): + def decide_what_tags_to_keep(self): + + # Locate the feature branches + feature_branches = {} + for branch in self.branch_api.get_branches( + owner=self.repo_owner, + repo=self.repo_name, + ): + if branch.name.startswith("feature-"): + logger.debug(f"Found feature branch {branch.name}") + feature_branches[branch.name] = branch + + logger.info(f"Located {len(feature_branches)} feature branches") + + # Filter to packages which are tagged with feature-* + packages_tagged_feature: List[ContainerPackage] = [] + for package in self.all_package_versions: + if package.tag_matches("feature-"): + packages_tagged_feature.append(package) + + # Map tags like "feature-xyz" to a ContainerPackage + feature_pkgs_tags_to_versions: Dict[str, ContainerPackage] = {} + for pkg in packages_tagged_feature: + for tag in pkg.tags: + feature_pkgs_tags_to_versions[tag] = pkg + + logger.info( + f'Located {len(feature_pkgs_tags_to_versions)} versions of package {self.package_name} tagged "feature-"', + ) + + # All the feature tags minus all the feature branches leaves us feature tags + # with no corresponding branch + self.tags_to_delete = list( + set(feature_pkgs_tags_to_versions.keys()) - set(feature_branches.keys()), + ) + + # All the tags minus the set of going to be deleted tags leaves us the + # tags which will be kept around + self.tags_to_keep = list( + set(self.all_pkgs_tags_to_version.keys()) - set(self.tags_to_delete), + ) + logger.info( + f"Located {len(self.tags_to_delete)} versions of package {self.package_name} to delete", + ) + + +class LibraryTagsCleaner(RegistryTagsCleaner): + pass + + def _main(): parser = ArgumentParser( description="Using the GitHub API locate and optionally delete container" @@ -100,190 +339,29 @@ def _main(): # Note: Only relevant to the main application, but simpler to # leave in for all packages with GithubBranchApi(gh_token) as branch_api: - feature_branches = {} - for branch in branch_api.get_branches( - repo=repo, - ): - if branch.name.startswith("feature-"): - logger.debug(f"Found feature branch {branch.name}") - feature_branches[branch.name] = branch - - logger.info(f"Located {len(feature_branches)} feature branches") - - with GithubContainerRegistryApi(gh_token, repo_owner) as container_api: - # Get the information about all versions of the given package - all_package_versions: List[ - ContainerPackage - ] = container_api.get_package_versions(args.package) - - all_pkgs_tags_to_version: Dict[str, ContainerPackage] = {} - for pkg in all_package_versions: - for tag in pkg.tags: - all_pkgs_tags_to_version[tag] = pkg - logger.info( - f"Located {len(all_package_versions)} versions of package {args.package}", - ) - - # Filter to packages which are tagged with feature-* - packages_tagged_feature: List[ContainerPackage] = [] - for package in all_package_versions: - if package.tag_matches("feature-"): - packages_tagged_feature.append(package) - - feature_pkgs_tags_to_versions: Dict[str, ContainerPackage] = {} - for pkg in packages_tagged_feature: - for tag in pkg.tags: - feature_pkgs_tags_to_versions[tag] = pkg - - logger.info( - f'Located {len(feature_pkgs_tags_to_versions)} versions of package {args.package} tagged "feature-"', - ) - - # All the feature tags minus all the feature branches leaves us feature tags - # with no corresponding branch - tags_to_delete = list( - set(feature_pkgs_tags_to_versions.keys()) - set(feature_branches.keys()), - ) - - # All the tags minus the set of going to be deleted tags leaves us the - # tags which will be kept around - tags_to_keep = list( - set(all_pkgs_tags_to_version.keys()) - set(tags_to_delete), - ) - logger.info( - f"Located {len(tags_to_delete)} versions of package {args.package} to delete", - ) - - # Delete certain package versions for which no branch existed - for tag_to_delete in tags_to_delete: - package_version_info = feature_pkgs_tags_to_versions[tag_to_delete] - - if args.delete: - logger.info( - f"Deleting {tag_to_delete} (id {package_version_info.id})", + with GithubContainerRegistryApi(gh_token, repo_owner) as container_api: + if args.package in {"paperless-ngx", "paperless-ngx/builder/cache/app"}: + cleaner = MainImageTagsCleaner( + args.package, + repo_owner, + repo, + container_api, + branch_api, ) - container_api.delete_package_version( - package_version_info, - ) - else: - logger.info( - f"Would delete {tag_to_delete} (id {package_version_info.id})", + cleaner = LibraryTagsCleaner( + args.package, + repo_owner, + repo, + container_api, + None, ) - # Deal with untagged package versions - if args.untagged: + cleaner.actually_delete = args.delete - logger.info("Handling untagged image packages") + cleaner.clean() - if not args.is_manifest: - # If the package is not a multi-arch manifest, images without tags are safe to delete. - # They are not referred to by anything. This will leave all with at least 1 tag - - for package in all_package_versions: - if package.untagged: - if args.delete: - logger.info( - f"Deleting id {package.id} named {package.name}", - ) - container_api.delete_package_version( - package, - ) - else: - logger.info( - f"Would delete {package.name} (id {package.id})", - ) - else: - logger.info( - f"Not deleting tag {package.tags[0]} of package {args.package}", - ) - else: - - """ - Ok, bear with me, these are annoying. - - Our images are multi-arch, so the manifest is more like a pointer to a sha256 digest. - These images are untagged, but pointed to, and so should not be removed (or every pull fails). - - So for each image getting kept, parse the manifest to find the digest(s) it points to. Then - remove those from the list of untagged images. The final result is the untagged, not pointed to - version which should be safe to remove. - - Example: - Tag: ghcr.io/paperless-ngx/paperless-ngx:1.7.1 refers to - amd64: sha256:b9ed4f8753bbf5146547671052d7e91f68cdfc9ef049d06690b2bc866fec2690 - armv7: sha256:81605222df4ba4605a2ba4893276e5d08c511231ead1d5da061410e1bbec05c3 - arm64: sha256:374cd68db40734b844705bfc38faae84cc4182371de4bebd533a9a365d5e8f3b - each of which appears as untagged image, but isn't really. - - So from the list of untagged packages, remove those digests. Once all tags which - are being kept are checked, the remaining untagged packages are actually untagged - with no referrals in a manifest to them. - - """ - - # Simplify the untagged data, mapping name (which is a digest) to the version - untagged_versions = {} - for x in all_package_versions: - if x.untagged: - untagged_versions[x.name] = x - - skips = 0 - # Extra security to not delete on an unexpected error - actually_delete = True - - # Parse manifests to locate digests pointed to - for tag in sorted(tags_to_keep): - full_name = f"ghcr.io/{repo_owner}/{args.package}:{tag}" - logger.info(f"Checking manifest for {full_name}") - try: - proc = subprocess.run( - [ - shutil.which("docker"), - "manifest", - "inspect", - full_name, - ], - capture_output=True, - ) - - manifest_list = json.loads(proc.stdout) - for manifest_data in manifest_list["manifests"]: - manifest = DockerManifest2(manifest_data) - - if manifest.digest in untagged_versions: - logger.debug( - f"Skipping deletion of {manifest.digest}, referred to by {full_name} for {manifest.platform}", - ) - del untagged_versions[manifest.digest] - skips += 1 - - except Exception as err: - actually_delete = False - logger.exception(err) - - logger.info( - f"Skipping deletion of {skips} packages referred to by a manifest", - ) - - # Step 3.3 - Delete the untagged and not pointed at packages - logger.info(f"Deleting untagged packages of {args.package}") - for to_delete_name in untagged_versions: - to_delete_version = untagged_versions[to_delete_name] - - if args.delete and actually_delete: - logger.info( - f"Deleting id {to_delete_version.id} named {to_delete_version.name}", - ) - container_api.delete_package_version( - to_delete_version, - ) - else: - logger.info( - f"Would delete {to_delete_name} (id {to_delete_version.id})", - ) - else: - logger.info("Leaving untagged images untouched") + cleaner.clean_untagged(args.is_manifest) if __name__ == "__main__": diff --git a/.github/scripts/common.py b/.github/scripts/common.py index 1e130eae0..bccd4fbbd 100644 --- a/.github/scripts/common.py +++ b/.github/scripts/common.py @@ -29,6 +29,11 @@ def get_cache_image_tag( def get_log_level(args) -> int: + """ + Returns a logging level, based + :param args: + :return: + """ levels = { "critical": logging.CRITICAL, "error": logging.ERROR, diff --git a/.github/scripts/github.py b/.github/scripts/github.py index 4059f89d0..63f34a1e9 100644 --- a/.github/scripts/github.py +++ b/.github/scripts/github.py @@ -113,14 +113,14 @@ class GithubBranchApi(_GithubApiBase): def __init__(self, token: str) -> None: super().__init__(token) - self._ENDPOINT = "https://api.github.com/repos/{REPO}/branches" + self._ENDPOINT = "https://api.github.com/repos/{OWNER}/{REPO}/branches" - def get_branches(self, repo: str) -> List[GithubBranch]: + def get_branches(self, owner: str, repo: str) -> List[GithubBranch]: """ Returns all current branches of the given repository owned by the given owner or organization. """ - endpoint = self._ENDPOINT.format(REPO=repo) + endpoint = self._ENDPOINT.format(OWNER=owner, REPO=repo) internal_data = self._read_all_pages(endpoint) return [GithubBranch(branch) for branch in internal_data] @@ -189,8 +189,11 @@ class GithubContainerRegistryApi(_GithubApiBase): self._PACKAGES_VERSIONS_ENDPOINT = "https://api.github.com/user/packages/{PACKAGE_TYPE}/{PACKAGE_NAME}/versions" # https://docs.github.com/en/rest/packages#delete-a-package-version-for-the-authenticated-user self._PACKAGE_VERSION_DELETE_ENDPOINT = "https://api.github.com/user/packages/{PACKAGE_TYPE}/{PACKAGE_NAME}/versions/{PACKAGE_VERSION_ID}" + self._PACKAGE_VERSION_RESTORE_ENDPOINT = ( + f"{self._PACKAGE_VERSION_DELETE_ENDPOINT}/restore" + ) - def get_package_versions( + def get_active_package_versions( self, package_name: str, ) -> List[ContainerPackage]: @@ -216,6 +219,30 @@ class GithubContainerRegistryApi(_GithubApiBase): return pkgs + def get_deleted_package_versions( + self, + package_name: str, + ) -> List[ContainerPackage]: + package_type: str = "container" + # Need to quote this for slashes in the name + package_name = urllib.parse.quote(package_name, safe="") + + endpoint = ( + self._PACKAGES_VERSIONS_ENDPOINT.format( + ORG=self._owner_or_org, + PACKAGE_TYPE=package_type, + PACKAGE_NAME=package_name, + ) + + "?state=deleted" + ) + + pkgs = [] + + for data in self._read_all_pages(endpoint): + pkgs.append(ContainerPackage(data)) + + return pkgs + def delete_package_version(self, package_data: ContainerPackage): """ Deletes the given package version from the GHCR @@ -225,3 +252,22 @@ class GithubContainerRegistryApi(_GithubApiBase): logger.warning( f"Request to delete {package_data.url} returned HTTP {resp.status_code}", ) + + def restore_package_version( + self, + package_name: str, + package_data: ContainerPackage, + ): + package_type: str = "container" + endpoint = self._PACKAGE_VERSION_RESTORE_ENDPOINT.format( + ORG=self._owner_or_org, + PACKAGE_TYPE=package_type, + PACKAGE_NAME=package_name, + PACKAGE_VERSION_ID=package_data.id, + ) + + resp = self._session.post(endpoint) + if resp.status_code != 204: + logger.warning( + f"Request to delete {endpoint} returned HTTP {resp.status_code}", + ) diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 8c2ef87d9..993613ed1 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -15,7 +15,7 @@ on: push: paths: - ".github/workflows/cleanup-tags.yml" - - ".github/scripts/cleanup-tags.py" + - ".github/scripts/cleanup-images.py" - ".github/scripts/github.py" - ".github/scripts/common.py" @@ -24,9 +24,26 @@ concurrency: cancel-in-progress: false jobs: - cleanup: - name: Cleanup Image Tags - runs-on: ubuntu-20.04 + cleanup-images: + name: Cleanup Image Tags for ${{ matrix.primary-name }} + runs-on: ubuntu-latest + strategy: + matrix: + include: + - primary-name: "paperless-ngx" + cache-name: "paperless-ngx/builder/cache/app" + + - primary-name: "paperless-ngx/builder/qpdf" + cache-name: "paperless-ngx/builder/cache/qpdf" + + - primary-name: "paperless-ngx/builder/pikepdf" + cache-name: "paperless-ngx/builder/cache/pikepdf" + + - primary-name: "paperless-ngx/builder/jbig2enc" + cache-name: "paperless-ngx/builder/cache/jbig2enc" + + - primary-name: "paperless-ngx/builder/psycopg2" + cache-name: "paperless-ngx/builder/cache/psycopg2" env: # Requires a personal access token with the OAuth scope delete:packages TOKEN: ${{ secrets.GHA_CONTAINER_DELETE_TOKEN }} @@ -50,63 +67,29 @@ jobs: name: Install requests run: | python -m pip install requests - # Clean up primary packages - - - name: Cleanup for package "paperless-ngx" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx" - - - name: Cleanup for package "qpdf" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/qpdf" - - - name: Cleanup for package "pikepdf" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/pikepdf" - - - name: Cleanup for package "jbig2enc" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/jbig2enc" - - - name: Cleanup for package "psycopg2" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/psycopg2" # - # Clean up registry cache packages + # Clean up primary package # - - name: Cleanup for package "builder/cache/app" + name: Cleanup for package "${{ matrix.primary-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/app" + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-images.py --untagged --is-manifest "${{ matrix.primary-name }}" + # + # Clean up registry cache package + # - - name: Cleanup for package "builder/cache/qpdf" + name: Cleanup for package "${{ matrix.cache-name }}" if: "${{ env.TOKEN != '' }}" run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/qpdf" - - - name: Cleanup for package "builder/cache/psycopg2" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/psycopg2" - - - name: Cleanup for package "builder/cache/jbig2enc" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/jbig2enc" - - - name: Cleanup for package "builder/cache/pikepdf" - if: "${{ env.TOKEN != '' }}" - run: | - python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/pikepdf" + python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-images.py --untagged "${{ matrix.cache-name }}" + # + # Verify tags which are left still pull + # - name: Check all tags still pull run: | - ghcr_name=$(echo "${GITHUB_REPOSITORY}" | awk '{ print tolower($0) }') - echo "Pulling all tags of ghcr.io/${ghcr_name}" - docker pull --quiet --all-tags ghcr.io/${ghcr_name} + ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }') + echo "Pulling all tags of ${ghcr_name}" + docker pull --quiet --all-tags ${ghcr_name} + docker image list