Simplifies the image tag cleanup into classes, fixes the library images caring about feature branches

This commit is contained in:
Trenton H 2022-10-13 15:56:11 -07:00
parent 69913ae250
commit c3a62268c7
4 changed files with 348 additions and 236 deletions

View File

@ -1,4 +1,12 @@
#!/usr/bin/env python3
"""
This script cleans up the untagged images of the main image. It checks for "feature-"
branches, correlates them to the images, and removes images which have no branch
related to them.
After removing the image, it looks at untagged images, removing those which are
not pointed to by a manifest.
"""
import json
import logging
import os
@ -8,6 +16,7 @@ from argparse import ArgumentParser
from typing import Dict
from typing import Final
from typing import List
from typing import Optional
from common import get_log_level
from github import ContainerPackage
@ -26,7 +35,7 @@ class DockerManifest2:
def __init__(self, data: Dict) -> None:
self._data = data
# This is the sha256: digest string. Corresponds to Github API name
# This is the sha256: digest string. Corresponds to GitHub API name
# if the package is an untagged package
self.digest = self._data["digest"]
platform_data_os = self._data["platform"]["os"]
@ -38,6 +47,236 @@ class DockerManifest2:
self.platform = f"{platform_data_os}/{platform_arch}{platform_variant}"
class RegistryTagsCleaner:
def __init__(
self,
package_name: str,
repo_owner: str,
repo_name: str,
package_api: GithubContainerRegistryApi,
branch_api: Optional[GithubBranchApi],
):
self.actually_delete = False
self.package_api = package_api
self.branch_api = branch_api
self.package_name = package_name
self.repo_owner = repo_owner
self.repo_name = repo_name
self.tags_to_delete: List[str] = []
self.tags_to_keep: List[str] = []
# Get the information about all versions of the given package
# These are active, not deleted, the default returned from the API
self.all_package_versions = self.package_api.get_active_package_versions(
self.package_name,
)
# Get a mapping from a tag like "1.7.0" or "feature-xyz" to the ContainerPackage
# tagged with it. It makes certain lookups easy
self.all_pkgs_tags_to_version: Dict[str, ContainerPackage] = {}
for pkg in self.all_package_versions:
for tag in pkg.tags:
self.all_pkgs_tags_to_version[tag] = pkg
logger.info(
f"Located {len(self.all_package_versions)} versions of package {self.package_name}",
)
self.decide_what_tags_to_keep()
def clean(self):
for tag_to_delete in self.tags_to_delete:
package_version_info = self.all_pkgs_tags_to_version[tag_to_delete]
if self.actually_delete:
logger.info(
f"Deleting {tag_to_delete} (id {package_version_info.id})",
)
self.package_api.delete_package_version(
package_version_info,
)
else:
logger.info(
f"Would delete {tag_to_delete} (id {package_version_info.id})",
)
else:
logger.info("No tags to delete")
def clean_untagged(self, is_manifest_image: bool):
def _clean_untagged_manifest():
"""
Handles the deletion of untagged images, but where the package is a manifest, ie a multi
arch image, which means some "untagged" images need to exist still.
Ok, bear with me, these are annoying.
Our images are multi-arch, so the manifest is more like a pointer to a sha256 digest.
These images are untagged, but pointed to, and so should not be removed (or every pull fails).
So for each image getting kept, parse the manifest to find the digest(s) it points to. Then
remove those from the list of untagged images. The final result is the untagged, not pointed to
version which should be safe to remove.
Example:
Tag: ghcr.io/paperless-ngx/paperless-ngx:1.7.1 refers to
amd64: sha256:b9ed4f8753bbf5146547671052d7e91f68cdfc9ef049d06690b2bc866fec2690
armv7: sha256:81605222df4ba4605a2ba4893276e5d08c511231ead1d5da061410e1bbec05c3
arm64: sha256:374cd68db40734b844705bfc38faae84cc4182371de4bebd533a9a365d5e8f3b
each of which appears as untagged image, but isn't really.
So from the list of untagged packages, remove those digests. Once all tags which
are being kept are checked, the remaining untagged packages are actually untagged
with no referrals in a manifest to them.
"""
# Simplify the untagged data, mapping name (which is a digest) to the version
# At the moment, these are the images which APPEAR untagged.
untagged_versions = {}
for x in self.all_package_versions:
if x.untagged:
untagged_versions[x.name] = x
skips = 0
# Parse manifests to locate digests pointed to
for tag in sorted(self.tags_to_keep):
full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
logger.info(f"Checking manifest for {full_name}")
try:
proc = subprocess.run(
[
shutil.which("docker"),
"manifest",
"inspect",
full_name,
],
capture_output=True,
)
manifest_list = json.loads(proc.stdout)
for manifest_data in manifest_list["manifests"]:
manifest = DockerManifest2(manifest_data)
if manifest.digest in untagged_versions:
logger.info(
f"Skipping deletion of {manifest.digest},"
f" referred to by {full_name}"
f" for {manifest.platform}",
)
del untagged_versions[manifest.digest]
skips += 1
except Exception as err:
self.actually_delete = False
logger.exception(err)
return
logger.info(
f"Skipping deletion of {skips} packages referred to by a manifest",
)
# Delete the untagged and not pointed at packages
logger.info(f"Deleting untagged packages of {self.package_name}")
for to_delete_name in untagged_versions:
to_delete_version = untagged_versions[to_delete_name]
if self.actually_delete:
logger.info(
f"Deleting id {to_delete_version.id} named {to_delete_version.name}",
)
self.package_api.delete_package_version(
to_delete_version,
)
else:
logger.info(
f"Would delete {to_delete_name} (id {to_delete_version.id})",
)
def _clean_untagged_non_manifest():
# If the package is not a multi-arch manifest, images without tags are safe to delete.
# They are not referred to by anything. This will leave all with at least 1 tag
for package in self.all_package_versions:
if package.untagged:
if self.actually_delete:
logger.info(
f"Deleting id {package.id} named {package.name}",
)
self.package_api.delete_package_version(
package,
)
else:
logger.info(
f"Would delete {package.name} (id {package.id})",
)
else:
logger.info(
f"Not deleting tag {package.tags[0]} of package {self.package_name}",
)
logger.info("Beginning untagged image cleaning")
if is_manifest_image:
_clean_untagged_manifest()
else:
_clean_untagged_non_manifest()
def decide_what_tags_to_keep(self):
# By default, keep anything which is tagged
self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))
class MainImageTagsCleaner(RegistryTagsCleaner):
def decide_what_tags_to_keep(self):
# Locate the feature branches
feature_branches = {}
for branch in self.branch_api.get_branches(
owner=self.repo_owner,
repo=self.repo_name,
):
if branch.name.startswith("feature-"):
logger.debug(f"Found feature branch {branch.name}")
feature_branches[branch.name] = branch
logger.info(f"Located {len(feature_branches)} feature branches")
# Filter to packages which are tagged with feature-*
packages_tagged_feature: List[ContainerPackage] = []
for package in self.all_package_versions:
if package.tag_matches("feature-"):
packages_tagged_feature.append(package)
# Map tags like "feature-xyz" to a ContainerPackage
feature_pkgs_tags_to_versions: Dict[str, ContainerPackage] = {}
for pkg in packages_tagged_feature:
for tag in pkg.tags:
feature_pkgs_tags_to_versions[tag] = pkg
logger.info(
f'Located {len(feature_pkgs_tags_to_versions)} versions of package {self.package_name} tagged "feature-"',
)
# All the feature tags minus all the feature branches leaves us feature tags
# with no corresponding branch
self.tags_to_delete = list(
set(feature_pkgs_tags_to_versions.keys()) - set(feature_branches.keys()),
)
# All the tags minus the set of going to be deleted tags leaves us the
# tags which will be kept around
self.tags_to_keep = list(
set(self.all_pkgs_tags_to_version.keys()) - set(self.tags_to_delete),
)
logger.info(
f"Located {len(self.tags_to_delete)} versions of package {self.package_name} to delete",
)
class LibraryTagsCleaner(RegistryTagsCleaner):
pass
def _main():
parser = ArgumentParser(
description="Using the GitHub API locate and optionally delete container"
@ -100,190 +339,29 @@ def _main():
# Note: Only relevant to the main application, but simpler to
# leave in for all packages
with GithubBranchApi(gh_token) as branch_api:
feature_branches = {}
for branch in branch_api.get_branches(
repo=repo,
):
if branch.name.startswith("feature-"):
logger.debug(f"Found feature branch {branch.name}")
feature_branches[branch.name] = branch
logger.info(f"Located {len(feature_branches)} feature branches")
with GithubContainerRegistryApi(gh_token, repo_owner) as container_api:
# Get the information about all versions of the given package
all_package_versions: List[
ContainerPackage
] = container_api.get_package_versions(args.package)
all_pkgs_tags_to_version: Dict[str, ContainerPackage] = {}
for pkg in all_package_versions:
for tag in pkg.tags:
all_pkgs_tags_to_version[tag] = pkg
logger.info(
f"Located {len(all_package_versions)} versions of package {args.package}",
)
# Filter to packages which are tagged with feature-*
packages_tagged_feature: List[ContainerPackage] = []
for package in all_package_versions:
if package.tag_matches("feature-"):
packages_tagged_feature.append(package)
feature_pkgs_tags_to_versions: Dict[str, ContainerPackage] = {}
for pkg in packages_tagged_feature:
for tag in pkg.tags:
feature_pkgs_tags_to_versions[tag] = pkg
logger.info(
f'Located {len(feature_pkgs_tags_to_versions)} versions of package {args.package} tagged "feature-"',
)
# All the feature tags minus all the feature branches leaves us feature tags
# with no corresponding branch
tags_to_delete = list(
set(feature_pkgs_tags_to_versions.keys()) - set(feature_branches.keys()),
)
# All the tags minus the set of going to be deleted tags leaves us the
# tags which will be kept around
tags_to_keep = list(
set(all_pkgs_tags_to_version.keys()) - set(tags_to_delete),
)
logger.info(
f"Located {len(tags_to_delete)} versions of package {args.package} to delete",
)
# Delete certain package versions for which no branch existed
for tag_to_delete in tags_to_delete:
package_version_info = feature_pkgs_tags_to_versions[tag_to_delete]
if args.delete:
logger.info(
f"Deleting {tag_to_delete} (id {package_version_info.id})",
with GithubContainerRegistryApi(gh_token, repo_owner) as container_api:
if args.package in {"paperless-ngx", "paperless-ngx/builder/cache/app"}:
cleaner = MainImageTagsCleaner(
args.package,
repo_owner,
repo,
container_api,
branch_api,
)
container_api.delete_package_version(
package_version_info,
)
else:
logger.info(
f"Would delete {tag_to_delete} (id {package_version_info.id})",
cleaner = LibraryTagsCleaner(
args.package,
repo_owner,
repo,
container_api,
None,
)
# Deal with untagged package versions
if args.untagged:
cleaner.actually_delete = args.delete
logger.info("Handling untagged image packages")
cleaner.clean()
if not args.is_manifest:
# If the package is not a multi-arch manifest, images without tags are safe to delete.
# They are not referred to by anything. This will leave all with at least 1 tag
for package in all_package_versions:
if package.untagged:
if args.delete:
logger.info(
f"Deleting id {package.id} named {package.name}",
)
container_api.delete_package_version(
package,
)
else:
logger.info(
f"Would delete {package.name} (id {package.id})",
)
else:
logger.info(
f"Not deleting tag {package.tags[0]} of package {args.package}",
)
else:
"""
Ok, bear with me, these are annoying.
Our images are multi-arch, so the manifest is more like a pointer to a sha256 digest.
These images are untagged, but pointed to, and so should not be removed (or every pull fails).
So for each image getting kept, parse the manifest to find the digest(s) it points to. Then
remove those from the list of untagged images. The final result is the untagged, not pointed to
version which should be safe to remove.
Example:
Tag: ghcr.io/paperless-ngx/paperless-ngx:1.7.1 refers to
amd64: sha256:b9ed4f8753bbf5146547671052d7e91f68cdfc9ef049d06690b2bc866fec2690
armv7: sha256:81605222df4ba4605a2ba4893276e5d08c511231ead1d5da061410e1bbec05c3
arm64: sha256:374cd68db40734b844705bfc38faae84cc4182371de4bebd533a9a365d5e8f3b
each of which appears as untagged image, but isn't really.
So from the list of untagged packages, remove those digests. Once all tags which
are being kept are checked, the remaining untagged packages are actually untagged
with no referrals in a manifest to them.
"""
# Simplify the untagged data, mapping name (which is a digest) to the version
untagged_versions = {}
for x in all_package_versions:
if x.untagged:
untagged_versions[x.name] = x
skips = 0
# Extra security to not delete on an unexpected error
actually_delete = True
# Parse manifests to locate digests pointed to
for tag in sorted(tags_to_keep):
full_name = f"ghcr.io/{repo_owner}/{args.package}:{tag}"
logger.info(f"Checking manifest for {full_name}")
try:
proc = subprocess.run(
[
shutil.which("docker"),
"manifest",
"inspect",
full_name,
],
capture_output=True,
)
manifest_list = json.loads(proc.stdout)
for manifest_data in manifest_list["manifests"]:
manifest = DockerManifest2(manifest_data)
if manifest.digest in untagged_versions:
logger.debug(
f"Skipping deletion of {manifest.digest}, referred to by {full_name} for {manifest.platform}",
)
del untagged_versions[manifest.digest]
skips += 1
except Exception as err:
actually_delete = False
logger.exception(err)
logger.info(
f"Skipping deletion of {skips} packages referred to by a manifest",
)
# Step 3.3 - Delete the untagged and not pointed at packages
logger.info(f"Deleting untagged packages of {args.package}")
for to_delete_name in untagged_versions:
to_delete_version = untagged_versions[to_delete_name]
if args.delete and actually_delete:
logger.info(
f"Deleting id {to_delete_version.id} named {to_delete_version.name}",
)
container_api.delete_package_version(
to_delete_version,
)
else:
logger.info(
f"Would delete {to_delete_name} (id {to_delete_version.id})",
)
else:
logger.info("Leaving untagged images untouched")
cleaner.clean_untagged(args.is_manifest)
if __name__ == "__main__":

View File

@ -29,6 +29,11 @@ def get_cache_image_tag(
def get_log_level(args) -> int:
"""
Returns a logging level, based
:param args:
:return:
"""
levels = {
"critical": logging.CRITICAL,
"error": logging.ERROR,

View File

@ -113,14 +113,14 @@ class GithubBranchApi(_GithubApiBase):
def __init__(self, token: str) -> None:
super().__init__(token)
self._ENDPOINT = "https://api.github.com/repos/{REPO}/branches"
self._ENDPOINT = "https://api.github.com/repos/{OWNER}/{REPO}/branches"
def get_branches(self, repo: str) -> List[GithubBranch]:
def get_branches(self, owner: str, repo: str) -> List[GithubBranch]:
"""
Returns all current branches of the given repository owned by the given
owner or organization.
"""
endpoint = self._ENDPOINT.format(REPO=repo)
endpoint = self._ENDPOINT.format(OWNER=owner, REPO=repo)
internal_data = self._read_all_pages(endpoint)
return [GithubBranch(branch) for branch in internal_data]
@ -189,8 +189,11 @@ class GithubContainerRegistryApi(_GithubApiBase):
self._PACKAGES_VERSIONS_ENDPOINT = "https://api.github.com/user/packages/{PACKAGE_TYPE}/{PACKAGE_NAME}/versions"
# https://docs.github.com/en/rest/packages#delete-a-package-version-for-the-authenticated-user
self._PACKAGE_VERSION_DELETE_ENDPOINT = "https://api.github.com/user/packages/{PACKAGE_TYPE}/{PACKAGE_NAME}/versions/{PACKAGE_VERSION_ID}"
self._PACKAGE_VERSION_RESTORE_ENDPOINT = (
f"{self._PACKAGE_VERSION_DELETE_ENDPOINT}/restore"
)
def get_package_versions(
def get_active_package_versions(
self,
package_name: str,
) -> List[ContainerPackage]:
@ -216,6 +219,30 @@ class GithubContainerRegistryApi(_GithubApiBase):
return pkgs
def get_deleted_package_versions(
self,
package_name: str,
) -> List[ContainerPackage]:
package_type: str = "container"
# Need to quote this for slashes in the name
package_name = urllib.parse.quote(package_name, safe="")
endpoint = (
self._PACKAGES_VERSIONS_ENDPOINT.format(
ORG=self._owner_or_org,
PACKAGE_TYPE=package_type,
PACKAGE_NAME=package_name,
)
+ "?state=deleted"
)
pkgs = []
for data in self._read_all_pages(endpoint):
pkgs.append(ContainerPackage(data))
return pkgs
def delete_package_version(self, package_data: ContainerPackage):
"""
Deletes the given package version from the GHCR
@ -225,3 +252,22 @@ class GithubContainerRegistryApi(_GithubApiBase):
logger.warning(
f"Request to delete {package_data.url} returned HTTP {resp.status_code}",
)
def restore_package_version(
self,
package_name: str,
package_data: ContainerPackage,
):
package_type: str = "container"
endpoint = self._PACKAGE_VERSION_RESTORE_ENDPOINT.format(
ORG=self._owner_or_org,
PACKAGE_TYPE=package_type,
PACKAGE_NAME=package_name,
PACKAGE_VERSION_ID=package_data.id,
)
resp = self._session.post(endpoint)
if resp.status_code != 204:
logger.warning(
f"Request to delete {endpoint} returned HTTP {resp.status_code}",
)

View File

@ -15,7 +15,7 @@ on:
push:
paths:
- ".github/workflows/cleanup-tags.yml"
- ".github/scripts/cleanup-tags.py"
- ".github/scripts/cleanup-images.py"
- ".github/scripts/github.py"
- ".github/scripts/common.py"
@ -24,9 +24,26 @@ concurrency:
cancel-in-progress: false
jobs:
cleanup:
name: Cleanup Image Tags
runs-on: ubuntu-20.04
cleanup-images:
name: Cleanup Image Tags for ${{ matrix.primary-name }}
runs-on: ubuntu-latest
strategy:
matrix:
include:
- primary-name: "paperless-ngx"
cache-name: "paperless-ngx/builder/cache/app"
- primary-name: "paperless-ngx/builder/qpdf"
cache-name: "paperless-ngx/builder/cache/qpdf"
- primary-name: "paperless-ngx/builder/pikepdf"
cache-name: "paperless-ngx/builder/cache/pikepdf"
- primary-name: "paperless-ngx/builder/jbig2enc"
cache-name: "paperless-ngx/builder/cache/jbig2enc"
- primary-name: "paperless-ngx/builder/psycopg2"
cache-name: "paperless-ngx/builder/cache/psycopg2"
env:
# Requires a personal access token with the OAuth scope delete:packages
TOKEN: ${{ secrets.GHA_CONTAINER_DELETE_TOKEN }}
@ -50,63 +67,29 @@ jobs:
name: Install requests
run: |
python -m pip install requests
# Clean up primary packages
-
name: Cleanup for package "paperless-ngx"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx"
-
name: Cleanup for package "qpdf"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/qpdf"
-
name: Cleanup for package "pikepdf"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/pikepdf"
-
name: Cleanup for package "jbig2enc"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/jbig2enc"
-
name: Cleanup for package "psycopg2"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --is-manifest --delete "paperless-ngx/builder/psycopg2"
#
# Clean up registry cache packages
# Clean up primary package
#
-
name: Cleanup for package "builder/cache/app"
name: Cleanup for package "${{ matrix.primary-name }}"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/app"
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-images.py --untagged --is-manifest "${{ matrix.primary-name }}"
#
# Clean up registry cache package
#
-
name: Cleanup for package "builder/cache/qpdf"
name: Cleanup for package "${{ matrix.cache-name }}"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/qpdf"
-
name: Cleanup for package "builder/cache/psycopg2"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/psycopg2"
-
name: Cleanup for package "builder/cache/jbig2enc"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/jbig2enc"
-
name: Cleanup for package "builder/cache/pikepdf"
if: "${{ env.TOKEN != '' }}"
run: |
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-tags.py --loglevel info --untagged --delete "paperless-ngx/builder/cache/pikepdf"
python ${GITHUB_WORKSPACE}/.github/scripts/cleanup-images.py --untagged "${{ matrix.cache-name }}"
#
# Verify tags which are left still pull
#
-
name: Check all tags still pull
run: |
ghcr_name=$(echo "${GITHUB_REPOSITORY}" | awk '{ print tolower($0) }')
echo "Pulling all tags of ghcr.io/${ghcr_name}"
docker pull --quiet --all-tags ghcr.io/${ghcr_name}
ghcr_name=$(echo "ghcr.io/${GITHUB_REPOSITORY_OWNER}/${{ matrix.primary-name }}" | awk '{ print tolower($0) }')
echo "Pulling all tags of ${ghcr_name}"
docker pull --quiet --all-tags ${ghcr_name}
docker image list