Instead of pulling all images, inspect the manifest index and each digest index

2025-07-24 18:04:39 -05:00 · 2023-02-16 09:42:22 -08:00 · 2023-02-16 09:42:22 -08:00 · ee272da95f
commit ee272da95f
parent 8098ac6b15
1 changed file with 96 additions and 76 deletions
--- a/.github/scripts/cleanup-tags.py
+++ b/.github/scripts/cleanup-tags.py
@@ -7,6 +7,7 @@ import subprocess
 from argparse import ArgumentParser
 from typing import Dict
 from typing import Final
+from typing import Iterator
 from typing import List
 from typing import Optional

@@ -15,16 +16,17 @@ from github import ContainerPackage
 from github import GithubBranchApi
 from github import GithubContainerRegistryApi

-import docker
-
 logger = logging.getLogger("cleanup-tags")


-class DockerManifest2:
+class ImageProperties:
    """
-    Data class wrapping the Docker Image Manifest Version 2.
+    Data class wrapping the properties of an entry in the image index
+    manifests list.  It is NOT an actual image with layers, etc.

-    See https://docs.docker.com/registry/spec/manifest-v2-2/
+    https://docs.docker.com/registry/spec/manifest-v2-2/
+    https://github.com/opencontainers/image-spec/blob/main/manifest.md
+    https://github.com/opencontainers/image-spec/blob/main/descriptor.md
    """

    def __init__(self, data: Dict) -> None:
@@ -41,6 +43,45 @@ class DockerManifest2:
        self.platform = f"{platform_data_os}/{platform_arch}{platform_variant}"


+class ImageIndex:
+    """
+    Data class wrapping the logic for handling OCI Image Index
+    JSON data.  Its primary use is to access the manifests listing.
+
+    See https://github.com/opencontainers/image-spec/blob/main/image-index.md
+    """
+
+    def __init__(self, package_url: str, tag: str) -> None:
+        self.qualified_name = f"{package_url}:{tag}"
+        logger.info(f"Getting image index for {self.qualified_name}")
+        try:
+            proc = subprocess.run(
+                [
+                    shutil.which("docker"),
+                    "buildx",
+                    "imagetools",
+                    "inspect",
+                    "--raw",
+                    self.qualified_name,
+                ],
+                capture_output=True,
+                check=True,
+            )
+
+            self._data = json.loads(proc.stdout)
+
+        except subprocess.CalledProcessError as e:
+            logger.error(
+                f"Failed to get image index for {self.qualified_name}: {e.stderr}",
+            )
+            raise e
+
+    @property
+    def image_pointers(self) -> Iterator[ImageProperties]:
+        for manifest_data in self._data["manifests"]:
+            yield ImageProperties(manifest_data)
+
+
 class RegistryTagsCleaner:
    """
    This is the base class for the image registry cleaning.  Given a package
@@ -87,7 +128,10 @@ class RegistryTagsCleaner:

    def clean(self):
        """
-        This method will delete image versions, based on the selected tags to delete
+        This method will delete image versions, based on the selected tags to delete.
+        It behaves more like an unlinking than actual deletion.  Removing the tag
+        simply removes a pointer to an image, but the actual image data remains accessible
+        if one has the sha256 digest of it.
        """
        for tag_to_delete in self.tags_to_delete:
            package_version_info = self.all_pkgs_tags_to_version[tag_to_delete]
@@ -151,31 +195,17 @@ class RegistryTagsCleaner:

            # Parse manifests to locate digests pointed to
            for tag in sorted(self.tags_to_keep):
-                full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
-                logger.info(f"Checking manifest for {full_name}")
-                # TODO: It would be nice to use RegistryData from docker
-                # except the ID doesn't map to anything in the manifest
                try:
-                    proc = subprocess.run(
-                        [
-                            shutil.which("docker"),
-                            "buildx",
-                            "imagetools",
-                            "inspect",
-                            "--raw",
-                            full_name,
-                        ],
-                        capture_output=True,
+                    image_index = ImageIndex(
+                        f"ghcr.io/{self.repo_owner}/{self.package_name}",
+                        tag,
                    )
-
-                    manifest_list = json.loads(proc.stdout)
-                    for manifest_data in manifest_list["manifests"]:
-                        manifest = DockerManifest2(manifest_data)
+                    for manifest in image_index.image_pointers:

                        if manifest.digest in untagged_versions:
                            logger.info(
                                f"Skipping deletion of {manifest.digest},"
-                                f" referred to by {full_name}"
+                                f" referred to by {image_index.qualified_name}"
                                f" for {manifest.platform}",
                            )
                            del untagged_versions[manifest.digest]
@@ -247,64 +277,54 @@ class RegistryTagsCleaner:
        # By default, keep anything which is tagged
        self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))

-    def check_tags_pull(self):
+    def check_remaining_tags_valid(self):
        """
-        This method uses the Docker Python SDK to confirm all tags which were
-        kept still pull, for all platforms.
+        Checks the non-deleted tags are still valid.  The assumption is if the
+        manifest can be inspected and each image manifest it points to can be
+        inspected, the image will still pull.

-        TODO: This is much slower (although more comprehensive).  Maybe a Pool?
+        https://github.com/opencontainers/image-spec/blob/main/image-index.md
        """
        logger.info("Beginning confirmation step")
-        client = docker.from_env()
-        imgs = []
+        a_tag_failed = False
        for tag in sorted(self.tags_to_keep):
-            repository = f"ghcr.io/{self.repo_owner}/{self.package_name}"
-            for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]:
-                # From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64
-                # skip others in this case
-                if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0":
-                    continue
-                # Skip beta and release candidate tags
-                elif "beta" in tag:
-                    continue

-                # Build the platform name
-                if variant is not None:
-                    platform = f"linux/{arch}/{variant}"
-                else:
-                    platform = f"linux/{arch}"
+            try:
+                image_index = ImageIndex(
+                    f"ghcr.io/{self.repo_owner}/{self.package_name}",
+                    tag,
+                )
+                for manifest in image_index.image_pointers:
+                    logger.info(f"Checking {manifest.digest} for {manifest.platform}")

-                try:
-                    logger.info(f"Pulling {repository}:{tag} for {platform}")
-                    image = client.images.pull(
-                        repository=repository,
-                        tag=tag,
-                        platform=platform,
-                    )
-                    imgs.append(image)
-                except docker.errors.APIError as e:
-                    logger.error(
-                        f"Failed to pull {repository}:{tag}: {e}",
-                    )
+                    # This follows the pointer from the index to an actual image, layers and all
+                    # Note the format is name@digest (an "@" separator), not name:tag
+                    digest_name = f"ghcr.io/{self.repo_owner}/{self.package_name}@{manifest.digest}"

-            # Prevent out of space errors by removing after a few
-            # pulls
-            if len(imgs) > 50:
-                for image in imgs:
                    try:
-                        client.images.remove(image.id)
-                    except docker.errors.APIError as e:
-                        err_str = str(e)
-                        # Ignore attempts to remove images that are partly shared
-                        # Ignore images which are somehow gone already
-                        if (
-                            "must be forced" not in err_str
-                            and "No such image" not in err_str
-                        ):
-                            logger.error(
-                                f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}",
-                            )
-                imgs = []
+
+                        subprocess.run(
+                            [
+                                shutil.which("docker"),
+                                "buildx",
+                                "imagetools",
+                                "inspect",
+                                "--raw",
+                                digest_name,
+                            ],
+                            capture_output=True,
+                            check=True,
+                        )
+                    except subprocess.CalledProcessError as e:
+                        logger.error(f"Failed to inspect digest: {e.stderr}")
+                        a_tag_failed = True
+            except subprocess.CalledProcessError as e:
+                a_tag_failed = True
+                logger.error(f"Failed to inspect: {e.stderr}")
+                continue
+
+        if a_tag_failed:
+            raise Exception("At least one image tag failed to inspect")


 class MainImageTagsCleaner(RegistryTagsCleaner):
@@ -366,7 +386,7 @@ class MainImageTagsCleaner(RegistryTagsCleaner):

 class LibraryTagsCleaner(RegistryTagsCleaner):
    """
-    Exists for the off change that someday, the installer library images
+    Exists for the off chance that someday, the installer library images
    will need their own logic
    """

@@ -464,7 +484,7 @@ def _main():

            # Verify remaining tags still pull
            if args.is_manifest:
-                cleaner.check_tags_pull()
+                cleaner.check_remaining_tags_valid()


 if __name__ == "__main__":