Instead of pulling all images, inspect the manifest index and each digest index

This commit is contained in:
Trenton H 2023-02-16 09:42:22 -08:00
parent 4f9885d6b6
commit 74b7c49e1b

View File

@ -7,6 +7,7 @@ import subprocess
from argparse import ArgumentParser from argparse import ArgumentParser
from typing import Dict from typing import Dict
from typing import Final from typing import Final
from typing import Iterator
from typing import List from typing import List
from typing import Optional from typing import Optional
@ -15,16 +16,17 @@ from github import ContainerPackage
from github import GithubBranchApi from github import GithubBranchApi
from github import GithubContainerRegistryApi from github import GithubContainerRegistryApi
import docker
logger = logging.getLogger("cleanup-tags") logger = logging.getLogger("cleanup-tags")
class DockerManifest2: class ImageProperties:
""" """
Data class wrapping the Docker Image Manifest Version 2. Data class wrapping the properties of an entry in the image index
manifests list. It is NOT an actual image with layers, etc
See https://docs.docker.com/registry/spec/manifest-v2-2/ https://docs.docker.com/registry/spec/manifest-v2-2/
https://github.com/opencontainers/image-spec/blob/main/manifest.md
https://github.com/opencontainers/image-spec/blob/main/descriptor.md
""" """
def __init__(self, data: Dict) -> None: def __init__(self, data: Dict) -> None:
@ -41,6 +43,45 @@ class DockerManifest2:
self.platform = f"{platform_data_os}/{platform_arch}{platform_variant}" self.platform = f"{platform_data_os}/{platform_arch}{platform_variant}"
class ImageIndex:
"""
Data class wrapping up logic for an OCI Image Index
JSON data. Primary use is to access the manifests listing
See https://github.com/opencontainers/image-spec/blob/main/image-index.md
"""
def __init__(self, package_url: str, tag: str) -> None:
self.qualified_name = f"{package_url}:{tag}"
logger.info(f"Getting image index for {self.qualified_name}")
try:
proc = subprocess.run(
[
shutil.which("docker"),
"buildx",
"imagetools",
"inspect",
"--raw",
self.qualified_name,
],
capture_output=True,
check=True,
)
self._data = json.loads(proc.stdout)
except subprocess.CalledProcessError as e:
logger.error(
f"Failed to get image index for {self.qualified_name}: {e.stderr}",
)
raise e
@property
def image_pointers(self) -> Iterator[ImageProperties]:
for manifest_data in self._data["manifests"]:
yield ImageProperties(manifest_data)
class RegistryTagsCleaner: class RegistryTagsCleaner:
""" """
This is the base class for the image registry cleaning. Given a package This is the base class for the image registry cleaning. Given a package
@ -87,7 +128,10 @@ class RegistryTagsCleaner:
def clean(self): def clean(self):
""" """
This method will delete image versions, based on the selected tags to delete This method will delete image versions, based on the selected tags to delete.
It behaves more like an unlinking than actual deletion. Removing the tag
simply removes a pointer to an image, but the actual image data remains accessible
if one has the sha256 digest of it.
""" """
for tag_to_delete in self.tags_to_delete: for tag_to_delete in self.tags_to_delete:
package_version_info = self.all_pkgs_tags_to_version[tag_to_delete] package_version_info = self.all_pkgs_tags_to_version[tag_to_delete]
@ -151,31 +195,17 @@ class RegistryTagsCleaner:
# Parse manifests to locate digests pointed to # Parse manifests to locate digests pointed to
for tag in sorted(self.tags_to_keep): for tag in sorted(self.tags_to_keep):
full_name = f"ghcr.io/{self.repo_owner}/{self.package_name}:{tag}"
logger.info(f"Checking manifest for {full_name}")
# TODO: It would be nice to use RegistryData from docker
# except the ID doesn't map to anything in the manifest
try: try:
proc = subprocess.run( image_index = ImageIndex(
[ f"ghcr.io/{self.repo_owner}/{self.package_name}",
shutil.which("docker"), tag,
"buildx",
"imagetools",
"inspect",
"--raw",
full_name,
],
capture_output=True,
) )
for manifest in image_index.image_pointers:
manifest_list = json.loads(proc.stdout)
for manifest_data in manifest_list["manifests"]:
manifest = DockerManifest2(manifest_data)
if manifest.digest in untagged_versions: if manifest.digest in untagged_versions:
logger.info( logger.info(
f"Skipping deletion of {manifest.digest}," f"Skipping deletion of {manifest.digest},"
f" referred to by {full_name}" f" referred to by {image_index.qualified_name}"
f" for {manifest.platform}", f" for {manifest.platform}",
) )
del untagged_versions[manifest.digest] del untagged_versions[manifest.digest]
@ -247,64 +277,54 @@ class RegistryTagsCleaner:
# By default, keep anything which is tagged # By default, keep anything which is tagged
self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys())) self.tags_to_keep = list(set(self.all_pkgs_tags_to_version.keys()))
def check_tags_pull(self): def check_remaining_tags_valid(self):
""" """
This method uses the Docker Python SDK to confirm all tags which were Checks the non-deleted tags are still valid. The assumption is if the
kept still pull, for all platforms. manifest is can be inspected and each image manifest if points to can be
inspected, the image will still pull.
TODO: This is much slower (although more comprehensive). Maybe a Pool? https://github.com/opencontainers/image-spec/blob/main/image-index.md
""" """
logger.info("Beginning confirmation step") logger.info("Beginning confirmation step")
client = docker.from_env() a_tag_failed = False
imgs = []
for tag in sorted(self.tags_to_keep): for tag in sorted(self.tags_to_keep):
repository = f"ghcr.io/{self.repo_owner}/{self.package_name}"
for arch, variant in [("amd64", None), ("arm64", None), ("arm", "v7")]:
# From 11.2.0 onwards, qpdf is cross compiled, so there is a single arch, amd64
# skip others in this case
if "qpdf" in self.package_name and arch != "amd64" and tag == "11.2.0":
continue
# Skip beta and release candidate tags
elif "beta" in tag:
continue
# Build the platform name
if variant is not None:
platform = f"linux/{arch}/{variant}"
else:
platform = f"linux/{arch}"
try: try:
logger.info(f"Pulling {repository}:{tag} for {platform}") image_index = ImageIndex(
image = client.images.pull( f"ghcr.io/{self.repo_owner}/{self.package_name}",
repository=repository, tag,
tag=tag,
platform=platform,
)
imgs.append(image)
except docker.errors.APIError as e:
logger.error(
f"Failed to pull {repository}:{tag}: {e}",
) )
for manifest in image_index.image_pointers:
logger.info(f"Checking {manifest.digest} for {manifest.platform}")
# This follows the pointer from the index to an actual image, layers and all
# Note the format is @
digest_name = f"ghcr.io/{self.repo_owner}/{self.package_name}@{manifest.digest}"
# Prevent out of space errors by removing after a few
# pulls
if len(imgs) > 50:
for image in imgs:
try: try:
client.images.remove(image.id)
except docker.errors.APIError as e: subprocess.run(
err_str = str(e) [
# Ignore attempts to remove images that are partly shared shutil.which("docker"),
# Ignore images which are somehow gone already "buildx",
if ( "imagetools",
"must be forced" not in err_str "inspect",
and "No such image" not in err_str "--raw",
): digest_name,
logger.error( ],
f"Remove image ghcr.io/{self.repo_owner}/{self.package_name}:{tag} failed: {e}", capture_output=True,
check=True,
) )
imgs = [] except subprocess.CalledProcessError as e:
logger.error(f"Failed to inspect digest: {e.stderr}")
a_tag_failed = True
except subprocess.CalledProcessError as e:
a_tag_failed = True
logger.error(f"Failed to inspect: {e.stderr}")
continue
if a_tag_failed:
raise Exception("At least one image tag failed to inspect")
class MainImageTagsCleaner(RegistryTagsCleaner): class MainImageTagsCleaner(RegistryTagsCleaner):
@ -366,7 +386,7 @@ class MainImageTagsCleaner(RegistryTagsCleaner):
class LibraryTagsCleaner(RegistryTagsCleaner): class LibraryTagsCleaner(RegistryTagsCleaner):
""" """
Exists for the off change that someday, the installer library images Exists for the off chance that someday, the installer library images
will need their own logic will need their own logic
""" """
@ -464,7 +484,7 @@ def _main():
# Verify remaining tags still pull # Verify remaining tags still pull
if args.is_manifest: if args.is_manifest:
cleaner.check_tags_pull() cleaner.check_remaining_tags_valid()
if __name__ == "__main__": if __name__ == "__main__":