-
Notifications
You must be signed in to change notification settings - Fork 34
/
image-prune
executable file
·232 lines (177 loc) · 9.08 KB
/
image-prune
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python3
# This file is part of Cockpit.
#
# Copyright (C) 2013 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
import argparse
import datetime
import logging
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.parse
from collections.abc import Collection, Iterator
from lib import s3
from lib.directories import get_images_data_dir
from task import github
# Logger shared by every function in this script; main() turns on debug
# output through logging.basicConfig() when --debug is given
logger = logging.getLogger('image-prune')
# Days after which images expire if not in use
# (ImageCache.prune() multiplies this by 86400 to get the age limit in seconds)
IMAGE_EXPIRE = 14
def git(*args: str, cwd: str = '.') -> str:
    """Run git with the given arguments inside `cwd` and return its stdout as text

    Uses subprocess.check_output(), so a non-zero exit status of git raises
    subprocess.CalledProcessError.
    """
    command = ['git', *args]
    output = subprocess.check_output(command, text=True, cwd=cwd)
    return output
def get_images_in_commit(commit: str, cwd: str = '.') -> Iterator[str]:
    """Returns all images pointed to by a symlink in images/ from the given commit

    `commit` is any revision git can resolve and `cwd` is the repository to
    query.  For each symlink entry of the `images/` tree the content of its
    blob (the link target) is yielded.  A failing git command raises
    subprocess.CalledProcessError.
    """
    logger.debug('Keeping all images in %s...', commit)
    for line in git('cat-file', '-p', f'{commit}:images/', cwd=cwd).splitlines():
        # An entry reads "<mode> <type> <object>\t<name>".  Limit the split to
        # the first three separators so that a name containing whitespace
        # stays in one piece instead of breaking the four-way unpacking.
        mode, kind, blob, _name = line.split(maxsplit=3)
        # mode 120000 is how git records a symlink
        if mode == '120000' and kind == 'blob':
            image = git('cat-file', 'blob', blob, cwd=cwd)
            logger.debug(' - %s', image)
            yield image
def get_images_in_branches(merge_target: str, *branch_patterns: str, cwd: str = '.') -> Iterator[str]:
    """Returns all images introduced by branches, relative to merge_target

    `branch_patterns` are passed to `git for-each-ref` to select the refs to
    inspect and `cwd` is the repository to query.  For every selected ref the
    `merge_target...ref` diff of `images/` is scanned, and each added line
    that looks like an image file name is yielded.  The same image may be
    yielded more than once.  A failing git command raises
    subprocess.CalledProcessError.
    """
    # KISS: just get a diff and grab anything that looks like an image update
    for ref_line in git('for-each-ref', *branch_patterns, cwd=cwd).splitlines():
        _rev, _kind, ref = ref_line.split()
        logger.debug('Considering images changed in %s...', ref)
        for patch_line in git('diff', f'{merge_target}...{ref}', '--', 'images/', cwd=cwd).splitlines():
            # The dots are escaped: an unescaped '.' would accept any character
            # in front of the extension (and inside "tar.gz")
            if match := re.search(r'^\+([-0-9a-z]+-[0-9a-f]+\.(qcow2|iso|tar\.gz))', patch_line):
                image = match.group(1)
                logger.debug(' - %s', image)
                yield image
def get_remote_images() -> Iterator[str]:
    """Returns all images currently used by PRs and origin branches on GitHub

    The remote HEAD, all remote branches and the heads of all open pull
    requests are fetched into a temporary repository, which is then scanned
    with get_images_in_commit() and get_images_in_branches().
    """
    hub = github.GitHub()
    logger.debug('Querying open PRs on %s', hub.repo)
    pr_numbers = [pull['number'] for pull in hub.pulls()]
    logger.debug('Open PRs: %s', pr_numbers)

    # HEAD is stored as `head`, origin branches as `origin/{name}` and PRs as `pr/{nr}`
    refspecs = {'HEAD:refs/heads/head', 'refs/heads/*:refs/heads/origin/*'}
    for number in pr_numbers:
        refspecs.add(f'refs/pull/{number}/head:refs/heads/pr/{number}')

    # Let git report fetch progress only when debug logging is on
    fetch_options = [] if logger.isEnabledFor(logging.DEBUG) else ['--quiet']
    remote = f'https://github.com/{hub.repo}'

    # Work in a throwaway repository so that the local one is never touched
    with tempfile.TemporaryDirectory() as scratch:
        git('init', '-b', 'main', cwd=scratch)
        git('fetch', *fetch_options, remote, *refspecs, cwd=scratch)

        yield from get_images_in_commit('head', cwd=scratch)
        yield from get_images_in_branches('head', 'refs/heads/origin/*', 'refs/heads/pr/*', cwd=scratch)
def get_keepers(offline: bool = False, checkout_only: bool = False) -> Collection[str]:
    """Collect the images which are still in use and must not be pruned

    The images of the current HEAD are always kept.  Unless `checkout_only`
    is set, images introduced by local branches are added, and unless
    `offline` is set as well, so are the images in use on GitHub.
    """
    in_use = {*get_images_in_commit('HEAD')}

    # Only the current checkout counts: no branches, no remote lookup
    if checkout_only:
        return in_use

    in_use.update(get_images_in_branches('HEAD', 'refs/heads/*'))
    if offline:
        return in_use

    in_use.update(get_remote_images())
    return in_use
class ImageCache:
    """Base class of the image stores this script can prune

    Subclasses implement list_files(), delete_file() and enough_space();
    prune() applies the shared pruning policy on top of those three.
    """

    def list_files(self) -> Iterator[tuple[float, str]]:
        """yields tuples of (mtime, basename)"""
        raise NotImplementedError

    def delete_file(self, filename: str) -> None:
        """delete the given basename from the cache"""
        raise NotImplementedError

    def enough_space(self) -> bool:
        """returns True if disk space is not currently an issue"""
        raise NotImplementedError

    def prune(self, keepers: Collection[str], force: bool = False, dryrun: bool = False) -> None:
        """Delete the images that are neither kept nor recent enough

        Files are visited oldest first.  A file is left alone if it does not
        have an image extension, if it is listed in `keepers`, or - unless
        `force` is set - if there is enough space and the file is younger
        than IMAGE_EXPIRE days.  Every other file is announced on stderr and,
        unless `dryrun` is set, deleted.
        """
        # mtimes older than this (seconds since the epoch) count as expired
        expiry_threshold = time.time() - IMAGE_EXPIRE * 86400

        # Sort by mtime, oldest first.
        for mtime, image in sorted(self.list_files()):
            # every suffix carries its leading dot, so that only real file
            # extensions qualify
            if not image.endswith(('.iso', '.tar.gz', '.qcow2', '.partial')):
                logger.debug('Skipping file %s with unknown extension', image)
                continue

            if image in keepers:
                logger.debug('Skipping image %s which is in the keepers list', image)
                continue

            # enough_space() is asked again for every file, so recent images
            # are spared as soon as earlier deletions freed enough space
            if not force and self.enough_space() and mtime > expiry_threshold:
                logger.debug('Skipping image %s which is new enough (and not forced or low on space)', image)
                continue

            print(f"Pruning {image}", file=sys.stderr)
            if not dryrun:
                self.delete_file(image)
class LocalImageDirectory(ImageCache):
    """Image cache which lives in a directory of the local file system"""

    def __init__(self, directory: str | None = None):
        # Without an explicit directory, use the one from get_images_data_dir()
        self.directory = directory if directory else get_images_data_dir()
        # Required free space in GiB, adjustable via $PRUNE_THRESHOLD_G
        self.space_threshold = float(os.environ.get("PRUNE_THRESHOLD_G", 15))

    def list_files(self) -> Iterator[tuple[float, str]]:
        """yields (mtime, basename) for each regular file in the directory"""
        for candidate in os.scandir(self.directory):
            # symlinks and subdirectories are not cache content
            if not candidate.is_file(follow_symlinks=False):
                continue
            yield candidate.stat().st_mtime, candidate.name

    def delete_file(self, filename: str) -> None:
        """remove the given basename from the directory"""
        victim = os.path.join(self.directory, filename)
        os.unlink(victim)

    def enough_space(self) -> bool:
        """returns True if the file system has at least the threshold of free space"""
        stats = os.statvfs(self.directory)
        # blocks available to unprivileged users, converted to GiB
        free_gib = stats.f_bavail * stats.f_frsize / (1024 * 1024 * 1024)
        return free_gib >= self.space_threshold
class S3ImageStore(ImageCache):
    """Image cache which lives in an S3 bucket, addressed by its parsed URL"""

    def __init__(self, url: urllib.parse.ParseResult):
        self.url = url
        # A bit magic: we have 2 buckets and 250GB quota. Try to keep each bucket below ca. 100GB.
        # The limit is given in (decimal) GB via $S3_IMAGES_MAX_GB.
        self.max_bytes = float(os.environ.get("S3_IMAGES_MAX_GB", 100)) * 1000 * 1000 * 1000
        # Size of each object by key, filled in by list_files()
        self.sizes: dict[str, str] = {}

    def list_files(self) -> Iterator[tuple[float, str]]:
        """yields (mtime, key) for each object in the bucket and records the object sizes"""
        result = s3.list_bucket(self.url)
        # make sure this gets done before we start iterating
        self.sizes = dict(s3.parse_list(result, "Key", "Size"))  # type: ignore[arg-type] # https://github.com/python/typeshed/issues/11532
        for name, stamp in s3.parse_list(result, "Key", "LastModified"):
            # rfc3339ish...
            yield datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.%f%z').timestamp(), name

    def delete_file(self, filename: str) -> None:
        """delete the given key from the bucket; an already missing object is not an error"""
        target = self.url._replace(path=os.path.join(self.url.path, filename))
        try:
            with s3.urlopen(target, method='DELETE'):
                pass
        except urllib.error.HTTPError as e:
            # 404 → already gone? *shrug* eventual consistency or parallel image-prune run
            if e.code != 404:
                raise
        # pop() rather than del: the key is unknown if the file was not part of
        # the last listing, and that must not fail an otherwise successful delete
        self.sizes.pop(filename, None)

    def enough_space(self) -> bool:
        """returns True if the recorded objects add up to less than the size limit"""
        total = sum(int(size) for size in self.sizes.values())
        return total < self.max_bytes
def main() -> None:
    """Parse the command line and prune the selected image cache

    Exits with an error message if space is still insufficient after
    pruning, unless --force was given.
    """
    parser = argparse.ArgumentParser(description='Prune downloaded images')
    parser.add_argument("--debug", '-d', action="store_true", help="Enable debugging output")
    parser.add_argument("--force", '-f', action="store_true", help="Delete images even if they aren't old")
    parser.add_argument("-n", "--dry-run", dest="dryrun", action="store_true",
                        help="Don't actually delete images and links")
    parser.add_argument("-c", "--checkout-only", dest="checkout_only", action="store_true",
                        help="Consider only the current HEAD commit")
    parser.add_argument("-o", "--offline", dest="offline", action="store_true",
                        help="Don't access external sources such as GitHub")

    # --directory and --s3 exclude each other
    location_group = parser.add_argument_group(
        title='Cache location',
        description='default: --directory ~/.cache/cockpit-images')
    location = location_group.add_mutually_exclusive_group()
    location.add_argument('--directory', metavar='DIR', help="images cache directory to prune")
    location.add_argument('--s3', metavar='URL', help="url of S3 store to prune", type=urllib.parse.urlparse)

    options = parser.parse_args()

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)

    cache: ImageCache = S3ImageStore(options.s3) if options.s3 else LocalImageDirectory(options.directory)

    in_use = get_keepers(offline=options.offline, checkout_only=options.checkout_only)
    cache.prune(in_use, force=options.force, dryrun=options.dryrun)

    # A forced run never fails on space; otherwise the cache must now be fine
    if options.force or cache.enough_space():
        return
    sys.exit('Insufficient free space after attempting to prune images')
# Run the pruner only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()