Coverage for src/phoenixpackagecleanup/clean_channel.py: 42%
84 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-12-04 17:19 +0000
1import argparse
2import json
3import logging
4import re
5import tempfile
6import urllib.request
7from collections import defaultdict
8from datetime import datetime, timedelta, timezone
9from pathlib import Path
10from urllib.error import HTTPError
12from phoenixpackagecleanup.logging_utils import init_logging
13from phoenixpackagecleanup.package import PackageInfoCollection, parse_repodata
def download_file(url: str, filepath: Path, user_agent: str, timeout_s: int):
    """Download a file (similar to wget) using urllib

    The main reason we need this method is to download the repodata.json file. wget works out of the box, but
    when using urllib, we have to specify a User-agent otherwise the request is rejected (403: Forbidden)

    Parameters
    ----------
    url : str
        URL of the file to download
    filepath : Path
        Path to where the downloaded data will be written
    user_agent : str
        User-agent value to use in the request header
    timeout_s : int
        Timeout in second for the request
    """
    file_request = urllib.request.Request(url=url, headers={"User-agent": user_agent})
    # if we get an error from server, it will be raised by urlopen, no need to catch.
    # Use `with` on the response so the underlying HTTP connection is closed even if
    # the read or the file write fails (the previous version leaked the response).
    with urllib.request.urlopen(file_request, timeout=timeout_s) as response:
        with open(filepath, "wb") as of:
            of.write(response.read())
def load_repodata(repodata_filepath: Path) -> PackageInfoCollection:
    """Load the repodata.json file

    Parameters
    ----------
    repodata_filepath : Path
        Path to the repodata.json file

    Returns
    -------
    PackageInfoCollection
        Packages in the channel, grouped by package name
    """
    with open(repodata_filepath, "r") as file:
        repodata = json.load(file)
    # parse_repodata turns the raw repodata dict into the project's package collection
    return parse_repodata(repodata)
def remove_version_matching_regex(
    package_collection: PackageInfoCollection, remove_version_regex: str
) -> PackageInfoCollection:
    """Remove packages matching a regex from the package collection

    Parameters
    ----------
    package_collection : PackageInfoCollection
        Collection of packages to filter
    remove_version_regex : str
        Regex matched (with re.match, i.e. anchored at the start) against each package's
        version string; packages whose version matches are removed

    Returns
    -------
    PackageInfoCollection
        Collection of packages with the packages matching the regex removed
    """
    # Compile once instead of re-resolving the pattern for every package version
    pattern = re.compile(remove_version_regex)

    filtered_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in package_collection.packages_info.items():
        # Filter packages : packages not matching the remove_version_regex are not added to the output collection
        filtered_packages = [pkg for pkg in packages_info if not pattern.match(pkg.version)]
        # Only keep package names that still have at least one version left
        if filtered_packages:
            filtered_collection.packages_info[package_name] = filtered_packages

    return filtered_collection
def select_package_to_delete(
    channel_packages: PackageInfoCollection,
    current_time: datetime,
    delete_older_than_days: int,
    min_number_of_packages: int,
    keep_version_regex: str,
) -> PackageInfoCollection:
    """Selects the packages to clean-up from a package collection based on rules inspired by gitlab's clean-up rules

    The selection rules work as the following:
    - for each package name, get the list of package version/archives
    - exclude the `min_number_of_packages` most recent packages from the list
    - exclude all packages that have been uploaded more recently than `delete_older_than_days` days old.
    - what remains in the list is selected for deletion

    Parameters
    ----------
    channel_packages : PackageInfoCollection
        Collection of packages to clean-up.
    current_time : datetime
        The time at which the script is running, used to remove packages based on upload timestamp
    delete_older_than_days : int
        Packages which upload timestamp is less than `delete_older_than_days` days old are not considered for deletion
    min_number_of_packages : int
        Only if at least `min_number_of_packages` are available for a package will the packages be considered for
        deletion
    keep_version_regex : str
        regex applied to packages versions: if the package version match this, it will NOT be considered for
        deletion.

    Returns
    -------
    PackageInfoCollection
        Collection of packages that should be deleted to clean-up.
    """
    # Versions matching keep_version_regex are protected: drop them up-front
    eligible_packages = remove_version_matching_regex(channel_packages, keep_version_regex)

    selection = PackageInfoCollection(defaultdict(list))
    # Age threshold is loop-invariant, compute it once
    minimum_age = timedelta(days=delete_older_than_days)

    for package_name, package_infos in eligible_packages.packages_info.items():
        # Newest uploads first, so the head of the list is what we always keep
        newest_first = sorted(package_infos, key=lambda info: info.upload_time, reverse=True)
        # Skip the min_number_of_packages most recent versions; the rest are deletion candidates
        for candidate in newest_first[min_number_of_packages:]:
            if current_time - candidate.upload_time > minimum_age:
                selection.packages_info[package_name].append(candidate)

    return selection
def clean_up_channel(packages_to_delete: PackageInfoCollection, delete_api_url: str, api_token: str, timeout_s: int):
    """Clean up the package channel by sending DELETE requests to the delete api of the channel

    Parameters
    ----------
    packages_to_delete : PackageInfoCollection
        Collections of package to delete in the channel. The packages filenames will be appended to the
        `delete_api_url` to create the DELETE request.
    delete_api_url : str
        API url to delete the packages. Eg: "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64/"
    api_token : str
        Token to identify with the delete API.
    timeout_s : int
        timeout in seconds for the delete request.

    Raises
    ------
    HTTPError
        Re-raised (after logging a warning) when the server rejects a DELETE request.
    """
    for package_info_list in packages_to_delete.packages_info.values():
        for package_info in package_info_list:
            # Strip any trailing "/" before joining: a double slash in the URL makes
            # the delete request fail (the docstring example ends with "/").
            delete_url = delete_api_url.rstrip("/") + "/{}".format(package_info.filename)

            delete_request = urllib.request.Request(
                url=delete_url, headers={"Authorization": "Bearer {}".format(api_token)}, method="DELETE"
            )
            try:
                _ = urllib.request.urlopen(delete_request, timeout=timeout_s)
            except HTTPError:
                logging.warning("Failed delete request with delete_url: {}".format(delete_url))
                raise
def get_argument_parser():
    """Build the command-line argument parser for the channel clean-up script.

    Returns
    -------
    argparse.ArgumentParser
        Parser with all required clean-up options (token, channel URLs, retention rules, logging).
    """
    parser = argparse.ArgumentParser(
        description="Program which deletes unused packages", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-t", "--token", help="Token to be used to log in prefix.dev", required=True, type=str)
    parser.add_argument(
        "-c",
        "--channel_url",
        help="Base URL of the channel, for instance https://prefix.dev/phoenix-dev/. It is combined with the "
        "channel's subdirectories to get a working url.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-a",
        "--api_delete_url",
        help="URL to submit DELETE requests for packages to clean-up, for instance "
        "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64. It is combined with the channel's subdirectories"
        " to get a working url for API calls.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-n",
        "--min_number_of_packages",
        help="Minimum number of packages version to keep for a package",
        required=True,
        type=int,
    )
    # Help text fixed: it previously was a copy-paste of the --token description.
    parser.add_argument(
        "-d",
        "--min_age_days",
        help="Number of days to keep packages: packages uploaded more recently than this are never deleted",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-b",
        "--keep_version_regex",
        help="Packages which are not matching this regex will be considered for deletion. ",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-l", "--logfile", help="Path to where the scripts log should be written", required=True, type=Path
    )
    parser.add_argument(
        "-v",
        "--loglevel",
        help="Log level as defined by the python logging library",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        required=True,
    )
    return parser
def main():
    """Entry point: download each subdir's repodata.json, select stale packages and delete them via the API."""
    args = get_argument_parser().parse_args()

    init_logging("PhoenixPackageCleanUp", args.logfile, args.loglevel)

    # Platform sub-directories a conda-style channel may expose
    subdirs = ["noarch", "linux-64", "linux-aarch64", "linux-ppc64le", "osx-64", "osx-arm64", "win-64"]
    cleaned_any_subdir = False

    for subdir in subdirs:
        # https://prefix.dev/phoenix-dev/linux-64/repodata.json
        # https://prefix.dev/api/v1/delete/phoenix-dev/linux-64
        with tempfile.TemporaryDirectory() as tmpdir:
            # Try to read the subdir repodata.json. A subdir without packages does not
            # actually exist on the server and the download raises an HTTP 404.
            repodata_filepath = Path(tmpdir) / (subdir + "_repodata.json")
            try:
                download_file(
                    args.channel_url + "/" + subdir + "/repodata.json",
                    repodata_filepath,
                    "phoenix-clean-up-schedule",
                    60,
                )
            except HTTPError:
                logging.warning("Could not clean {}/{} subdir, no packages?".format(args.channel_url, subdir))
                continue

            logging.info("Found repodata.json for {}/{} subdir, cleaning".format(args.channel_url, subdir))
            cleaned_any_subdir = True

            # repodata.json was readable: proceed with the clean-up for this subdir
            channel_packages = load_repodata(repodata_filepath)
            logging.info("Packages on the channel: {}".format(channel_packages))

            packages_to_delete = select_package_to_delete(
                channel_packages,
                datetime.now(timezone.utc),
                args.min_age_days,
                args.min_number_of_packages,
                args.keep_version_regex,
            )
            logging.info("Packages to delete: {}".format(packages_to_delete))

            # Ensure exactly one "/" between the API url and the subdir: the delete
            # request fails when the url contains a double slash
            api_base = args.api_delete_url if args.api_delete_url.endswith("/") else args.api_delete_url + "/"
            clean_up_channel(packages_to_delete, api_base + subdir, args.token, 60)

    if not cleaned_any_subdir:
        raise Exception(
            "Could not clean any sub-directory in channel {}. Is the URL correct?".format(args.channel_url)
        )
# Script entry point: guarded so importing this module has no side effects.
if __name__ == "__main__":
    main()