Coverage for src/phoenixpackagecleanup/clean_channel.py: 44%
80 statements
coverage.py v7.10.3, created at 2025-09-05 18:23 +0000
import argparse
import json
import logging
import re
import tempfile
import urllib.request
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.error import HTTPError

from phoenixpackagecleanup.logging_utils import init_logging
from phoenixpackagecleanup.package import PackageInfoCollection, parse_repodata


def download_file(url: str, filepath: Path, user_agent: str, timeout_s: int):
    """Download a file (similar to wget) using urllib

    The main reason we need this function is to download the repodata.json file. wget works out of the box, but
    when using urllib we have to specify a User-agent, otherwise the request is rejected (403: Forbidden).

    Parameters
    ----------
    url : str
        URL of the file to download
    filepath : Path
        Path to where the downloaded data will be written
    user_agent : str
        User-agent value to use in the request header
    timeout_s : int
        Timeout in seconds for the request
    """
    file_request = urllib.request.Request(url=url, headers={"User-agent": user_agent})
    # if we get an error from the server, it is raised by urlopen, no need to catch it here
    response = urllib.request.urlopen(file_request, timeout=timeout_s)
    with open(filepath, "wb") as of:
        of.write(response.read())
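
# A minimal usage sketch of download_file (the URL, target path, and User-agent value below are illustrative
# assumptions, not taken from this module's configuration):
#
#   download_file(
#       url="https://prefix.dev/phoenix-dev/linux-64/repodata.json",
#       filepath=Path("/tmp/repodata.json"),
#       user_agent="phoenix-clean-up-schedule",
#       timeout_s=60,
#   )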


def load_repodata(repodata_filepath: Path) -> PackageInfoCollection:
    """Load the repodata.json file

    Parameters
    ----------
    repodata_filepath : Path
        Path to the repodata.json file

    Returns
    -------
    PackageInfoCollection
        Packages in the channel, grouped by package name
    """
    with open(repodata_filepath, "r") as file:
        repodata = json.load(file)

    return parse_repodata(repodata)


def remove_version_matching_regex(
    package_collection: PackageInfoCollection, remove_version_regex: str
) -> PackageInfoCollection:
    """Remove packages whose version matches a regex from the package collection

    Parameters
    ----------
    package_collection : PackageInfoCollection
        Collection of packages to filter
    remove_version_regex : str
        Regex applied to package versions; packages whose version matches it are removed

    Returns
    -------
    PackageInfoCollection
        Collection of packages with the packages matching the regex removed
    """
    filtered_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in package_collection.packages_info.items():
        # Filter packages: packages not matching the remove_version_regex are not added to the output collection
        filtered_packages = [pkg for pkg in packages_info if not re.match(remove_version_regex, pkg.version)]
        if filtered_packages:
            filtered_collection.packages_info[package_name] = filtered_packages

    return filtered_collection
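
# A small sketch of how the regex filter behaves (pattern and versions are hypothetical): with
# remove_version_regex = r"2\.0\.", any PackageInfo whose version starts with "2.0." is dropped, e.g.
#
#   filtered = remove_version_matching_regex(collection, r"2\.0\.")
#
# Note that re.match only anchors at the start of the string, so a version such as "1.2.0.1" would be kept
# by this pattern.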


def select_package_to_delete(
    channel_packages: PackageInfoCollection,
    current_time: datetime,
    delete_older_than_days: int,
    min_number_of_packages: int,
    keep_version_regex: str,
) -> PackageInfoCollection:
    """Select the packages to clean up from a package collection, based on rules inspired by GitLab's clean-up rules

    The selection rules work as follows:
    - for each package name, get the list of package versions/archives
    - exclude the `min_number_of_packages` most recent packages from the list
    - exclude all packages that were uploaded less than `delete_older_than_days` days ago
    - what remains in the list is selected for deletion

    Parameters
    ----------
    channel_packages : PackageInfoCollection
        Collection of packages to clean up.
    current_time : datetime
        The time at which the script is running, used to filter packages based on their upload timestamp
    delete_older_than_days : int
        Packages whose upload timestamp is less than `delete_older_than_days` days old are not considered for
        deletion
    min_number_of_packages : int
        The `min_number_of_packages` most recent versions of each package are never considered for deletion
    keep_version_regex : str
        Regex applied to package versions: if the package version matches it, the package will NOT be considered
        for deletion.

    Returns
    -------
    PackageInfoCollection
        Collection of packages that should be deleted to clean up the channel.
    """
    # remove packages matching the keep_version_regex
    filtered_by_regex_pkgs = remove_version_matching_regex(channel_packages, keep_version_regex)

    selected_for_deletion_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in filtered_by_regex_pkgs.packages_info.items():
        # sort the package list by decreasing upload time
        packages_sorted_per_timestamp_decreasing = sorted(
            packages_info, key=lambda package_info: package_info.upload_time, reverse=True
        )
        # skip the min_number_of_packages newest packages and iterate over the rest
        for candidate_for_clean_up in packages_sorted_per_timestamp_decreasing[min_number_of_packages:]:
            if current_time - candidate_for_clean_up.upload_time > timedelta(days=delete_older_than_days):
                selected_for_deletion_collection.packages_info[package_name].append(candidate_for_clean_up)

    return selected_for_deletion_collection
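
# Worked example of the selection rules (numbers are hypothetical): with min_number_of_packages=3 and
# delete_older_than_days=30, a package with five archives uploaded 1, 10, 40, 60 and 90 days ago keeps the
# three newest (1, 10 and 40 days old) unconditionally; of the remaining two (60 and 90 days old), both are
# older than 30 days, so both are selected for deletion.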


def clean_up_channel(packages_to_delete: PackageInfoCollection, delete_api_url: str, api_token: str, timeout_s: int):
    """Clean up the package channel by sending DELETE requests to the delete API of the channel

    Parameters
    ----------
    packages_to_delete : PackageInfoCollection
        Collection of packages to delete in the channel. The package filenames are appended to the
        `delete_api_url` to create the DELETE requests.
    delete_api_url : str
        API URL used to delete the packages. Eg: "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64"
    api_token : str
        Token to authenticate with the delete API.
    timeout_s : int
        Timeout in seconds for each delete request.
    """
    for package_info_list in packages_to_delete.packages_info.values():
        for package_info in package_info_list:
            delete_url = delete_api_url + "/{}".format(package_info.filename)

            delete_request = urllib.request.Request(
                url=delete_url, headers={"Authorization": "Bearer {}".format(api_token)}, method="DELETE"
            )
            _ = urllib.request.urlopen(delete_request, timeout=timeout_s)
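
# For illustration (the filename below is hypothetical, the URL is the docstring's example): with
# delete_api_url = "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64" and a package file named
# "mypkg-1.2.3-py311_0.conda", the request sent is
#   DELETE https://prefix.dev/api/v1/delete/phoenix-dev/linux-64/mypkg-1.2.3-py311_0.conda
# with an "Authorization: Bearer <token>" header.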


def get_argument_parser():
    parser = argparse.ArgumentParser(
        description="Program which deletes unused packages", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-t", "--token", help="Token used to log in to prefix.dev", required=True, type=str)
    parser.add_argument(
        "-c",
        "--channel_url",
        help="Base URL of the channel, for instance https://prefix.dev/phoenix-dev/. It is combined with the "
        "channel's subdirectories to get a working URL.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-a",
        "--api_delete_url",
        help="URL to submit DELETE requests for packages to clean up, for instance "
        "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64. It is combined with the channel's subdirectories"
        " to get a working URL for API calls.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-n",
        "--min_number_of_packages",
        help="Minimum number of package versions to keep for a package",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-d",
        "--min_age_days",
        help="Number of days during which packages are kept; only packages older than this are considered for "
        "deletion",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-b",
        "--keep_version_regex",
        help="Packages whose version does not match this regex will be considered for deletion.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-l", "--logfile", help="Path to where the script's log should be written", required=True, type=Path
    )
    parser.add_argument(
        "-v",
        "--loglevel",
        help="Log level as defined by the Python logging library",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        required=True,
    )
    return parser
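
# Example invocation (a sketch only: the module path assumes the package is installed, and all values are
# placeholders, not real credentials or URLs):
#
#   python -m phoenixpackagecleanup.clean_channel \
#       --token "$PREFIX_DEV_TOKEN" \
#       --channel_url https://prefix.dev/phoenix-dev \
#       --api_delete_url https://prefix.dev/api/v1/delete/phoenix-dev \
#       --min_number_of_packages 5 \
#       --min_age_days 30 \
#       --keep_version_regex "^1\." \
#       --logfile /tmp/phoenix_cleanup.log \
#       --loglevel INFO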


def main():
    parser = get_argument_parser()
    args = parser.parse_args()

    init_logging("PhoenixPackageCleanUp", args.logfile, args.loglevel)

    channel_subdirs = ["noarch", "linux-64", "linux-aarch64", "linux-ppc64le", "osx-64", "osx-arm64", "win-64"]
    failed_all_subdir = True

    for subdir in channel_subdirs:
        # Example URLs for the linux-64 subdir:
        # https://prefix.dev/phoenix-dev/linux-64/repodata.json
        # https://prefix.dev/api/v1/delete/phoenix-dev/linux-64
        with tempfile.TemporaryDirectory() as tmpdir:
            # try to read the subdir's repodata.json. If there are no packages in the subdir, the subdir doesn't
            # actually exist and we get an HTTP 404.
            repodata_filepath = Path(tmpdir) / (subdir + "_repodata.json")
            try:
                download_file(
                    args.channel_url + "/" + subdir + "/repodata.json",
                    repodata_filepath,
                    "phoenix-clean-up-schedule",
                    60,
                )
            except HTTPError:
                logging.warning("Could not clean {}/{} subdir, no packages?".format(args.channel_url, subdir))
                continue

            logging.info("Found repodata.json for {}/{} subdir, cleaning".format(args.channel_url, subdir))
            failed_all_subdir = False
            # we could read the subdir's repodata.json: proceed with the clean-up.
            channel_packages = load_repodata(repodata_filepath)

            # log the packages actually in the channel
            logging.info("Packages on the channel: {}".format(channel_packages))

            # select the packages to delete
            current_time = datetime.now(timezone.utc)
            packages_to_delete = select_package_to_delete(
                channel_packages,
                current_time,
                args.min_age_days,
                args.min_number_of_packages,
                args.keep_version_regex,
            )

            # log the packages selected for deletion
            logging.info("Packages to delete: {}".format(packages_to_delete))

            clean_up_channel(packages_to_delete, args.api_delete_url + "/" + subdir, args.token, 60)

    if failed_all_subdir:
        raise Exception(
            "Could not clean any sub-directory in channel {}. Is the URL correct?".format(args.channel_url)
        )


if __name__ == "__main__":
    main()