Coverage for src/phoenixpackagecleanup/clean_channel.py: 42%

84 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2025-12-04 17:19 +0000

1import argparse 

2import json 

3import logging 

4import re 

5import tempfile 

6import urllib.request 

7from collections import defaultdict 

8from datetime import datetime, timedelta, timezone 

9from pathlib import Path 

10from urllib.error import HTTPError 

11 

12from phoenixpackagecleanup.logging_utils import init_logging 

13from phoenixpackagecleanup.package import PackageInfoCollection, parse_repodata 

14 

15 

def download_file(url: str, filepath: Path, user_agent: str, timeout_s: int):
    """Download a file (similar to wget) using urllib

    The main reason we need this method is to download the repodata.json file. wget works out of the box, but
    when using urllib, we have to specify a User-agent otherwise the request is rejected (403: Forbidden)

    Parameters
    ----------
    url : str
        URL of the file to download
    filepath : Path
        Path to where the downloaded data will be written
    user_agent : str
        User-agent value to use in the request header
    timeout_s : int
        Timeout in second for the request
    """
    file_request = urllib.request.Request(url=url, headers={"User-agent": user_agent})
    # if we get an error from server, it will be raised by urlopen, no need to catch.
    # Use context managers so the HTTP response (and its connection) is always closed,
    # even if writing the output file fails.
    with urllib.request.urlopen(file_request, timeout=timeout_s) as response:
        with open(filepath, "wb") as of:
            of.write(response.read())

38 

39 

def load_repodata(repodata_filepath: Path) -> PackageInfoCollection:
    """Load the repodata.json file

    Parameters
    ----------
    repodata_filepath : Path
        Path to the repodata.json file

    Returns
    -------
    PackageInfoCollection
        Packages in the channel, grouped by package name
    """
    # repodata.json is JSON and therefore UTF-8 encoded; be explicit rather than
    # relying on the platform default encoding (which differs e.g. on Windows).
    with open(repodata_filepath, "r", encoding="utf-8") as file:
        repodata = json.load(file)

    return parse_repodata(repodata)

57 

58 

def remove_version_matching_regex(
    package_collection: PackageInfoCollection, remove_version_regex: str
) -> PackageInfoCollection:
    """Remove packages matching a regex from the package collection

    Parameters
    ----------
    package_collection : PackageInfoCollection
        Collection of packages to filter
    remove_version_regex : str
        Regex matched against each package version (with re.match, i.e. anchored at the start of the
        version string); packages whose version matches are removed

    Returns
    -------
    PackageInfoCollection
        Collection of packages with the packages matching the regex removed
    """
    # Compile the pattern once instead of re-matching the raw string for every package version
    version_pattern = re.compile(remove_version_regex)
    filtered_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in package_collection.packages_info.items():
        # Filter packages : packages not matching the remove_version_regex are not added to the output collection
        filtered_packages = [pkg for pkg in packages_info if not version_pattern.match(pkg.version)]
        # Only keep the package name if at least one version survived the filter
        if filtered_packages:
            filtered_collection.packages_info[package_name] = filtered_packages

    return filtered_collection

83 

84 

def select_package_to_delete(
    channel_packages: PackageInfoCollection,
    current_time: datetime,
    delete_older_than_days: int,
    min_number_of_packages: int,
    keep_version_regex: str,
) -> PackageInfoCollection:
    """Select which packages to clean up, using rules inspired by gitlab's clean-up rules

    The selection works as follows, independently for each package name:
    - take the list of package versions/archives for that name
    - keep (never delete) the `min_number_of_packages` most recently uploaded ones
    - keep (never delete) anything uploaded within the last `delete_older_than_days` days
    - everything remaining is selected for deletion

    Parameters
    ----------
    channel_packages : PackageInfoCollection
        Collection of packages to clean-up.
    current_time : datetime
        The time at which the script is running, used to remove packages based on upload timestamp
    delete_older_than_days : int
        Packages which upload timestamp is less than `delete_older_than_days` days old are not considered for deletion
    min_number_of_packages : int
        Only if at least `min_number_of_packages` are available for a package will the packages be considered for
        deletion
    keep_version_regex : str
        regex applied to packages versions: if the package version match this, it will NOT be considered for
        deletion.

    Returns
    -------
    PackageInfoCollection
        Collection of packages that should be deleted to clean-up.
    """
    # Versions matching the keep regex are excluded up-front and never deleted
    eligible_packages = remove_version_matching_regex(channel_packages, keep_version_regex)

    minimum_age = timedelta(days=delete_older_than_days)
    selected_for_deletion_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in eligible_packages.packages_info.items():
        # Newest uploads first, so slicing off the head preserves the most recent packages
        newest_first = sorted(packages_info, key=lambda info: info.upload_time, reverse=True)
        stale_packages = [
            candidate
            for candidate in newest_first[min_number_of_packages:]
            if current_time - candidate.upload_time > minimum_age
        ]
        # Avoid creating empty entries in the defaultdict when nothing is stale
        if stale_packages:
            selected_for_deletion_collection.packages_info[package_name].extend(stale_packages)

    return selected_for_deletion_collection

137 

138 

def clean_up_channel(packages_to_delete: PackageInfoCollection, delete_api_url: str, api_token: str, timeout_s: int):
    """Clean up the package channel by sending DELETE requests to the delete api of the channel

    Raises on the first failed DELETE request (after logging a warning with the failing URL).

    Parameters
    ----------
    packages_to_delete : PackageInfoCollection
        Collections of package to delete in the channel. The packages filenames will be appended to the
        `delete_api_url` to create the DELETE request.
    delete_api_url : str
        API url to delete the packages. Eg: "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64/"
    api_token : str
        Token to identify with the delete API.
    timeout_s : int
        timeout in seconds for the delete request.
    """
    for package_info_list in packages_to_delete.packages_info.values():
        for package_info in package_info_list:
            delete_url = delete_api_url + "/{}".format(package_info.filename)

            delete_request = urllib.request.Request(
                url=delete_url, headers={"Authorization": "Bearer {}".format(api_token)}, method="DELETE"
            )
            try:
                # Context manager closes the response even though we ignore its body,
                # so connections are not leaked across the (possibly long) delete loop.
                with urllib.request.urlopen(delete_request, timeout=timeout_s):
                    pass
            except HTTPError:
                logging.warning("Failed delete request with delete_url: {}".format(delete_url))
                raise

166 

167 

def get_argument_parser():
    """Build the command-line argument parser for the channel clean-up script.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing the credentials, channel URLs, retention rules and logging options.
        All options are required.
    """
    parser = argparse.ArgumentParser(
        description="Program which deletes unused packages", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-t", "--token", help="Token to be used to log in prefix.dev", required=True, type=str)
    parser.add_argument(
        "-c",
        "--channel_url",
        help="Base URL of the channel, for instance https://prefix.dev/phoenix-dev/. It is combined with the "
        "channel's subdirectories to get a working url.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-a",
        "--api_delete_url",
        help="URL to submit DELETE requests for packages to clean-up, for instance "
        "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64. It is combined with the channel's subdirectories"
        " to get a working url for API calls.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-n",
        "--min_number_of_packages",
        help="Minimum number of packages version to keep for a package",
        required=True,
        type=int,
    )
    # Help text fixed: it previously was a copy-paste of the --token help ("Token to be used to...")
    parser.add_argument(
        "-d",
        "--min_age_days",
        help="Minimum age in days: packages uploaded less than this many days ago are never deleted",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-b",
        "--keep_version_regex",
        help="Packages which are not matching this regex will be considered for deletion. ",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-l", "--logfile", help="Path to where the scripts log should be written", required=True, type=Path
    )
    parser.add_argument(
        "-v",
        "--loglevel",
        help="Log level as defined by the python logging library",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        required=True,
    )
    return parser

222 

223 

def main():
    """Entry point: for each channel sub-directory, fetch repodata.json, pick stale packages and delete them."""
    args = get_argument_parser().parse_args()

    init_logging("PhoenixPackageCleanUp", args.logfile, args.loglevel)

    channel_subdirs = ["noarch", "linux-64", "linux-aarch64", "linux-ppc64le", "osx-64", "osx-arm64", "win-64"]
    cleaned_any_subdir = False

    for subdir in channel_subdirs:
        # Example URLs this loop works with:
        #   https://prefix.dev/phoenix-dev/linux-64/repodata.json
        #   https://prefix.dev/api/v1/delete/phoenix-dev/linux-64
        with tempfile.TemporaryDirectory() as tmpdir:
            repodata_filepath = Path(tmpdir) / (subdir + "_repodata.json")
            # A subdir with no packages does not actually exist on the server and yields an HTTP 404:
            # skip it and move on to the next one.
            try:
                download_file(
                    args.channel_url + "/" + subdir + "/repodata.json",
                    repodata_filepath,
                    "phoenix-clean-up-schedule",
                    60,
                )
            except HTTPError:
                logging.warning("Could not clean {}/{} subdir, no packages?".format(args.channel_url, subdir))
                continue

            logging.info("Found repodata.json for {}/{} subdir, cleaning".format(args.channel_url, subdir))
            cleaned_any_subdir = True

            # The repodata.json was readable: proceed with the clean-up of this subdir.
            channel_packages = load_repodata(repodata_filepath)
            logging.info("Packages on the channel: {}".format(channel_packages))

            packages_to_delete = select_package_to_delete(
                channel_packages,
                datetime.now(timezone.utc),
                args.min_age_days,
                args.min_number_of_packages,
                args.keep_version_regex,
            )
            logging.info("Packages to delete: {}".format(packages_to_delete))

            # The delete request fails when the URL contains a double slash, so only add the
            # separator when the configured API URL does not already end with one.
            delete_base = args.api_delete_url if args.api_delete_url.endswith("/") else args.api_delete_url + "/"
            clean_up_channel(packages_to_delete, delete_base + subdir, args.token, 60)

    if not cleaned_any_subdir:
        raise Exception(
            "Could not clean any sub-directory in channel {}. Is the URL correct?".format(args.channel_url)
        )

284 

285 

# Script entry point: run the clean-up only when executed directly, not on import.
if __name__ == "__main__":
    main()