Coverage for src/phoenixpackagecleanup/clean_channel.py: 44%

80 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-09-05 18:23 +0000

1import argparse 

2import json 

3import logging 

4import re 

5import tempfile 

6import urllib.request 

7from collections import defaultdict 

8from datetime import datetime, timedelta, timezone 

9from pathlib import Path 

10from urllib.error import HTTPError 

11 

12from phoenixpackagecleanup.logging_utils import init_logging 

13from phoenixpackagecleanup.package import PackageInfoCollection, parse_repodata 

14 

15 

def download_file(url: str, filepath: Path, user_agent: str, timeout_s: int):
    """Download a file (similar to wget) using urllib

    The main reason we need this method is to download the repodata.json file. wget works out of the box, but
    when using urllib, we have to specify a User-agent otherwise the request is rejected (403: Forbidden)

    Parameters
    ----------
    url : str
        URL of the file to download
    filepath : Path
        Path to where the downloaded data will be written
    user_agent : str
        User-agent value to use in the request header
    timeout_s : int
        Timeout in seconds for the request
    """
    file_request = urllib.request.Request(url=url, headers={"User-agent": user_agent})
    # if we get an error from server, it will be raised by urlopen, no need to catch.
    # Use context managers so both the response and the output file are closed even if read/write fails.
    with urllib.request.urlopen(file_request, timeout=timeout_s) as response, open(filepath, "wb") as of:
        of.write(response.read())

38 

39 

def load_repodata(repodata_filepath: Path) -> PackageInfoCollection:
    """Load the repodata.json file

    Parameters
    ----------
    repodata_filepath : Path
        Path to the repodata.json file

    Returns
    -------
    PackageInfoCollection
        Packages in the channel, grouped by package name
    """
    # JSON is UTF-8 by specification; be explicit instead of relying on the platform default encoding
    with open(repodata_filepath, "r", encoding="utf-8") as file:
        repodata = json.load(file)

    return parse_repodata(repodata)

57 

58 

def remove_version_matching_regex(
    package_collection: PackageInfoCollection, remove_version_regex: str
) -> PackageInfoCollection:
    """Remove packages matching a regex from the package collection

    Parameters
    ----------
    package_collection : PackageInfoCollection
        Collection of packages to filter
    remove_version_regex : str
        Regex applied with `re.match` (i.e. anchored at the start) to each package's version string;
        versions that match are removed from the output

    Returns
    -------
    PackageInfoCollection
        Collection of packages with the packages matching the regex removed
    """
    filtered_collection = PackageInfoCollection(defaultdict(list))

    for package_name, packages_info in package_collection.packages_info.items():
        # Filter packages : packages matching the remove_version_regex are not added to the output collection
        filtered_packages = [pkg for pkg in packages_info if not re.match(remove_version_regex, pkg.version)]
        # only keep package names that still have at least one version left after filtering
        if filtered_packages:
            filtered_collection.packages_info[package_name] = filtered_packages

    return filtered_collection

83 

84 

def select_package_to_delete(
    channel_packages: PackageInfoCollection,
    current_time: datetime,
    delete_older_than_days: int,
    min_number_of_packages: int,
    keep_version_regex: str,
) -> PackageInfoCollection:
    """Selects the packages to clean-up from a package collection based on rules inspired by gitlab's clean-up rules

    The selection rules work as the following:
    - for each package name, get the list of package version/archives
    - exclude the `min_number_of_packages` most recent packages from the list
    - exclude all packages that have been uploaded more recently than `delete_older_than_days` days old.
    - what remains in the list is selected for deletion

    Parameters
    ----------
    channel_packages : PackageInfoCollection
        Collection of packages to clean-up.
    current_time : datetime
        The time at which the script is running, used to remove packages based on upload timestamp
    delete_older_than_days : int
        Packages which upload timestamp is less than `delete_older_than_days` days old are not considered for deletion
    min_number_of_packages : int
        Only if at least `min_number_of_packages` are available for a package will the packages be considered for
        deletion
    keep_version_regex : str
        regex applied to packages versions: if the package version match this, it will NOT be considered for
        deletion.

    Returns
    -------
    PackageInfoCollection
        Collection of packages that should be deleted to clean-up.
    """
    # versions matching keep_version_regex are protected: drop them from the candidates up front
    deletion_candidates = remove_version_matching_regex(channel_packages, keep_version_regex)

    minimum_age = timedelta(days=delete_older_than_days)
    selected = PackageInfoCollection(defaultdict(list))

    for name, infos in deletion_candidates.packages_info.items():
        # newest uploads first: the head of this list is what we always keep
        newest_first = sorted(infos, key=lambda info: info.upload_time, reverse=True)
        # skip the min_number_of_packages most recent entries, then apply the age rule to the rest
        for candidate in newest_first[min_number_of_packages:]:
            if current_time - candidate.upload_time > minimum_age:
                selected.packages_info[name].append(candidate)

    return selected

137 

138 

def clean_up_channel(packages_to_delete: PackageInfoCollection, delete_api_url: str, api_token: str, timeout_s: int):
    """Clean up the package channel by sending DELETE requests to the delete api of the channel

    Parameters
    ----------
    packages_to_delete : PackageInfoCollection
        Collections of package to delete in the channel. The packages filenames will be appended to the
        `delete_api_url` to create the DELETE request.
    delete_api_url : str
        API url to delete the packages. Eg: "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64/"
    api_token : str
        Token to identify with the delete API.
    timeout_s : int
        timeout in seconds for the delete request.
    """
    for package_info_list in packages_to_delete.packages_info.values():
        for package_info in package_info_list:
            delete_url = delete_api_url + "/{}".format(package_info.filename)

            delete_request = urllib.request.Request(
                url=delete_url, headers={"Authorization": "Bearer {}".format(api_token)}, method="DELETE"
            )
            # HTTP errors are raised by urlopen and propagate to the caller; the context manager
            # makes sure the response is closed instead of leaking the connection
            with urllib.request.urlopen(delete_request, timeout=timeout_s):
                pass

162 

163 

def get_argument_parser():
    """Build the command-line argument parser for the channel clean-up script

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing the token, channel/API URLs, retention rules and logging options (all required)
    """
    parser = argparse.ArgumentParser(
        description="Program which deletes unused packages", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-t", "--token", help="Token to be used to log in prefix.dev", required=True, type=str)
    parser.add_argument(
        "-c",
        "--channel_url",
        help="Base URL of the channel, for instance https://prefix.dev/phoenix-dev/. It is combined with the "
        "channel's subdirectories to get a working url.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-a",
        "--api_delete_url",
        help="URL to submit DELETE requests for packages to clean-up, for instance "
        "https://prefix.dev/api/v1/delete/phoenix-dev/linux-64. It is combined with the channel's subdirectories"
        " to get a working url for API calls.",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-n",
        "--min_number_of_packages",
        help="Minimum number of packages version to keep for a package",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-d",
        "--min_age_days",
        # previous help text was a copy-paste of the --token help; describe the retention age instead
        help="Number of days a package must have been uploaded for before it is considered for deletion",
        required=True,
        type=int,
    )
    parser.add_argument(
        "-b",
        "--keep_version_regex",
        help="Packages which are not matching this regex will be considered for deletion. ",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-l", "--logfile", help="Path to where the scripts log should be written", required=True, type=Path
    )
    parser.add_argument(
        "-v",
        "--loglevel",
        help="Log level as defined by the python logging library",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        required=True,
    )
    return parser

218 

219 

def main():
    """Entry point: parse the CLI arguments, then clean every sub-directory of the channel in turn."""
    args = get_argument_parser().parse_args()

    init_logging("PhoenixPackageCleanUp", args.logfile, args.loglevel)

    subdirs = ["noarch", "linux-64", "linux-aarch64", "linux-ppc64le", "osx-64", "osx-arm64", "win-64"]
    cleaned_any_subdir = False

    for subdir in subdirs:
        # Per-subdir URLs look like:
        #   https://prefix.dev/phoenix-dev/linux-64/repodata.json
        #   https://prefix.dev/api/v1/delete/phoenix-dev/linux-64
        with tempfile.TemporaryDirectory() as tmpdir:
            repodata_filepath = Path(tmpdir) / (subdir + "_repodata.json")
            # Try to fetch the subdir's repodata.json. A subdir holding no packages does not actually
            # exist on the server, so the request comes back as an HTTP 404.
            try:
                download_file(
                    args.channel_url + "/" + subdir + "/repodata.json",
                    repodata_filepath,
                    "phoenix-clean-up-schedule",
                    60,
                )
            except HTTPError:
                logging.warning("Could not clean {}/{} subdir, no packages?".format(args.channel_url, subdir))
                continue

            logging.info("Found repodata.json for {}/{} subdir, cleaning".format(args.channel_url, subdir))
            cleaned_any_subdir = True

            # repodata.json was readable: proceed with the clean-up of this subdir
            channel_packages = load_repodata(repodata_filepath)
            logging.info("Packages on the channel: {}".format(channel_packages))

            # apply the retention rules to pick what gets deleted
            packages_to_delete = select_package_to_delete(
                channel_packages,
                datetime.now(timezone.utc),
                args.min_age_days,
                args.min_number_of_packages,
                args.keep_version_regex,
            )
            logging.info("Packages to delete: {}".format(packages_to_delete))

            clean_up_channel(packages_to_delete, args.api_delete_url + "/" + subdir, args.token, 60)

    if not cleaned_any_subdir:
        raise Exception(
            "Could not clean any sub-directory in channel {}. Is the URL correct?".format(args.channel_url)
        )

275 

276 

# Script entry point: run the channel clean-up when executed directly (not when imported)
if __name__ == "__main__":
    main()