Coverage for moptipy/evaluation/tabulate_result_tests.py: 86%

176 statements  

coverage.py v7.12.0, created at 2025-11-24 08:49 +0000

r"""
Provides :func:`tabulate_result_tests` creating statistical comparison tables.

The function :func:`tabulate_result_tests` can compare two or more algorithms
on multiple problem instances by using the Mann-Whitney U test [1-3] with the
Bonferroni correction [4].

1. Daniel F. Bauer. Constructing Confidence Sets Using Rank Statistics.
   In *Journal of the American Statistical Association.* 67(339):687-690.
   September 1972. doi: https://doi.org/10.1080/01621459.1972.10481279.
2. Sidney Siegel and N. John Castellan Jr. *Nonparametric Statistics for The
   Behavioral Sciences.* 1988. In the Humanities/Social Sciences/Languages
   series. New York, NY, USA: McGraw-Hill. ISBN: 0-07-057357-3.
3. Myles Hollander and Douglas Alan Wolfe. *Nonparametric Statistical
   Methods.* 1973. New York, NY, USA: John Wiley and Sons Ltd.
   ISBN: 047140635X.
4. Olive Jean Dunn. Multiple Comparisons Among Means. In *Journal of the
   American Statistical Association.* 56(293):52-64. March 1961.
   doi: https://doi.org/10.1080/01621459.1961.10482090.
"""
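
# Usage sketch (an illustration with hypothetical names, not code from this
# module): given `my_end_results`, an already collected list of `EndResult`
# records, a test table in the default Markdown format could be produced via
#
#     from moptipy.evaluation.tabulate_result_tests import (
#         tabulate_result_tests)
#     path = tabulate_result_tests(
#         my_end_results, file_name="tests", dir_name="report")
#
# which writes the table to a file and returns the path to that file.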

21 

22from math import isfinite 

23from statistics import mean, median 

24from typing import Any, Callable, Final, Iterable, cast 

25 

26from pycommons.io.path import Path 

27from pycommons.types import type_error 

28from scipy.stats import mannwhitneyu # type: ignore 

29 

30from moptipy.api.logging import KEY_BEST_F 

31from moptipy.evaluation.end_results import EndResult 

32from moptipy.evaluation.end_results import getter as end_result_getter 

33from moptipy.utils.formatted_string import FormattedStr 

34from moptipy.utils.markdown import Markdown 

35from moptipy.utils.number_renderer import ( 

36 DEFAULT_NUMBER_RENDERER, 

37 NumberRenderer, 

38) 

39from moptipy.utils.table import Table 

40from moptipy.utils.text_format import TextFormatDriver 

41 

42#: the string constant for alpha 

43__ALPHA: Final[FormattedStr] = FormattedStr.special("\u03b1") 

44#: the string constant for dash or unclear 

45__DASH: Final[FormattedStr] = FormattedStr.special("\u2014") 

46#: the character for less 

47__LESS: Final[FormattedStr] = FormattedStr("<", code=True) 

48#: the character for equal 

49__UNCLEAR: Final[FormattedStr] = FormattedStr("?", code=True) 

50#: the character for greater 

51__GREATER: Final[FormattedStr] = FormattedStr(">", code=True) 

52 

53#: the name of the win-equal-loss column 

54__WEL: Final[tuple[FormattedStr, str, FormattedStr, str, FormattedStr]] \ 

55 = __LESS, " / ", __UNCLEAR, " / ", __GREATER 
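# __WEL is used as the first cell of the bottom summary row: it renders as
# "< / ? / >", and the remaining cells of that row hold the counts of
# significant wins, undecided cases, and significant losses per algorithm pair.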


def __compare(a: list[int | float],
              b: list[int | float]) -> int:
    """
    Compare two lists of numbers.

    `-1` is returned if `a` has both a smaller mean and median value than `b`.
    `1` is returned if `a` has both a larger mean and median value than `b`.
    `0` is returned otherwise.

    :param a: the first list of numbers
    :param b: the second list of numbers
    :returns: `-1` if the numbers in `a` tend to be smaller than those in `b`,
        `1` if the numbers in `a` tend to be larger than those in `b`,
        `0` if undecided
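
    For example, `__compare([1, 2, 3], [4, 5, 6])` returns `-1`, since both
    the mean (2 vs. 5) and the median (2 vs. 5) of the first list are smaller
    than those of the second list.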

    """
    smaller: bool
    ma = mean(a)
    mb = mean(b)
    if ma < mb:
        smaller = True
    elif ma > mb:
        smaller = False
    else:
        return 0
    ma = median(a)
    mb = median(b)
    if ma < mb:
        return -1 if smaller else 0
    if ma > mb:
        return 0 if smaller else 1
    return 0


def __compare_to_str(cmp: int, p_str: FormattedStr | None,
                     p: float, alpha_prime: float) -> str | list[str]:
    """
    Transform a comparison result and a string to an output.

    :param cmp: the comparison result
    :param p_str: the rendered `p` value string
    :param p: the actual `p` value
    :param alpha_prime: the Bonferroni-corrected significance level `alpha'`
    :returns: the output
    """
    if cmp == 0:
        return __DASH
    if p >= alpha_prime:
        return [p_str, " ", __UNCLEAR]
    if cmp < 0:
        return [p_str, " ", __LESS]
    if cmp > 0:
        return [p_str, " ", __GREATER]
    raise ValueError(
        f"huh? p={p}, p_str={p_str}, cmp={cmp}, alpha'={alpha_prime}")


def tabulate_result_tests(
        end_results: Iterable[EndResult],
        file_name: str = "tests",
        dir_name: str = ".",
        alpha: float = 0.02,
        text_format_driver: TextFormatDriver | Callable[[], TextFormatDriver]
        = Markdown.instance,
        algorithm_sort_key: Callable[[str], Any] = lambda a: a,
        instance_sort_key: Callable[[str], Any] = lambda i: i,
        instance_namer: Callable[[str], str] = lambda x: x,
        algorithm_namer: Callable[[str], str] = lambda x: x,
        use_lang: bool = False,
        p_renderer: NumberRenderer = DEFAULT_NUMBER_RENDERER,
        value_getter: Callable[[EndResult], int | float]
        = end_result_getter(KEY_BEST_F)) -> Path:
    r"""
    Tabulate the results of statistical comparisons of end result qualities.

    `end_results` contains a sequence of
    :class:`~moptipy.evaluation.end_results.EndResult` records, each of which
    represents the result of one run of one algorithm on one instance. This
    function performs a two-tailed Mann-Whitney U test for each algorithm pair
    on each problem instance to see if the performances are statistically
    significantly different. The results of these tests are tabulated,
    together with their `p`-values, i.e., the probabilities that the observed
    differences would occur if the two algorithms performed the same.

    If `p` is sufficiently small, it is unlikely that the observed difference
    in performance between the two compared algorithms stems from randomness.
    But what does "sufficiently small" mean? As a parameter, this function
    accepts a significance threshold `0<alpha<0.5`. `alpha` is, so to speak,
    the upper limit of the "probability to be wrong" that we are willing to
    accept when we claim something like "algorithm A is better than
    algorithm B". In other words, if the table says that algorithm A is
    better than algorithm B, the chance that this claim is wrong is not more
    than `alpha`.

    However, if we do many such tests, our chance to make at least one mistake
    grows. If we do `n_tests` tests, then the chance that all of them are
    "right" would be `(1-alpha)^n_tests`, i.e., the chance that at least one
    of them is "wrong" would be `1-[(1-alpha)^n_tests]`. Since we are going to
    do multiple tests, the Bonferroni correction is therefore applied and
    `alpha'=alpha/n_tests` is computed. Then, the chance to have at least one
    wrong result among the `n_tests` tests is not higher than `alpha`. For
    example, with `alpha=0.02`, two algorithms (i.e., one algorithm pair), and
    four instances, we get `n_tests=4` and `alpha'=0.005`.

    The test results are presented as follows:
    The first column of the generated table denotes the problem instances.
    Each of the other columns represents a pair of algorithms. In each cell,
    the pair is compared based on the results on the instance of the row. The
    cell then holds the `p`-value of the two-tailed Mann-Whitney U test. If
    the first algorithm is significantly better (at `p<alpha'`) than the
    second algorithm, then the cell is marked with `<`. If the first algorithm
    is significantly worse (at `p<alpha'`) than the second algorithm, then the
    cell is marked with `>`. If the observed differences are not significant
    (`p>=alpha'`), then the cell is marked with `?`.

    However, there could also be a situation where a statistical comparison
    makes no sense, as no difference could reliably be detected anyway. For
    example, if one algorithm has a smaller median result but a larger mean
    result, or if the medians are the same, or if the means are the same.
    Regardless of what outcome a test would have, we could not really claim
    that either of the algorithms was better or worse. In such cases, no test
    is performed and `-` is printed instead (signified by `&mdash;` in the
    markdown format).

    Finally, the bottom row sums up the numbers of `<`, `?`, and `>` outcomes
    for each algorithm pair.

    Depending on the parameter `text_format_driver`, the tables can be
    rendered in different formats, such as
    :py:class:`~moptipy.utils.markdown.Markdown`,
    :py:class:`~moptipy.utils.latex.LaTeX`, and
    :py:class:`~moptipy.utils.html.HTML`.

    :param end_results: the end results data
    :param file_name: the base file name
    :param dir_name: the base directory
    :param alpha: the threshold at which the two-tailed test result is
        accepted.
    :param text_format_driver: the text format driver
    :param algorithm_sort_key: a function returning sort keys for algorithms
    :param instance_sort_key: a function returning sort keys for instances
    :param instance_namer: the name function for instances: it receives an
        instance ID and returns an instance name; default=identity function
    :param algorithm_namer: the name function for algorithms: it receives an
        algorithm ID and returns an algorithm name; default=identity function
    :param use_lang: should we use the language to define the filename
    :param p_renderer: the renderer for all probabilities
    :param value_getter: the getter for the values that should be compared. By
        default, the best obtained objective values are compared. However, if
        you let the runs continue until they reach a certain goal quality,
        then you may want to compare the runtimes consumed until that quality
        is reached. Basically, you can use any of the getters provided by
        :meth:`moptipy.evaluation.end_results.getter`, but you must
        take care that the comparison makes sense, i.e., compare qualities
        under fixed-budget scenarios (the default behavior) or compare
        runtimes under scenarios with goal qualities - but do not mix up the
        scenarios.
    :returns: the path to the file with the tabulated test results
    """
    # Before doing anything, let's do some type checking on the parameters.
    # I want to ensure that this function is called correctly before we begin
    # to actually process the data. It is better to fail early than to deliver
    # some incorrect results.
    if not isinstance(end_results, Iterable):
        raise type_error(end_results, "end_results", Iterable)
    if not isinstance(file_name, str):
        raise type_error(file_name, "file_name", str)
    if not isinstance(dir_name, str):
        raise type_error(dir_name, "dir_name", str)
    if not isinstance(alpha, float):
        raise type_error(alpha, "alpha", float)
    if not (0.0 < alpha < 0.5):
        raise ValueError(f"invalid alpha={alpha}, must be 0<alpha<0.5.")
    if callable(text_format_driver):
        text_format_driver = text_format_driver()
    if not isinstance(text_format_driver, TextFormatDriver):
        raise type_error(text_format_driver, "text_format_driver",
                         TextFormatDriver, True)
    if not callable(algorithm_namer):
        raise type_error(algorithm_namer, "algorithm_namer", call=True)
    if not callable(instance_namer):
        raise type_error(instance_namer, "instance_namer", call=True)
    if not isinstance(use_lang, bool):
        raise type_error(use_lang, "use_lang", bool)

    if not isinstance(p_renderer, NumberRenderer):
        raise type_error(p_renderer, "p_renderer", NumberRenderer)
    if not callable(value_getter):
        raise type_error(value_getter, "value_getter", call=True)

    # now gather the data: algorithm -> instance -> values (bestF by default)
    data_dict: dict[str, dict[str, list[int | float]]] = {}
    inst_set: set[str] = set()
    for i, end_result in enumerate(end_results):
        if not isinstance(end_result, EndResult):
            raise type_error(end_result, f"end_results[{i}]", EndResult)
        value = value_getter(end_result)
        if not isinstance(value, int | float):
            raise type_error(
                value, f"value_getter({end_result})", (int, float))
        if not isfinite(value):
            raise ValueError(
                f"value_getter({end_result})={value}, but must be finite.")
        inst_set.add(end_result.instance)
        if end_result.algorithm in data_dict:
            inst_res = data_dict[end_result.algorithm]
        else:
            data_dict[end_result.algorithm] = inst_res = {}
        if end_result.instance in inst_res:
            inst_res[end_result.instance].append(value)
        else:
            inst_res[end_result.instance] = [value]

    # finished collecting data
    n_algos: Final[int] = len(data_dict)
    if n_algos <= 0:
        raise ValueError("end results are empty!")
    if not (1 < n_algos < 5):
        raise ValueError(f"invalid number {n_algos} of "
                         "algorithms, must be 1<n_algos<5.")
    instances: Final[list] = sorted(inst_set, key=instance_sort_key)
    n_insts: Final[int] = len(instances)
    algorithms: Final[list] = sorted(data_dict.keys(), key=algorithm_sort_key)

    # validate data
    for algo in algorithms:
        if len(data_dict[algo]) != n_insts:
            raise ValueError(
                f"algorithm {algo!r} has only data for {len(data_dict[algo])}"
                f" instances, but should have for {n_insts}.")

    # compute the Bonferroni correction
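    # `n_cols` is the number of distinct algorithm pairs. One two-tailed test
    # is performed per pair and per instance, so `n_pairs = n_insts * n_cols`
    # tests are carried out in total and `alpha` is divided by this number.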

    n_cols: Final[int] = ((n_algos - 1) * n_algos) // 2
    n_pairs: Final[int] = n_insts * n_cols
    alpha_prime: Final[float] = alpha / n_pairs
    if not (0.0 < alpha_prime < 0.5):
        raise ValueError(f"invalid alpha'={alpha_prime}, must be 0<alpha'"
                         f"<0.5, stems from alpha={alpha} and N={n_pairs}")

    # compute the comparison results
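    # Each inner list belongs to one algorithm pair (i, j) with i < j, in the
    # same order in which the table columns are built below; a value of 0
    # means that no consistent direction was found (see `__compare`).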

    comparisons: Final[list[list[int]]] = \
        [[__compare(data_dict[algorithms[i]][inst],
                    data_dict[algorithms[j]][inst])
          for inst in instances]
         for i in range(n_algos - 1)
         for j in range(i + 1, n_algos)]

    # compute the p_values
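    # One list per algorithm pair (same ordering as `comparisons`). Cells for
    # which the pre-check above yielded 0 are not tested at all and store
    # `None`, since no `p`-value will be printed for them.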

    p_values: Final[list[list[float]]] = \
        [[float(mannwhitneyu(data_dict[algorithms[ij[0]]][inst],
                             data_dict[algorithms[ij[1]]][inst]).pvalue)
          if comparisons[z][k] != 0 else None
          for k, inst in enumerate(instances)]
         for z, ij in enumerate([(ii, jj) for ii in range(n_algos - 1)
                                 for jj in range(ii + 1, n_algos)])]

    # now format all p values to strings
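    # `alpha` and `alpha'` are appended so that they get rendered in the same
    # style as the `p`-values; for each of them, the shorter of this rendering
    # and a plain number rendering is kept, and both are then removed from the
    # string list again before it is split into per-column chunks.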

    p_flat: Final[list[float]] = [y for x in p_values for y in x]
    p_flat.append(alpha)
    p_flat.append(alpha_prime)
    p_flat_strs: Final[list[FormattedStr | None]] = \
        p_renderer.render(p_flat)
    alpha_str: FormattedStr = cast("FormattedStr", p_flat_strs[-2])
    a2 = FormattedStr.number(alpha)
    if len(a2) <= len(alpha_str):
        alpha_str = a2
    alpha_prime_str: FormattedStr = cast("FormattedStr", p_flat_strs[-1])
    a2 = FormattedStr.number(alpha_prime)
    if len(a2) <= len(alpha_prime_str):
        alpha_prime_str = a2
    del p_flat_strs[-1]
    del p_flat_strs[-1]
    p_strs: Final[list[list[FormattedStr | None]]] = \
        [p_flat_strs[(i * n_insts):((i + 1) * n_insts)] for i in range(n_cols)]

    summary_row: list[Iterable[str] | str] | None = None
    if n_insts > 0:
        summary_row = [__WEL]
        for i, col in enumerate(comparisons):
            wins: int = 0
            equals: int = 0
            losses: int = 0
            pv: list[float] = p_values[i]
            for j, cv in enumerate(col):
                if (cv == 0) or (pv[j] >= alpha_prime):
                    equals += 1
                elif cv < 0:
                    wins += 1
                else:
                    losses += 1
            summary_row.append(f"{wins}/{equals}/{losses}")

    #: the columns
    cols: Final[list[list[str | list[str]]]] = \
        [[__compare_to_str(comparisons[i][j], p_strs[i][j], p_values[i][j],
                           alpha_prime)
          for j in range(n_insts)]
         for i in range(n_cols)]
    cols.insert(0, [FormattedStr.add_format(instance_namer(inst), code=True)
                    for inst in instances])

    # We now have the p_values, the comparison results, and the p_strs.

    # We can now render the header lines. If there is only one pair of
    # algorithms, or if the output format is Markdown (which supports only a
    # single header row), we render one header line with all the information.
    # Otherwise, we render two header lines.
    head_lines: list[list[str | list[str]]]
    multi_header: Final[bool] = \
        (n_algos > 2) and (not isinstance(text_format_driver, Markdown))
    if multi_header:
        head_lines = [[], []]

        def __app(a, b) -> None:
            head_lines[0].append(a)
            head_lines[1].append(b)
    else:
        head_lines = [[]]

        def __app(a, b) -> None:
            x = []
            if isinstance(a, str):
                x.append(a)
            else:
                x.extend(a)
            x.append(" ")
            if isinstance(b, str):
                x.append(b)
            else:
                x.extend(b)
            head_lines[0].append(x)

    __app("Mann-Whitney U", [__ALPHA, "=", alpha_str, ", ", __ALPHA,
                             "'=", alpha_prime_str])
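    # One header cell is added per algorithm pair; "vs." is attached to the
    # shorter of the two algorithm names so that the two header rows stay
    # reasonably balanced in width when a two-row header is used.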

    for i in range(n_algos - 1):
        for j in range(i + 1, n_algos):
            an1 = FormattedStr.add_format(algorithm_namer(algorithms[i]),
                                          code=True)
            an2 = FormattedStr.add_format(algorithm_namer(algorithms[j]),
                                          code=True)
            if len(an1) >= len(an2):
                __app(an1, ["vs. ", an2])
            else:
                __app([an1, " vs."], an2)

    # write the table
    dest: Final[Path] = text_format_driver.filename(
        file_name, dir_name, use_lang)
    with dest.open_for_write() as wd, Table(
            wd, ("l" if multi_header else "r") + ("c" * n_cols),
            text_format_driver) as table:
        with table.header() as header:
            for hl in head_lines:
                with header.row() as row:
                    for cell in hl:
                        row.cell(cell)
        with table.section() as section:
            section.cols(cols)
        if summary_row is not None:
            with table.section() as section, section.row() as row:
                for s in summary_row:
                    row.cell(s)

    return dest