Coverage for moptipy/evaluation/tabulate_result_tests.py: 86%

176 statements  

coverage.py v7.12.0, created at 2025-11-24 08:49 +0000

r"""
Provides :func:`tabulate_result_tests` creating statistical comparison tables.

The function :func:`tabulate_result_tests` can compare two or more algorithms
on multiple problem instances by using the Mann-Whitney U test [1-3] with the
Bonferroni correction [4].

1. Daniel F. Bauer. Constructing Confidence Sets Using Rank Statistics.
   In *Journal of the American Statistical Association.* 67(339):687-690.
   September 1972. doi: https://doi.org/10.1080/01621459.1972.10481279.
2. Sidney Siegel and N. John Castellan Jr. *Nonparametric Statistics for The
   Behavioral Sciences.* 1988. In the Humanities/Social Sciences/Languages
   series. New York, NY, USA: McGraw-Hill. ISBN: 0-07-057357-3.
3. Myles Hollander and Douglas Alan Wolfe. *Nonparametric Statistical
   Methods.* 1973. New York, NY, USA: John Wiley and Sons Ltd.
   ISBN: 047140635X.
4. Olive Jean Dunn. Multiple Comparisons Among Means. In *Journal of the
   American Statistical Association.* 56(293):52-64. March 1961.
   doi: https://doi.org/10.1080/01621459.1961.10482090.
"""
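
# Usage sketch (an illustration with hypothetical names, not code from this
# module): given `my_end_results`, an already collected list of `EndResult`
# records, a test table in the default Markdown format could be produced via
#
#     from moptipy.evaluation.tabulate_result_tests import (
#         tabulate_result_tests)
#     path = tabulate_result_tests(
#         my_end_results, file_name="tests", dir_name="report")
#
# which writes the table to a file and returns the path to that file.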

21 

22from math import isfinite 

23from statistics import mean, median 

24from typing import Any, Callable, Final, Iterable, cast 

25 

26from pycommons.io.path import Path 

27from pycommons.types import type_error 

28from scipy.stats import mannwhitneyu # type: ignore 

29 

30from moptipy.api.logging import KEY_BEST_F 

31from moptipy.evaluation.end_results import EndResult 

32from moptipy.evaluation.end_results import getter as end_result_getter 

33from moptipy.utils.formatted_string import FormattedStr 

34from moptipy.utils.markdown import Markdown 

35from moptipy.utils.number_renderer import ( 

36 DEFAULT_NUMBER_RENDERER, 

37 NumberRenderer, 

38) 

39from moptipy.utils.table import Table 

40from moptipy.utils.text_format import TextFormatDriver 

41 

42#: the string constant for alpha 

43__ALPHA: Final[FormattedStr] = FormattedStr.special("\u03b1") 

44#: the string constant for dash or unclear 

45__DASH: Final[FormattedStr] = FormattedStr.special("\u2014") 

46#: the character for less 

47__LESS: Final[FormattedStr] = FormattedStr("<", code=True) 

48#: the character for equal 

49__UNCLEAR: Final[FormattedStr] = FormattedStr("?", code=True) 

50#: the character for greater 

51__GREATER: Final[FormattedStr] = FormattedStr(">", code=True) 

52 

53#: the name of the win-equal-loss column 

54__WEL: Final[tuple[FormattedStr, str, FormattedStr, str, FormattedStr]] \ 

55 = __LESS, " / ", __UNCLEAR, " / ", __GREATER 
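# __WEL is used as the first cell of the bottom summary row: it renders as
# "< / ? / >", and the remaining cells of that row hold the counts of
# significant wins, undecided cases, and significant losses per algorithm pair.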


def __compare(a: list[int | float],
              b: list[int | float]) -> int:
    """
    Compare two lists of numbers.

    `-1` is returned if `a` has both a smaller mean and median value than `b`.
    `1` is returned if `a` has both a larger mean and median value than `b`.
    `0` is returned otherwise.

    :param a: the first list of numbers
    :param b: the second list of numbers
    :returns: `-1` if the numbers in `a` tend to be smaller than those in `b`,
        `1` if the numbers in `a` tend to be larger than those in `b`,
        `0` if undecided
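
    For example, `__compare([1, 2, 3], [4, 5, 6])` returns `-1`, since both
    the mean (2 vs. 5) and the median (2 vs. 5) of the first list are smaller
    than those of the second list.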

    """
    smaller: bool
    ma = mean(a)
    mb = mean(b)
    if ma < mb:
        smaller = True
    elif ma > mb:
        smaller = False
    else:
        return 0
    ma = median(a)
    mb = median(b)
    if ma < mb:
        return -1 if smaller else 0
    if ma > mb:
        return 0 if smaller else 1
    return 0


def __compare_to_str(cmp: int, p_str: FormattedStr | None,
                     p: float, alpha_prime: float) -> str | list[str]:
    """
    Transform a comparison result and a string to an output.

    :param cmp: the comparison result
    :param p_str: the rendered `p` value string
    :param p: the actual `p` value
    :param alpha_prime: the Bonferroni-corrected significance level `alpha'`
    :returns: the output
    """
    if cmp == 0:
        return __DASH
    if p >= alpha_prime:
        return [p_str, " ", __UNCLEAR]
    if cmp < 0:
        return [p_str, " ", __LESS]
    if cmp > 0:
        return [p_str, " ", __GREATER]
    raise ValueError(
        f"huh? p={p}, p_str={p_str}, cmp={cmp}, alpha'={alpha_prime}")


def tabulate_result_tests(
        end_results: Iterable[EndResult],
        file_name: str = "tests",
        dir_name: str = ".",
        alpha: float = 0.02,
        text_format_driver: TextFormatDriver | Callable[[], TextFormatDriver]
        = Markdown.instance,
        algorithm_sort_key: Callable[[str], Any] = lambda a: a,
        instance_sort_key: Callable[[str], Any] = lambda i: i,
        instance_namer: Callable[[str], str] = lambda x: x,
        algorithm_namer: Callable[[str], str] = lambda x: x,
        use_lang: bool = False,
        p_renderer: NumberRenderer = DEFAULT_NUMBER_RENDERER,
        value_getter: Callable[[EndResult], int | float]
        = end_result_getter(KEY_BEST_F)) -> Path:
    r"""
    Tabulate the results of statistical comparisons of end result qualities.

    `end_results` contains a sequence of
    :class:`~moptipy.evaluation.end_results.EndResult` records, each of which
    represents the result of one run of one algorithm on one instance. This
    function performs a two-tailed Mann-Whitney U test for each algorithm pair
    on each problem instance to see if the performances are statistically
    significantly different. The results of these tests are tabulated,
    together with their `p`-values, i.e., the probabilities that the observed
    differences would occur if the two algorithms performed the same.

    If `p` is sufficiently small, it is unlikely that the observed difference
    in performance between the two compared algorithms stems from randomness.
    But what does "sufficiently small" mean? As a parameter, this function
    accepts a significance threshold `0<alpha<0.5`. `alpha` is, so to speak,
    the upper limit of the "probability to be wrong" that we are willing to
    accept when we claim something like "algorithm A is better than
    algorithm B". In other words, if the table says that algorithm A is
    better than algorithm B, the chance that this claim is wrong is not more
    than `alpha`.

    However, if we do many such tests, our chance to make at least one mistake
    grows. If we do `n_tests` tests, then the chance that all of them are
    "right" would be `(1-alpha)^n_tests`, i.e., the chance that at least one
    of them is "wrong" would be `1-[(1-alpha)^n_tests]`. Since we are going to
    do multiple tests, the Bonferroni correction is therefore applied and
    `alpha'=alpha/n_tests` is computed. Then, the chance to have at least one
    wrong result among the `n_tests` tests is not higher than `alpha`. For
    example, with `alpha=0.02`, two algorithms (i.e., one algorithm pair), and
    four instances, we get `n_tests=4` and `alpha'=0.005`.

    The test results are presented as follows:
    The first column of the generated table denotes the problem instances.
    Each of the other columns represents a pair of algorithms. In each cell,
    the pair is compared based on the results on the instance of the row. The
    cell then holds the `p`-value of the two-tailed Mann-Whitney U test. If
    the first algorithm is significantly better (at `p<alpha'`) than the
    second algorithm, then the cell is marked with `<`. If the first algorithm
    is significantly worse (at `p<alpha'`) than the second algorithm, then the
    cell is marked with `>`. If the observed differences are not significant
    (`p>=alpha'`), then the cell is marked with `?`.

    However, there could also be a situation where a statistical comparison
    makes no sense, as no difference could reliably be detected anyway. For
    example, if one algorithm has a smaller median result but a larger mean
    result, or if the medians are the same, or if the means are the same.
    Regardless of what outcome a test would have, we could not really claim
    that either of the algorithms was better or worse. In such cases, no test
    is performed and `-` is printed instead (signified by `&mdash;` in the
    markdown format).

    Finally, the bottom row sums up the numbers of `<`, `?`, and `>` outcomes
    for each algorithm pair.

    Depending on the parameter `text_format_driver`, the tables can be
    rendered in different formats, such as
    :py:class:`~moptipy.utils.markdown.Markdown`,
    :py:class:`~moptipy.utils.latex.LaTeX`, and
    :py:class:`~moptipy.utils.html.HTML`.

    :param end_results: the end results data
    :param file_name: the base file name
    :param dir_name: the base directory
    :param alpha: the threshold at which the two-tailed test result is
        accepted.
    :param text_format_driver: the text format driver
    :param algorithm_sort_key: a function returning sort keys for algorithms
    :param instance_sort_key: a function returning sort keys for instances
    :param instance_namer: the name function for instances: it receives an
        instance ID and returns an instance name; default=identity function
    :param algorithm_namer: the name function for algorithms: it receives an
        algorithm ID and returns an algorithm name; default=identity function
    :param use_lang: should we use the language to define the filename
    :param p_renderer: the renderer for all probabilities
    :param value_getter: the getter for the values that should be compared. By
        default, the best obtained objective values are compared. However, if
        you let the runs continue until they reach a certain goal quality,
        then you may want to compare the runtimes consumed until that quality
        is reached. Basically, you can use any of the getters provided by
        :meth:`moptipy.evaluation.end_results.getter`, but you must
        take care that the comparison makes sense, i.e., compare qualities
        under fixed-budget scenarios (the default behavior) or compare
        runtimes under scenarios with goal qualities - but do not mix up the
        scenarios.
    :returns: the path to the file with the tabulated test results
    """
    # Before doing anything, let's do some type checking on the parameters.
    # I want to ensure that this function is called correctly before we begin
    # to actually process the data. It is better to fail early than to deliver
    # some incorrect results.
    if not isinstance(end_results, Iterable):
        raise type_error(end_results, "end_results", Iterable)
    if not isinstance(file_name, str):
        raise type_error(file_name, "file_name", str)
    if not isinstance(dir_name, str):
        raise type_error(dir_name, "dir_name", str)
    if not isinstance(alpha, float):
        raise type_error(alpha, "alpha", float)
    if not (0.0 < alpha < 0.5):
        raise ValueError(f"invalid alpha={alpha}, must be 0<alpha<0.5.")
    if callable(text_format_driver):
        text_format_driver = text_format_driver()
    if not isinstance(text_format_driver, TextFormatDriver):
        raise type_error(text_format_driver, "text_format_driver",
                         TextFormatDriver, True)
    if not callable(algorithm_namer):
        raise type_error(algorithm_namer, "algorithm_namer", call=True)
    if not callable(instance_namer):
        raise type_error(instance_namer, "instance_namer", call=True)
    if not isinstance(use_lang, bool):
        raise type_error(use_lang, "use_lang", bool)

    if not isinstance(p_renderer, NumberRenderer):
        raise type_error(p_renderer, "p_renderer", NumberRenderer)
    if not callable(value_getter):
        raise type_error(value_getter, "value_getter", call=True)

    # now gather the data: algorithm -> instance -> values (bestF by default)
    data_dict: dict[str, dict[str, list[int | float]]] = {}
    inst_set: set[str] = set()
    for i, end_result in enumerate(end_results):
        if not isinstance(end_result, EndResult):
            raise type_error(end_result, f"end_results[{i}]", EndResult)
        value = value_getter(end_result)
        if not isinstance(value, int | float):
            raise type_error(
                value, f"value_getter({end_result})", (int, float))
        if not isfinite(value):
            raise ValueError(
                f"value_getter({end_result})={value}, but must be finite.")
        inst_set.add(end_result.instance)
        if end_result.algorithm in data_dict:
            inst_res = data_dict[end_result.algorithm]
        else:
            data_dict[end_result.algorithm] = inst_res = {}
        if end_result.instance in inst_res:
            inst_res[end_result.instance].append(value)
        else:
            inst_res[end_result.instance] = [value]

    # finished collecting data
    n_algos: Final[int] = len(data_dict)
    if n_algos <= 0:
        raise ValueError("end results are empty!")
    if not (1 < n_algos < 5):
        raise ValueError(f"invalid number {n_algos} of "
                         "algorithms, must be 1<n_algos<5.")
    instances: Final[list] = sorted(inst_set, key=instance_sort_key)
    n_insts: Final[int] = len(instances)
    algorithms: Final[list] = sorted(data_dict.keys(), key=algorithm_sort_key)

    # validate data
    for algo in algorithms:
        if len(data_dict[algo]) != n_insts:
            raise ValueError(
                f"algorithm {algo!r} has only data for {len(data_dict[algo])}"
                f" instances, but should have for {n_insts}.")

    # compute the Bonferroni correction
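    # `n_cols` is the number of distinct algorithm pairs. One two-tailed test
    # is performed per pair and per instance, so `n_pairs = n_insts * n_cols`
    # tests are carried out in total and `alpha` is divided by this number.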

    n_cols: Final[int] = ((n_algos - 1) * n_algos) // 2
    n_pairs: Final[int] = n_insts * n_cols
    alpha_prime: Final[float] = alpha / n_pairs
    if not (0.0 < alpha_prime < 0.5):
        raise ValueError(f"invalid alpha'={alpha_prime}, must be 0<alpha'"
                         f"<0.5, stems from alpha={alpha} and N={n_pairs}")

    # compute the comparison results
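    # Each inner list belongs to one algorithm pair (i, j) with i < j, in the
    # same order in which the table columns are built below; a value of 0
    # means that no consistent direction was found (see `__compare`).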

    comparisons: Final[list[list[int]]] = \
        [[__compare(data_dict[algorithms[i]][inst],
                    data_dict[algorithms[j]][inst])
          for inst in instances]
         for i in range(n_algos - 1)
         for j in range(i + 1, n_algos)]

    # compute the p_values
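    # One list per algorithm pair (same ordering as `comparisons`). Cells for
    # which the pre-check above yielded 0 are not tested at all and store
    # `None`, since no `p`-value will be printed for them.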

    p_values: Final[list[list[float]]] = \
        [[float(mannwhitneyu(data_dict[algorithms[ij[0]]][inst],
                             data_dict[algorithms[ij[1]]][inst]).pvalue)
          if comparisons[z][k] != 0 else None
          for k, inst in enumerate(instances)]
         for z, ij in enumerate([(ii, jj) for ii in range(n_algos - 1)
                                 for jj in range(ii + 1, n_algos)])]

    # now format all p values to strings
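    # `alpha` and `alpha'` are appended so that they get rendered in the same
    # style as the `p`-values; for each of them, the shorter of this rendering
    # and a plain number rendering is kept, and both are then removed from the
    # string list again before it is split into per-column chunks.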

    p_flat: Final[list[float]] = [y for x in p_values for y in x]
    p_flat.append(alpha)
    p_flat.append(alpha_prime)
    p_flat_strs: Final[list[FormattedStr | None]] = \
        p_renderer.render(p_flat)
    alpha_str: FormattedStr = cast("FormattedStr", p_flat_strs[-2])
    a2 = FormattedStr.number(alpha)
    if len(a2) <= len(alpha_str):
        alpha_str = a2
    alpha_prime_str: FormattedStr = cast("FormattedStr", p_flat_strs[-1])
    a2 = FormattedStr.number(alpha_prime)
    if len(a2) <= len(alpha_prime_str):
        alpha_prime_str = a2
    del p_flat_strs[-1]
    del p_flat_strs[-1]
    p_strs: Final[list[list[FormattedStr | None]]] = \
        [p_flat_strs[(i * n_insts):((i + 1) * n_insts)] for i in range(n_cols)]

    summary_row: list[Iterable[str] | str] | None = None
    if n_insts > 0:
        summary_row = [__WEL]
        for i, col in enumerate(comparisons):
            wins: int = 0
            equals: int = 0
            losses: int = 0
            pv: list[float] = p_values[i]
            for j, cv in enumerate(col):
                if (cv == 0) or (pv[j] >= alpha_prime):
                    equals += 1
                elif cv < 0:
                    wins += 1
                else:
                    losses += 1
            summary_row.append(f"{wins}/{equals}/{losses}")

    #: the columns
    cols: Final[list[list[str | list[str]]]] = \
        [[__compare_to_str(comparisons[i][j], p_strs[i][j], p_values[i][j],
                           alpha_prime)
          for j in range(n_insts)]
         for i in range(n_cols)]
    cols.insert(0, [FormattedStr.add_format(instance_namer(inst), code=True)
                    for inst in instances])

    # We now have the p_values, the comparison results, and the p_strs.

    # We can now render the header lines. If there is only one pair of
    # algorithms, or if the output format is Markdown (which supports only a
    # single header row), we render one header line with all the information.
    # Otherwise, we render two header lines.
    head_lines: list[list[str | list[str]]]
    multi_header: Final[bool] = \
        (n_algos > 2) and (not isinstance(text_format_driver, Markdown))
    if multi_header:
        head_lines = [[], []]

        def __app(a, b) -> None:
            head_lines[0].append(a)
            head_lines[1].append(b)
    else:
        head_lines = [[]]

        def __app(a, b) -> None:
            x = []
            if isinstance(a, str):
                x.append(a)
            else:
                x.extend(a)
            x.append(" ")
            if isinstance(b, str):
                x.append(b)
            else:
                x.extend(b)
            head_lines[0].append(x)

    __app("Mann-Whitney U", [__ALPHA, "=", alpha_str, ", ", __ALPHA,
                             "'=", alpha_prime_str])
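    # One header cell is added per algorithm pair; "vs." is attached to the
    # shorter of the two algorithm names so that the two header rows stay
    # reasonably balanced in width when a two-row header is used.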

    for i in range(n_algos - 1):
        for j in range(i + 1, n_algos):
            an1 = FormattedStr.add_format(algorithm_namer(algorithms[i]),
                                          code=True)
            an2 = FormattedStr.add_format(algorithm_namer(algorithms[j]),
                                          code=True)
            if len(an1) >= len(an2):
                __app(an1, ["vs. ", an2])
            else:
                __app([an1, " vs."], an2)

    # write the table
    dest: Final[Path] = text_format_driver.filename(
        file_name, dir_name, use_lang)
    with dest.open_for_write() as wd, Table(
            wd, ("l" if multi_header else "r") + ("c" * n_cols),
            text_format_driver) as table:
        with table.header() as header:
            for hl in head_lines:
                with header.row() as row:
                    for cell in hl:
                        row.cell(cell)
        with table.section() as section:
            section.cols(cols)
        if summary_row is not None:
            with table.section() as section, section.row() as row:
                for s in summary_row:
                    row.cell(s)

    return dest