Coverage for moptipy/evaluation/tabulate_result_tests.py: 86% (176 statements)
coverage.py v7.12.0, created at 2025-11-24 08:49 +0000

1r"""
2Provides :func:`tabulate_result_tests` creating statistical comparison tables.
4The function :func:`tabulate_result_tests` can compare two or more algorithms
5on multiple problem instances by using the Mann-Whitney U test [1-3] with the
6Bonferroni correction [4].
81. Daniel F. Bauer. Constructing Confidence Sets Using Rank Statistics.
9 In *Journal of the American Statistical Association.* 67(339):687-690.
10 September 1972. doi: https://doi.org/10.1080/01621459.1972.10481279.
112. Sidney Siegel and N. John Castellan Jr. *Nonparametric Statistics for The
12 Behavioral Sciences.* 1988 In the Humanities/Social Sciences/Languages
13 series. New York, NY, USA: McGraw-Hill. ISBN: 0-07-057357-3.
143. Myles Hollander and Douglas Alan Wolfe. *Nonparametric Statistical
15 Methods.* 1973. New York, NY, USA: John Wiley and Sons Ltd.
16 ISBN: 047140635X.
174. Olive Jean Dunn. Multiple Comparisons Among Means. In *Journal of the
18 American Statistical Association.* 56(293):52-64. March 1961.
19 doi: https://doi.org/10.1080/01621459.1961.10482090.
20"""

from math import isfinite
from statistics import mean, median
from typing import Any, Callable, Final, Iterable, cast

from pycommons.io.path import Path
from pycommons.types import type_error
from scipy.stats import mannwhitneyu  # type: ignore

from moptipy.api.logging import KEY_BEST_F
from moptipy.evaluation.end_results import EndResult
from moptipy.evaluation.end_results import getter as end_result_getter
from moptipy.utils.formatted_string import FormattedStr
from moptipy.utils.markdown import Markdown
from moptipy.utils.number_renderer import (
    DEFAULT_NUMBER_RENDERER,
    NumberRenderer,
)
from moptipy.utils.table import Table
from moptipy.utils.text_format import TextFormatDriver

#: the string constant for alpha
__ALPHA: Final[FormattedStr] = FormattedStr.special("\u03b1")
#: the string constant for the dash (printed when no test is performed)
__DASH: Final[FormattedStr] = FormattedStr.special("\u2014")
#: the character for less
__LESS: Final[FormattedStr] = FormattedStr("<", code=True)
#: the character for an unclear/insignificant difference
__UNCLEAR: Final[FormattedStr] = FormattedStr("?", code=True)
#: the character for greater
__GREATER: Final[FormattedStr] = FormattedStr(">", code=True)

#: the name of the win-equal-loss column
__WEL: Final[tuple[FormattedStr, str, FormattedStr, str, FormattedStr]] \
    = __LESS, " / ", __UNCLEAR, " / ", __GREATER


def __compare(a: list[int | float],
              b: list[int | float]) -> int:
    """
    Compare two lists of numbers.

    `-1` is returned if `a` has both a smaller mean and median value than `b`.
    `1` is returned if `a` has both a larger mean and median value than `b`.
    `0` is returned otherwise.

    :param a: the first list of numbers
    :param b: the second list of numbers
    :returns: `-1` if the numbers in `a` tend to be smaller than those in `b`,
        `1` if the numbers in `a` tend to be larger than those in `b`,
        `0` if undecided
    """
    smaller: bool
    ma = mean(a)
    mb = mean(b)
    if ma < mb:
        smaller = True
    elif ma > mb:
        smaller = False
    else:
        return 0
    ma = median(a)
    mb = median(b)
    if ma < mb:
        return -1 if smaller else 0
    if ma > mb:
        return 0 if smaller else 1
    return 0
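
# Illustrative examples of the decision rule above:
# __compare([1, 2, 3], [4, 5, 6]) == -1, since [1, 2, 3] has both the smaller
# mean (2 vs. 5) and the smaller median (2 vs. 5), whereas
# __compare([1, 2, 9], [2, 3, 4]) == 0, since [1, 2, 9] has the larger mean
# (4 vs. 3) but the smaller median (2 vs. 3), so no tendency can be claimed.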


def __compare_to_str(cmp: int, p_str: FormattedStr | None,
                     p: float, alpha_prime: float) -> str | list[str]:
    """
    Transform a comparison result and a rendered `p` value to an output.

    :param cmp: the comparison result
    :param p_str: the rendered `p` value string
    :param p: the actual `p` value
    :param alpha_prime: the Bonferroni-corrected significance level `alpha'`
    :returns: the output
    """
    if cmp == 0:
        return __DASH
    if p >= alpha_prime:
        return [p_str, " ", __UNCLEAR]
    if cmp < 0:
        return [p_str, " ", __LESS]
    if cmp > 0:
        return [p_str, " ", __GREATER]
    raise ValueError(
        f"huh? p={p}, p_str={p_str}, cmp={cmp}, alpha'={alpha_prime}")
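
# For example: with cmp=-1, a rendered p string "3.1e-5", p=3.1e-5, and
# alpha_prime=6.7e-4, the function returns the cell content
# ["3.1e-5", " ", "<"], i.e., the rendered p value followed by the "<" mark;
# with p >= alpha_prime it would instead end with the "?" mark.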


def tabulate_result_tests(
        end_results: Iterable[EndResult],
        file_name: str = "tests",
        dir_name: str = ".",
        alpha: float = 0.02,
        text_format_driver: TextFormatDriver | Callable[[], TextFormatDriver]
        = Markdown.instance,
        algorithm_sort_key: Callable[[str], Any] = lambda a: a,
        instance_sort_key: Callable[[str], Any] = lambda i: i,
        instance_namer: Callable[[str], str] = lambda x: x,
        algorithm_namer: Callable[[str], str] = lambda x: x,
        use_lang: bool = False,
        p_renderer: NumberRenderer = DEFAULT_NUMBER_RENDERER,
        value_getter: Callable[[EndResult], int | float]
        = end_result_getter(KEY_BEST_F)) -> Path:
    r"""
    Tabulate the results of statistical comparisons of end result qualities.

    `end_results` contains a sequence of
    :class:`~moptipy.evaluation.end_results.EndResult` records, each of which
    represents the result of one run of one algorithm on one instance. This
    function performs a two-tailed Mann-Whitney U test for each algorithm
    pair on each problem instance to see whether their performances differ
    statistically significantly. The results of these tests are tabulated
    together with their `p`-values, i.e., the probabilities that the observed
    differences would occur if the two algorithms performed identically.

    If `p` is sufficiently small, it is unlikely that the observed difference
    in performance between the two compared algorithms stems from randomness.
    But what does "sufficiently small" mean? As a parameter, this function
    accepts a significance threshold `0<alpha<0.5`. `alpha` is the upper
    limit of the "probability to be wrong" that we accept when we make a
    claim such as "algorithm A is better than algorithm B". In other words,
    if the table says that algorithm A is better than algorithm B, the chance
    that this is wrong is at most `alpha`.

    However, if we perform many such tests, our chance of making at least one
    mistake grows. If we perform `n_tests` tests, then the chance that at
    least one of them is "wrong" would be `1-[(1-alpha)^n_tests]`. Since we
    are going to do multiple tests, the Bonferroni correction is therefore
    applied and `alpha'=alpha/n_tests` is computed. Then, the chance of
    having at least one wrong result among all `n_tests` tests is not higher
    than `alpha`.

    The test results are presented as follows:
    The first column of the generated table denotes the problem instances.
    Each of the other columns represents a pair of algorithms. In each cell,
    the pair is compared based on the results on the instance of the row. The
    cell then holds the `p`-value of the two-tailed Mann-Whitney U test. If
    the first algorithm is significantly better (at `p<alpha'`) than the
    second algorithm, then the cell is marked with `<`. If the first
    algorithm is significantly worse (at `p<alpha'`) than the second
    algorithm, then the cell is marked with `>`. If the observed differences
    are not significant (`p>=alpha'`), then the cell is marked with `?`.

    However, there can also be situations where a statistical comparison
    makes no sense because no difference could reliably be detected anyway.
    For example, one algorithm may have a smaller median result but a larger
    mean result, or the medians may be the same, or the means may be the
    same. Regardless of what outcome a test would have, we could not really
    claim that either algorithm was better or worse. In such cases, no test
    is performed and `-` is printed instead (rendered as `—` in the Markdown
    format).

    Finally, the bottom row sums up the numbers of `<`, `?`, and `>` outcomes
    for each algorithm pair.

    Depending on the parameter `text_format_driver`, the tables can be
    rendered in different formats, such as
    :py:class:`~moptipy.utils.markdown.Markdown`,
    :py:class:`~moptipy.utils.latex.LaTeX`, and
    :py:class:`~moptipy.utils.html.HTML`.

    :param end_results: the end results data
    :param file_name: the base file name
    :param dir_name: the base directory
    :param alpha: the threshold at which the two-tailed test result is
        accepted
    :param text_format_driver: the text format driver
    :param algorithm_sort_key: a function returning sort keys for algorithms
    :param instance_sort_key: a function returning sort keys for instances
    :param instance_namer: the name function for instances: receives an
        instance ID and returns an instance name; default=identity function
    :param algorithm_namer: the name function for algorithms: receives an
        algorithm ID and returns an algorithm name; default=identity function
    :param use_lang: should we use the language to define the file name?
    :param p_renderer: the renderer for all probabilities
    :param value_getter: the getter for the values that should be compared.
        By default, the best obtained objective values are compared. However,
        if you let the runs continue until they reach a certain goal quality,
        then you may want to compare the runtimes consumed until that quality
        is reached. Basically, you can use any of the getters provided by
        :func:`moptipy.evaluation.end_results.getter`, but you must take care
        that the comparison makes sense, i.e., compare qualities under
        fixed-budget scenarios (the default behavior) or compare runtimes
        under scenarios with goal qualities, but do not mix up the scenarios.
    :returns: the path to the file with the tabulated test results
    """
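    # A worked reading example (illustrative numbers): if alpha'=0.00067 and
    # the test for some algorithm pair on some instance yields p=0.0002 while
    # the first algorithm has both the smaller mean and median, the cell will
    # show the rendered p value followed by "<"; with p=0.01 >= alpha', it
    # would show the p value followed by "?" instead.
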
    # Before doing anything, let's do some type checking on the parameters.
    # I want to ensure that this function is called correctly before we begin
    # to actually process the data. It is better to fail early than to
    # deliver some incorrect results.
    if not isinstance(end_results, Iterable):
        raise type_error(end_results, "end_results", Iterable)
    if not isinstance(file_name, str):
        raise type_error(file_name, "file_name", str)
    if not isinstance(dir_name, str):
        raise type_error(dir_name, "dir_name", str)
    if not isinstance(alpha, float):
        raise type_error(alpha, "alpha", float)
    if not (0.0 < alpha < 0.5):
        raise ValueError(f"invalid alpha={alpha}, must be 0<alpha<0.5.")
    if callable(text_format_driver):
        text_format_driver = text_format_driver()
    if not isinstance(text_format_driver, TextFormatDriver):
        raise type_error(text_format_driver, "text_format_driver",
                         TextFormatDriver, True)
    if not callable(algorithm_namer):
        raise type_error(algorithm_namer, "algorithm_namer", call=True)
    if not callable(instance_namer):
        raise type_error(instance_namer, "instance_namer", call=True)
    if not isinstance(use_lang, bool):
        raise type_error(use_lang, "use_lang", bool)

    if not isinstance(p_renderer, NumberRenderer):
        raise type_error(p_renderer, "p_renderer", NumberRenderer)
    if not callable(value_getter):
        raise type_error(value_getter, "value_getter", call=True)

    # now gather the data: algorithm -> instance -> list of values
    data_dict: dict[str, dict[str, list[int | float]]] = {}
    inst_set: set[str] = set()
    for i, end_result in enumerate(end_results):
        if not isinstance(end_result, EndResult):
            raise type_error(end_result, f"end_results[{i}]", EndResult)
        value = value_getter(end_result)
        if not isinstance(value, int | float):
            raise type_error(
                value, f"value_getter({end_result})", (int, float))
        if not isfinite(value):
            raise ValueError(
                f"value_getter({end_result})={value} is not finite")
        inst_set.add(end_result.instance)
        if end_result.algorithm in data_dict:
            inst_res = data_dict[end_result.algorithm]
        else:
            data_dict[end_result.algorithm] = inst_res = {}
        if end_result.instance in inst_res:
            inst_res[end_result.instance].append(value)
        else:
            inst_res[end_result.instance] = [value]

    # finished collecting data
    n_algos: Final[int] = len(data_dict)
    if n_algos <= 0:
        raise ValueError("end results are empty!")
    if not (1 < n_algos < 5):
        raise ValueError(f"invalid number {n_algos} of "
                         "algorithms, must be 1<n_algos<5.")
    instances: Final[list] = sorted(inst_set, key=instance_sort_key)
    n_insts: Final[int] = len(instances)
    algorithms: Final[list] = sorted(data_dict.keys(), key=algorithm_sort_key)

    # validate data
    for algo in algorithms:
        if len(data_dict[algo]) != n_insts:
            raise ValueError(
                f"algorithm {algo!r} has only data for {len(data_dict[algo])}"
                f" instances, but should have for {n_insts}.")

    # compute the Bonferroni correction
    n_cols: Final[int] = ((n_algos - 1) * n_algos) // 2
    n_pairs: Final[int] = n_insts * n_cols
    alpha_prime: Final[float] = alpha / n_pairs
    if not (0.0 < alpha_prime < 0.5):
        raise ValueError(f"invalid alpha'={alpha_prime}, must be 0<alpha'"
                         f"<0.5, stems from alpha={alpha} and N={n_pairs}")
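    # For instance (hypothetical numbers): with n_algos=3 and n_insts=10, we
    # get n_cols = ((3 - 1) * 3) // 2 = 3 algorithm pairs, n_pairs = 10 * 3
    # = 30 tests, and, for the default alpha=0.02, alpha_prime = 0.02 / 30
    # ~= 0.000667.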

    # compute the comparison results
    comparisons: Final[list[list[int]]] = \
        [[__compare(data_dict[algorithms[i]][inst],
                    data_dict[algorithms[j]][inst])
          for inst in instances]
         for i in range(n_algos - 1)
         for j in range(i + 1, n_algos)]

    # compute the p values
    p_values: Final[list[list[float]]] = \
        [[float(mannwhitneyu(data_dict[algorithms[ij[0]]][inst],
                             data_dict[algorithms[ij[1]]][inst]).pvalue)
          if comparisons[z][k] != 0 else None
          for k, inst in enumerate(instances)]
         for z, ij in enumerate([(ii, jj) for ii in range(n_algos - 1)
                                 for jj in range(ii + 1, n_algos)])]
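    # Note: scipy.stats.mannwhitneyu performs the two-sided test by default,
    # so `pvalue` is the two-tailed p value described in the docstring; for
    # pairs where __compare detected no clear tendency, no test is run and
    # the entry stays None.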

    # now format all p values to strings
    p_flat: Final[list[float]] = [y for x in p_values for y in x]
    p_flat.append(alpha)
    p_flat.append(alpha_prime)
    p_flat_strs: Final[list[FormattedStr | None]] = \
        p_renderer.render(p_flat)
    alpha_str: FormattedStr = cast("FormattedStr", p_flat_strs[-2])
    a2 = FormattedStr.number(alpha)
    if len(a2) <= len(alpha_str):
        alpha_str = a2
    alpha_prime_str: FormattedStr = cast("FormattedStr", p_flat_strs[-1])
    a2 = FormattedStr.number(alpha_prime)
    if len(a2) <= len(alpha_prime_str):
        alpha_prime_str = a2
    del p_flat_strs[-1]
    del p_flat_strs[-1]
    p_strs: Final[list[list[FormattedStr | None]]] = \
        [p_flat_strs[(i * n_insts):((i + 1) * n_insts)] for i in range(n_cols)]

    summary_row: list[Iterable[str] | str] | None = None
    if n_insts > 0:
        summary_row = [__WEL]
        for i, col in enumerate(comparisons):
            wins: int = 0
            equals: int = 0
            losses: int = 0
            pv: list[float] = p_values[i]
            for j, cv in enumerate(col):
                if (cv == 0) or (pv[j] >= alpha_prime):
                    equals += 1
                elif cv < 0:
                    wins += 1
                else:
                    losses += 1
            summary_row.append(f"{wins}/{equals}/{losses}")

    #: the columns
    cols: Final[list[list[str | list[str]]]] = \
        [[__compare_to_str(comparisons[i][j], p_strs[i][j], p_values[i][j],
                           alpha_prime)
          for j in range(n_insts)]
         for i in range(n_cols)]
    cols.insert(0, [FormattedStr.add_format(instance_namer(inst), code=True)
                    for inst in instances])

    # We now have the p_values, the comparison results, and the p_strs.

    # We can now render the headlines. If there is only one pair of
    # algorithms, we render a single headline with all the information.
    # If there are multiple algorithm pairs, we use two headlines.
    head_lines: list[list[str | list[str]]]
    multi_header: Final[bool] = \
        (n_algos > 2) and (not isinstance(text_format_driver, Markdown))
    if multi_header:
        head_lines = [[], []]

        def __app(a, b) -> None:
            head_lines[0].append(a)
            head_lines[1].append(b)
    else:
        head_lines = [[]]

        def __app(a, b) -> None:
            x = []
            if isinstance(a, str):
                x.append(a)
            else:
                x.extend(a)
            x.append(" ")
            if isinstance(b, str):
                x.append(b)
            else:
                x.extend(b)
            head_lines[0].append(x)

    __app("Mann-Whitney U", [__ALPHA, "=", alpha_str, ", ", __ALPHA,
                             "'=", alpha_prime_str])
    for i in range(n_algos - 1):
        for j in range(i + 1, n_algos):
            an1 = FormattedStr.add_format(algorithm_namer(algorithms[i]),
                                          code=True)
            an2 = FormattedStr.add_format(algorithm_namer(algorithms[j]),
                                          code=True)
            if len(an1) >= len(an2):
                __app(an1, ["vs. ", an2])
            else:
                __app([an1, " vs."], an2)

    # write the table
    dest: Final[Path] = text_format_driver.filename(
        file_name, dir_name, use_lang)
    with dest.open_for_write() as wd, Table(
            wd, ("l" if multi_header else "r") + ("c" * n_cols),
            text_format_driver) as table:
        with table.header() as header:
            for hl in head_lines:
                with header.row() as row:
                    for cell in hl:
                        row.cell(cell)
        with table.section() as section:
            section.cols(cols)
        if summary_row is not None:
            with table.section() as section, section.row() as row:
                for s in summary_row:
                    row.cell(s)

    return dest