1""" 

2A tool for selecting a consistent subset of data from partial experiments. 

3 

4When we have partial experimental data, maybe collected from experiments that 

5are still ongoing, we want to still evaluate them in some consistent way. The 

6right method for doing this could be to select a subset of that data that is 

7consistent, i.e., a subset where the algorithms have the same number of runs 

8on the instances using the same seeds. The function :func:`select_consistent` 

9offered by this module provides the functionality to make such a selection. 

10It may be a bit slow, but hopefully it will pick the largest possible 

11consistent sub-selection or, at least, get close to it. 

12 

13The current method to select the data is rather heuristic. It always begins 

14with the full set of data and aims to delete the element that will cause the 

15least other deletions down the road, until we arrive in a consistent state. 

16I strongly suspect that doing this perfectly would be NP-hard, so we cannot 

17implement this. Instead, we use different heuristics and then pick the best 

18result. 
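
For example, if algorithm `a1` has runs for seeds 1, 2, and 3 on an
instance, while algorithm `a2` only has runs for seeds 2 and 3 on the
same instance, then the largest consistent subset keeps the seed-2 and
seed-3 runs of both algorithms. The selection is heuristic, but for this
input, the largest consistent subset is unique:

>>> from moptipy.evaluation.base import PerRunData
>>> runs = [PerRunData("a1", "i1", "o", "e", s) for s in (1, 2, 3)] + [
...     PerRunData("a2", "i1", "o", "e", s) for s in (2, 3)]
>>> [(r.algorithm, r.rand_seed) for r in select_consistent(runs, log=False)]
[('a1', 2), ('a1', 3), ('a2', 2), ('a2', 3)]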

19""" 

20 

from collections import Counter
from operator import itemgetter
from typing import Any, Callable, Final, Iterable, TypeVar

from pycommons.io.console import logger
from pycommons.types import type_error

from moptipy.evaluation.base import PerRunData

#: the type variable for the selector routine
T = TypeVar("T", bound=PerRunData)

#: the algorithm key
KEY_ALGORITHM: Final[int] = 0
#: the instance key
KEY_INSTANCE: Final[int] = KEY_ALGORITHM + 1
#: the encoding key
KEY_ENCODING: Final[int] = KEY_INSTANCE + 1
#: the objective key
KEY_OBJECTIVE: Final[int] = KEY_ENCODING + 1
#: the seed key
KEY_SEED: Final[int] = KEY_OBJECTIVE + 1
#: the number of keys
TOTAL_KEYS: Final[int] = KEY_SEED + 1


def __data_tup(d: PerRunData) -> tuple[str, str, str, str, tuple[str, int]]:
    """
    Get a raw data tuple for the given record.

    :param d: the data record
    :returns: the data tuple
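
    For example (this sketch uses the same `PerRunData` argument order as
    the doctests of :func:`select_consistent`):

    >>> __data_tup(PerRunData("a", "i", "o", "e", 42))
    ('a', 'i', 'e', 'o', ('i', 42))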

    """
    if not isinstance(d, PerRunData):
        raise type_error(d, "dataElement", PerRunData)
    return d.algorithm, d.instance, d.encoding, d.objective, (
        d.instance, d.rand_seed)


def __score_inc(scores: tuple[Counter, ...], key: tuple[Any, ...]) -> None:
    """
    Increment the score of a given tuple.

    :param scores: the scores
    :param key: the key
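
    For example, incrementing two keys that share their first component:

    >>> cnt = (Counter(), Counter())
    >>> __score_inc(cnt, ("a", "i"))
    >>> __score_inc(cnt, ("a", "j"))
    >>> cnt[0]["a"], cnt[1]["i"], cnt[1]["j"]
    (2, 1, 1)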

    """
    for i, k in enumerate(key):
        scores[i][k] += 1


def __score_dec(scores: tuple[Counter, ...], key: tuple[Any, ...]) -> None:
    """
    Decrement the score of a given tuple.

    :param scores: the scores
    :param key: the key
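
    For example, a count that drops to zero is removed from its counter:

    >>> cnt = (Counter(a=2),)
    >>> __score_dec(cnt, ("a",))
    >>> cnt[0]["a"]
    1
    >>> __score_dec(cnt, ("a",))
    >>> "a" in cnt[0]
    False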

    """
    for i, k in enumerate(key):
        scores[i][k] -= 1
        z: int = scores[i][k]
        if z <= 0:
            del scores[i][k]
            if z < 0:
                raise ValueError("Got negative score?")


def __ret(source: list, expected: int, log: bool) -> list:
    """
    Prepare a source list for return.

    :param source: the list
    :param expected: the number of expected records
    :param log: shall we log information
    :returns: the result
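
    For example, the second element of each record is extracted in place
    (illustrated here with placeholder strings instead of data records):

    >>> __ret([[("k1",), "rec-1", 0], [("k2",), "rec-2", 0]], 2, False)
    ['rec-1', 'rec-2']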

    """
    for i, er in enumerate(source):
        source[i] = er[1]
    if list.__len__(source) != expected:
        raise ValueError("Inconsistent list length!")
    if log:
        logger(f"Now returning {expected} records of data.")
    return source  # type: ignore


def __scorer_tired(d: tuple, scores: tuple[Counter, ...], _) -> list[int]:
    """
    Score based on the tired score.

    :param d: the tuple to score
    :param scores: the scores
    :returns: the score
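
    For example, the per-dimension counts are returned in ascending order
    (the third argument is unused here):

    >>> __scorer_tired(("a", "i"), (Counter(a=3), Counter(i=1)), None)
    [1, 3]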

    """
    return sorted(scores[i][t] for i, t in enumerate(d))


def __scorer_normalized_tired(d: tuple, scores: tuple[Counter, ...],
                              total: tuple[int, ...]) -> list[float]:
    """
    Score based on the normalized tired score.

    :param d: the tuple to score
    :param scores: the scores
    :param total: the total of the scores
    :returns: the score
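
    For example, with counts 1 and 2 and a total of 4 in both dimensions:

    >>> __scorer_normalized_tired(
    ...     ("a", "i"), (Counter(a=1), Counter(i=2)), (4, 4))
    [0.25, 0.5]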

    """
    return sorted(
        scores[i][t] / total[i] for i, t in enumerate(d) if total[i] > 0)


def __scorer_sum(d: tuple, scores: tuple[Counter, ...], _) -> int:
    """
    Score based on the sum score.

    :param d: the tuple to score
    :param scores: the scores
    :returns: the score
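
    For example (the third argument is unused here):

    >>> __scorer_sum(("a", "i"), (Counter(a=3), Counter(i=1)), None)
    4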

    """
    return sum(scores[i][t] for i, t in enumerate(d))


def __scorer_normalized_sum(d: tuple, scores: tuple[Counter, ...],
                            total: tuple[int, ...]) -> float:
    """
    Score based on the normalized sum score.

    :param d: the tuple to score
    :param scores: the scores
    :param total: the total of the scores
    :returns: the score
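
    For example, with counts 1 and 2 and a total of 4 in both dimensions:

    >>> __scorer_normalized_sum(
    ...     ("a", "i"), (Counter(a=1), Counter(i=2)), (4, 4))
    0.75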

    """
    return sum(
        scores[i][t] / total[i] for i, t in enumerate(d) if total[i] > 0)


def __scorer_product(d: tuple, scores: tuple[Counter, ...], _) -> int:
    """
    Score based on the product score.

    :param d: the tuple to score
    :param scores: the scores
    :returns: the score
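
    For example (the third argument is unused here):

    >>> __scorer_product(("a", "i"), (Counter(a=3), Counter(i=2)), None)
    6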

    """
    result: int = 1
    for i, t in enumerate(d):
        result *= scores[i][t]
    return result


def __scorer_normalized_min(d: tuple, scores: tuple[Counter, ...],
                            total: tuple[int, ...]) -> float:
    """
    Score based on the normalized minimum score.

    :param d: the tuple to score
    :param scores: the scores
    :param total: the total of the scores
    :returns: the score
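
    For example, with counts 1 and 2 and a total of 4 in both dimensions:

    >>> __scorer_normalized_min(
    ...     ("a", "i"), (Counter(a=1), Counter(i=2)), (4, 4))
    0.25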

    """
    return min(
        scores[i][t] / total[i] for i, t in enumerate(d) if total[i] > 0)


def __not(_: tuple[Counter, ...]) -> int:
    """
    Do nothing, return a placeholder.

    :param _: the scores, which are ignored
    :returns: -1, always
    """
    return -1


def __total(scores: tuple[Counter, ...]) -> tuple[int, ...] | None:
    """
    Get the total scores.

    :param scores: the scores
    :returns: the total scores, or `None` if no total is positive
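
    Counters with fewer than two distinct keys contribute `-1`; if no
    counter yields a positive total, the result is `None`. For example:

    >>> __total((Counter(a=1, b=2), Counter(x=5)))
    (3, -1)
    >>> print(__total((Counter(x=5),)))
    None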

    """
    result: tuple[int, ...] = tuple(
        t.total() if len(t) > 1 else -1 for t in scores)
    return None if (tuple.__len__(result) < 1) or (
        max(result) <= 0) else result


#: the scorers: each entry is a tuple of (name, scoring function,
#: total/normalization function)
__SCORERS: Final[tuple[tuple[str, Callable[[
    tuple, tuple[Counter, ...], Any], Any], Callable[[
    tuple[Counter, ...]], Any]], ...]] = (
    ("tired", __scorer_tired, __not),
    ("normalized_tired", __scorer_normalized_tired, __total),
    ("sum", __scorer_sum, __not),
    ("normalized_sum", __scorer_normalized_sum, __total),
    ("product", __scorer_product, __total),
    ("normalized_min", __scorer_normalized_min, __total),
)


def select_consistent(data: Iterable[T], log: bool = True,
                      thorough: bool = True) -> list[T]:
    """
    Select data such that the numbers of runs are consistent.

    The input is a set of data items which represent some records over the
    runs of algorithms on instances. It may be that not all algorithms
    have been applied to all instances. Maybe the number of runs is
    inconsistent over the algorithm-instance combinations, too. Maybe some
    algorithms have more runs on some instances. Maybe even the runs
    themselves differ: some algorithms may have runs for seeds `A`, `B`,
    and `C` on instance `I`, while others have runs for seeds `C` and `D`.
    This function is designed to retain only the runs with seed `C` in
    such a case. It may discard algorithms or instances or
    algorithm-instance-seed combinations in order to obtain a selection
    of data where all algorithms have been applied to all instances
    equally often and using the same seeds.

    Now there are different ways to select such consistent subsets of a
    dataset. Of course, we want to select the data such that as much as
    possible of the data is retained and as little as possible is
    discarded. This may be a hard optimization problem in itself. Here,
    we offer a heuristic solution. Basically, we step-by-step cut away
    the setups that are covered by the fewest runs. We keep repeating
    this until we arrive at a situation where all setups have the same
    number of runs. We then check whether some strange symmetries remain
    that still make the data inconsistent and, if we find some, try to
    delete one run to break the symmetry and then repeat the cleaning-up
    process. In the end, we get a list of overall consistent data
    elements that can be used during a normal experiment evaluation
    procedure.

    This iterative process may be rather slow on larger datasets, but it
    is maybe the best approximation we can offer to retain as much data
    as possible.

    :param data: the source data
    :param log: shall we log the progress
    :param thorough: use the slower method which may give us more data
    :returns: a list with the selected data

    >>> def __p(x) -> str:
    ...     return (f"{x.algorithm}/{x.instance}/{x.objective}/{x.encoding}/"
    ...             f"{x.rand_seed}")

    >>> a1i1o1e1s1 = PerRunData("a1", "i1", "o1", "e1", 1)
    >>> a1i1o1e1s2 = PerRunData("a1", "i1", "o1", "e1", 2)
    >>> a1i1o1e1s3 = PerRunData("a1", "i1", "o1", "e1", 3)
    >>> a1i2o1e1s1 = PerRunData("a1", "i2", "o1", "e1", 1)
    >>> a1i2o1e1s2 = PerRunData("a1", "i2", "o1", "e1", 2)
    >>> a1i2o1e1s3 = PerRunData("a1", "i2", "o1", "e1", 3)
    >>> a2i1o1e1s1 = PerRunData("a2", "i1", "o1", "e1", 1)
    >>> a2i1o1e1s2 = PerRunData("a2", "i1", "o1", "e1", 2)
    >>> a2i1o1e1s3 = PerRunData("a2", "i1", "o1", "e1", 3)
    >>> a2i2o1e1s1 = PerRunData("a1", "i2", "o1", "e1", 1)
    >>> a2i2o1e1s2 = PerRunData("a2", "i2", "o1", "e1", 2)
    >>> a2i2o1e1s3 = PerRunData("a2", "i2", "o1", "e1", 3)

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a1i1o1e1s2, a1i1o1e1s3,
    ...     a1i2o1e1s1, a1i2o1e1s2, a1i2o1e1s3,
    ...     a2i1o1e1s1, a2i1o1e1s2,
    ...     a2i2o1e1s2, a2i2o1e1s3))))
    ['a1/i1/o1/e1/1', 'a1/i1/o1/e1/2', 'a1/i2/o1/e1/2', 'a1/i2/o1/e1/3',\
 'a2/i1/o1/e1/1', 'a2/i1/o1/e1/2', 'a2/i2/o1/e1/2', 'a2/i2/o1/e1/3']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s2, a1i1o1e1s3,
    ...     a1i2o1e1s1, a1i2o1e1s2, a1i2o1e1s3,
    ...     a2i1o1e1s1, a2i1o1e1s2,
    ...     a2i2o1e1s2, a2i2o1e1s3))))
    ['a1/i2/o1/e1/2', 'a1/i2/o1/e1/3', 'a2/i2/o1/e1/2', 'a2/i2/o1/e1/3']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s2, a1i1o1e1s3,
    ...     a1i2o1e1s1, a1i2o1e1s2, a1i2o1e1s3,
    ...     a2i1o1e1s1, a2i1o1e1s2,
    ...     a2i2o1e1s2))))
    ['a1/i1/o1/e1/2', 'a1/i2/o1/e1/2', 'a2/i1/o1/e1/2', 'a2/i2/o1/e1/2']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a1i1o1e1s2, a1i1o1e1s3,
    ...     a2i2o1e1s1, a2i2o1e1s2, a2i2o1e1s3))))
    ['a1/i1/o1/e1/1', 'a1/i1/o1/e1/2', 'a1/i1/o1/e1/3']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a1i1o1e1s2, a1i1o1e1s3,
    ...     a2i1o1e1s1, a2i1o1e1s2, a2i1o1e1s3))))
    ['a1/i1/o1/e1/1', 'a1/i1/o1/e1/2', 'a1/i1/o1/e1/3', \
'a2/i1/o1/e1/1', 'a2/i1/o1/e1/2', 'a2/i1/o1/e1/3']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a1i1o1e1s2, a1i2o1e1s2, a1i2o1e1s3))))
    ['a1/i1/o1/e1/1', 'a1/i1/o1/e1/2', 'a1/i2/o1/e1/2', 'a1/i2/o1/e1/3']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a1i1o1e1s2, a1i2o1e1s2))))
    ['a1/i1/o1/e1/1', 'a1/i1/o1/e1/2']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a2i1o1e1s2))))
    ['a1/i1/o1/e1/1']

    >>> list(map(__p, select_consistent((
    ...     a1i1o1e1s1, a2i1o1e1s2, a2i1o1e1s3))))
    ['a2/i1/o1/e1/2', 'a2/i1/o1/e1/3']

    >>> try:
    ...     select_consistent((a1i1o1e1s1, a1i1o1e1s2, a1i2o1e1s2, a1i2o1e1s2))
    ... except ValueError as ve:
    ...     print(ve)
    Found 4 records but only 3 different keys!

    >>> try:
    ...     select_consistent(1)
    ... except TypeError as te:
    ...     print(te)
    data should be an instance of typing.Iterable but is int, namely 1.

    >>> try:
    ...     select_consistent((a2i1o1e1s2, a2i1o1e1s3), 3)
    ... except TypeError as te:
    ...     print(te)
    log should be an instance of bool but is int, namely 3.

    >>> try:
    ...     select_consistent({234})
    ... except TypeError as te:
    ...     print(te)
    dataElement should be an instance of moptipy.evaluation.base.PerRunData \
but is int, namely 234.

    >>> try:
    ...     select_consistent((a2i1o1e1s2, a2i1o1e1s3), True, 4)
    ... except TypeError as te:
    ...     print(te)
    thorough should be an instance of bool but is int, namely 4.
    """
    if not isinstance(data, Iterable):
        raise type_error(data, "data", Iterable)
    if not isinstance(log, bool):
        raise type_error(log, "log", bool)
    if not isinstance(thorough, bool):
        raise type_error(thorough, "thorough", bool)

    # We obtain a sorted list of the data in order to make the results
    # consistent regardless of the order in which the data comes in.
    # The data is only sorted by its features and not by any other
    # information attached to it.
    dataset: Final[list[list]] = [[__data_tup(t), t, 0] for t in data]
    dataset.sort(key=itemgetter(0))
    dataset_size: int = list.__len__(dataset)
    if log:
        logger(f"Found {dataset_size} records of data.")

    set_l: Final[int] = set.__len__({x[0] for x in dataset})
    if set_l != dataset_size:
        raise ValueError(
            f"Found {dataset_size} records but only {set_l} different keys!")

    # Compute the scores: count how often each algorithm, instance,
    # encoding, objective, and (instance, seed) combination is
    # encountered.
    key_tuple_len: Final[int] = tuple.__len__(dataset[0][0])
    if key_tuple_len != TOTAL_KEYS:
        raise ValueError("Unexpected error!")

    # the counters for the score calculations
    scores: Final[tuple[Counter, ...]] = tuple(
        Counter() for _ in range(key_tuple_len))

    # the best result found so far
    best_length: int = -1
    best_data: list[list] | None = None

    for scorer_name, scorer, scorer_total in __SCORERS if thorough else (
            __SCORERS[0], ):
        if log:
            logger(f"Now scoring according to the {scorer_name!r} method.")
        # compute the original item scores
        for sc in scores:
            sc.clear()

        source: list[list] = dataset.copy()
        count: int = dataset_size
        for er in source:
            __score_inc(scores, er[0])

        changed: bool = True
        while changed:
            changed = False

            # Find the setups with maximum and minimum scores.
            min_score: Any = None
            max_score: Any = None
            norm: Any = scorer_total(scores)

            if norm is None:  # cannot proceed
                if log:
                    logger(f"Method {scorer_name!r} is not applicable.")
                count = -1  # we can do nothing
                break

            for er in source:
                score = scorer(er[0], scores, norm)
                er[2] = score
                if (min_score is None) or (min_score > score):
                    min_score = score
                if (max_score is None) or (max_score < score):
                    max_score = score
            del norm

            if min_score < max_score:  # Some setups have lower scores.
                del_count: int = 0  # We will delete them.
                for i in range(count - 1, -1, -1):
                    er = source[i]
                    if er[2] <= min_score:
                        del source[i]
                        __score_dec(scores, er[0])
                        del_count += 1
                if del_count <= 0:
                    raise ValueError(
                        f"Did not delete anything under {scorer_name!r}?")

                new_count: int = list.__len__(source)
                if new_count != (count - del_count):
                    raise ValueError("List inconsistent after deletion "
                                     f"under {scorer_name!r}?")
                if log:
                    logger(
                        f"Deleted {del_count} of the {count} records because "
                        f"their score was {min_score} while the maximum score"
                        f" was {max_score}. Retained {new_count} records "
                        f"under {scorer_name!r}.")
                count = new_count
                changed = True
                continue

            if log:
                logger(f"All setups now have the same score {max_score} "
                       f"under {scorer_name!r}.")
            if count <= best_length:
                if log:
                    logger(f"We now only have {count} setups, which means we "
                           "cannot get better than the current best set with "
                           f"{best_length} setups, so we quit after score-"
                           f"based cleaning under {scorer_name!r}.")
                count = -1
                break

            # If we get here, all elements have the same score.
            # This means that we are basically done.
            #
            # However, this may also happen if a very odd division exists
            # in the data. Maybe we have one algorithm that was applied to
            # one instance ten times and another algorithm applied to
            # another instance ten times. This data would still be
            # inconsistent, as it does not allow for any comparison.

            # Compute the different values for everything except the seeds.
            keys: tuple[int, ...] = tuple(
                v for v, s in enumerate(scores)
                if (v != KEY_SEED) and (len(s) > 1))

            if tuple.__len__(keys) > 1:
                if log:
                    logger("Now checking for inconsistencies in "
                           "algorithm/instance/objective/encoding under"
                           f" {scorer_name!r}.")
                # Only if there are different values in at least one tuple
                # dimension, we need to check for strange situations.

                # For each of (instance, algorithm, encoding, objective),
                # we must have the same number of other records.
                # If not, then we have some strange symmetric situation
                # that needs to be solved.
                per_value: dict[Any, set[Any]] = {}
                last: set[Any] | None = None
                for key in keys:
                    other_keys: tuple[int, ...] = tuple(
                        kk for kk in keys if kk != key)
                    for er in source:
                        kv = er[0][key]
                        other = tuple(er[0][ok] for ok in other_keys)
                        if kv in per_value:
                            per_value[kv].add(other)
                        else:
                            per_value[kv] = last = {other}
                    for v in per_value.values():
                        if v != last:
                            changed = True
                            break
                    per_value.clear()

                    # We just need to delete one element (we simply take
                    # the last one). This will break the symmetry.
                    if changed:
                        if log:
                            logger(
                                f"Deleting one of the {count} elements to "
                                "break the erroneous symmetry under "
                                f"{scorer_name!r}.")
                        er = source[-1]
                        del source[-1]
                        __score_dec(scores, er[0])
                        count -= 1
                        break
                del per_value
                del keys
                if changed:
                    continue

                if log:
                    logger("No inconsistencies in algorithm/instance/"
                           f"objective/encoding found under {scorer_name!r}.")
            elif log:
                logger("No inconsistencies in algorithm/instance/objective/"
                       f"encoding possible under {scorer_name!r}.")
            if count <= best_length:
                if log:
                    logger(f"We now only have {count} setups, which means we "
                           "cannot get better than the current best set with "
                           f"{best_length} setups, so we quit after algorithm"
                           "/instance/objective/encoding cleaning under "
                           f"{scorer_name!r}.")
                count = -1
                break

            # If we get here, the only problem left could be if algorithms
            # have different seeds for the same instances. We thus need to
            # check that, for each instance, all the seeds are the same.
            # Notice that such inconsistencies can only occur if different
            # seeds occur exactly equally often.
            if log:
                logger("Now checking for inconsistencies in instance "
                       f"seeds under {scorer_name!r}.")
            seeds: dict[str, dict[tuple[str, str, str], set[int]]] = {}
            for er in source:
                inst: str = er[1].instance
                cur: dict[tuple[str, str, str], set[int]]
                if inst in seeds:
                    cur = seeds[inst]
                else:
                    seeds[inst] = cur = {}
                kx: tuple[str, str, str] = (
                    er[1].algorithm, er[1].objective, er[1].encoding)
                if kx in cur:
                    cur[kx].add(er[1].rand_seed)
                else:
                    cur[kx] = {er[1].rand_seed}

            max_seed_insts: set[str] = set()
            max_seeds: int = -1
            min_seed_inst: str | None = None
            min_seeds: int = -1
            must_delete_from_insts: set[str] = set()
            for instance, inst_setups in seeds.items():
                kvx: set[int] | None = None
                for setup_seeds in inst_setups.values():
                    if kvx is None:
                        kvx = setup_seeds
                        seed_cnt: int = set.__len__(setup_seeds)
                        if seed_cnt >= max_seeds:
                            if seed_cnt > max_seeds:
                                max_seed_insts.clear()
                                max_seeds = seed_cnt
                            max_seed_insts.add(instance)
                        if (seed_cnt < min_seeds) or (min_seed_inst is None):
                            min_seeds = seed_cnt
                            min_seed_inst = instance
                    elif kvx != setup_seeds:
                        # We got a symmetric inconsistency.
                        must_delete_from_insts.add(instance)
                        break
            if min_seeds < max_seeds:
                must_delete_from_insts.update(max_seed_insts)
            del max_seed_insts

            del_count = set.__len__(must_delete_from_insts)
            if del_count > 0:
                if log:
                    logger("Must delete one record from all of "
                           f"{must_delete_from_insts!r}.")
                for i in range(count - 1, -1, -1):
                    er = source[i]
                    instance = er[1].instance
                    if instance in must_delete_from_insts:
                        must_delete_from_insts.remove(instance)
                        del source[i]
                        __score_dec(scores, er[0])
                        changed = True
                        if set.__len__(must_delete_from_insts) <= 0:
                            break
                new_count = list.__len__(source)
                if new_count != (count - del_count):
                    raise ValueError("Error when deleting instances "
                                     f"under {scorer_name!r}.")
                count = new_count
                if not changed:
                    raise ValueError(
                        f"Seeds inconsistent under {scorer_name!r}.")
                del must_delete_from_insts

            del seeds
            if changed:
                if count <= best_length:
                    if log:
                        logger(f"We now only have {count} setups, which "
                               "means we cannot get better than the current "
                               f"best set with {best_length} setups, so we "
                               "quit after seed-based cleaning under "
                               f"{scorer_name!r}.")
                    count = -1
                    break
            elif log:
                logger(f"No seed inconsistencies under {scorer_name!r}.")
            # There should not be any problems left, but we need to check
            # again whether something has changed.

        if count <= 0:
            continue  # We can do nothing here.

        if count > best_length:
            if log:
                logger(f"Method {scorer_name!r} yields {count} records, "
                       "which is the new best.")
            best_length = count
            best_data = source
            if best_length >= dataset_size:
                if log:
                    logger(f"Included all data under {scorer_name!r}, "
                           "so we can stop.")
                break
        elif log:
            logger(f"Method {scorer_name!r} yields {count} records, "
                   "which is not better than the current "
                   f"best {best_length}.")

    # We are finally finished. The scores are no longer needed.
    del scores
    if best_length <= 0:
        raise ValueError("No data found at all?")
    return __ret(best_data, best_length, log)