Coverage for pycommons / math / stream_statistics.py: 99%

401 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1""" 

2An immutable record for statistics computed over a stream of data. 

3 

4Stream statistics, represented by class 

5:class:`~pycommons.math.stream_statistics.StreamStatistics` are statistics 

6that are computed over a stream of data. During the computation, only a 

7minimal amount of data is actually kept in memory, such as a running sum, 

8the overall minimum and maximum, etc. 

9This makes these statistics suitable 

10 

11- if the amount of data is large and 

12- the required accuracy is not very high and/or 

13- the available computational budget or memory are limited. 

14 

15In this case, the routines of 

16:class:`~pycommons.math.stream_statistics.StreamStatistics` are 

17very suitable. 

18You could, e.g., use the method 

19:meth:`~pycommons.math.stream_statistics.StreamStatistics.aggregate` to 

20obtain an aggregation object. This object allows you to iteratively append 

21data to the current statistics computation via its `add` method and to obtain 

22the current (or final) statistics result via the `result` method. 

23 

24Such a result is an instance of the class 

25:class:`~pycommons.math.stream_statistics.StreamStatistics`. 

26It stores the 

27:attr:`~pycommons.math.stream_statistics.StreamStatistics.minimum` and the 

28:attr:`~pycommons.math.stream_statistics.StreamStatistics.maximum` of the 

29data, as well as the number 

30:attr:`~pycommons.math.stream_statistics.StreamStatistics.n` of observed data 

31samples. 

32It also offers the approximations of the arithmetic mean as attribute 

33:attr:`~pycommons.math.stream_statistics.StreamStatistics.mean_arith` and 

34the approximation of the standard deviation as attribute 

35:attr:`~pycommons.math.stream_statistics.StreamStatistics.stddev`. 

36 

37There is an absolute order defined upon these records. 

38They are hashable and immutable. 

39We provide methods to store them to CSV format via the class 

40:class:`~pycommons.math.stream_statistics.CsvWriter` 

41and to load them from CSV data via the class 

42:class:`~pycommons.math.stream_statistics.CsvReader`. 

43Functions that access attributes can be obtained via 

44:meth:`~pycommons.math.stream_statistics.StreamStatistics.getter`. 

45 

46If you require high-accuracy statistics or values such as the median, you 

47should use 

48:class:`~pycommons.math.sample_statistics.SampleStatistics` instead. 

49 

50>>> ag = StreamStatistics.aggregate() 

51>>> ag.update((1, 2, 3)) 

52>>> ag.add(4) 

53>>> ag.add(5) 

54>>> r1 = ag.result() 

55>>> repr(r1) 

56'StreamStatistics(n=5, minimum=1, mean_arith=3, maximum=5, \ 

57stddev=1.5811388300841898)' 

58>>> str(r1) 

59'5;1;3;5;1.5811388300841898' 

60 

61>>> r2 = StreamStatistics.from_samples((1, 2, 3, 4, 5)) 

62>>> r1 == r2 

63True 

64 

65>>> ag.reset() 

66>>> try: 

67... ag.result() 

68... except ValueError as ve: 

69... print(ve) 

70n=0 is invalid, must be in 1..9223372036854775808. 

71 

72>>> print(ag.result_or_none()) 

73None 

74""" 

75 

76from dataclasses import dataclass 

77from math import inf, isfinite, sqrt 

78from typing import Callable, Final, Iterable, TypeVar, Union, cast 

79 

80from pycommons.io.csv import ( 

81 CSV_SEPARATOR, 

82 SCOPE_SEPARATOR, 

83 csv_column, 

84 csv_column_or_none, 

85 csv_scope, 

86 csv_val_or_none, 

87 pycommons_footer_bottom_comments, 

88) 

89from pycommons.io.csv import CsvReader as CsvReaderBase 

90from pycommons.io.csv import CsvWriter as CsvWriterBase 

91from pycommons.math.int_math import ( 

92 try_float_int_div, 

93 try_int, 

94) 

95from pycommons.math.streams import StreamAggregate 

96from pycommons.strings.string_conv import ( 

97 num_or_none_to_str, 

98 num_to_str, 

99 str_to_num, 

100) 

101from pycommons.types import check_int_range, type_error, type_name 

102 

#: The key identifying the minimum value.
KEY_MINIMUM: Final[str] = "min"
#: The key identifying the median value.
KEY_MEDIAN: Final[str] = "med"
#: The key identifying the arithmetic mean value.
KEY_MEAN_ARITH: Final[str] = "mean"
#: The key identifying the geometric mean value.
KEY_MEAN_GEOM: Final[str] = "geom"
#: The key identifying the maximum value.
KEY_MAXIMUM: Final[str] = "max"
#: The key identifying the standard deviation value.
KEY_STDDEV: Final[str] = "sd"
#: The key identifying the number `n` of samples.
KEY_N: Final[str] = "n"
#: The key identifying a single value.
KEY_VALUE: Final[str] = "value"

#: The type variable for data to be written to CSV or to be read from CSV;
#: it is bound to :class:`StreamStatistics`.
T = TypeVar("T", bound="StreamStatistics")

122 

123 

124class StreamStatisticsAggregate[T](StreamAggregate): 

125 """An Aggregate producing stream statistics.""" 

126 

127 def result(self) -> T: 

128 """ 

129 Get the stream statistics result. 

130 

131 The result is guaranteed to be a valid instance of 

132 :class:`~pycommons.math.stream_statistics.StreamStatistics`. 

133 It has :attr:`~pycommons.math.stream_statistics.StreamStatistics.n` 

134 greater than zero. 

135 

136 If no data was collected, a `ValueError` is raised. 

137 If you want to get `None` if no data was collected, use 

138 :meth:`~StreamStatisticsAggregate.result_or_none` instead. 

139 

140 :return: the result 

141 :raises ValueError: if no data was collected 

142 

143 >>> try: 

144 ... StreamStatisticsAggregate().result() 

145 ... except NotImplementedError: 

146 ... print("Not implemented!") 

147 Not implemented! 

148 """ 

149 raise NotImplementedError 

150 

151 def result_or_none(self) -> T | None: 

152 """ 

153 Get the result if any data was collected, otherwise `None`. 

154 

155 This method returns the same result as 

156 :meth:`~StreamStatisticsAggregate.result`, with the exception of the 

157 case where no data was collected at all. In this case, 

158 :meth:`~StreamStatisticsAggregate.result` will raise a `ValueError`, 

159 whereas this method here just returns `None`. 

160 

161 :return: the result, or `None` if no data was collected. 

162 

163 >>> try: 

164 ... StreamStatisticsAggregate().result_or_none() 

165 ... except NotImplementedError: 

166 ... print("Not implemented!") 

167 Not implemented! 

168 """ 

169 raise NotImplementedError 

170 

171 

172@dataclass(frozen=True, init=False, order=False, eq=False) 

173class StreamStatistics: 

174 """ 

175 An immutable record with stream statistics of one quantity. 

176 

177 Stream statistics are statistics records that can be computed over a 

178 stream of data. They do not require us to have the complete data sample 

179 in memory at any point in time. 

180 

181 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

182 >>> s1.n 

183 2 

184 >>> s1.minimum 

185 1 

186 >>> s1.mean_arith 

187 4 

188 >>> s1.maximum 

189 6 

190 >>> s1.stddev 

191 0.2 

192 >>> hash(s1) 

193 -997568919428664316 

194 

195 >>> s2 = StreamStatistics(1, 0, 0, 0.0, None) 

196 >>> s2.n 

197 1 

198 >>> s2.minimum 

199 0 

200 >>> s2.mean_arith 

201 0 

202 >>> s2.maximum 

203 0 

204 >>> print(s2.stddev) 

205 None 

206 >>> hash(s2) == hash((0, 0, 0, inf, 0, inf, 1, 0)) 

207 True 

208 

209 >>> s3 = StreamStatistics(n=3, minimum=5, maximum=5, 

210 ... mean_arith=5, stddev=0.0) 

211 >>> s3.stddev 

212 0 

213 >>> hash(s3) 

214 -5331876985145994286 

215 

216 >>> sset = {s1, s1, s2, s1, s3, s3, s2, s1} 

217 >>> len(sset) 

218 3 

219 >>> print(list(sss.n for sss in sorted(sset))) 

220 [1, 2, 3] 

221 >>> print(list(sss.minimum for sss in sorted(sset))) 

222 [0, 1, 5] 

223 

224 >>> try: 

225 ... StreamStatistics(n=1, minimum=5, maximum=6, 

226 ... mean_arith=5, stddev=None) 

227 ... except ValueError as ve: 

228 ... print(ve) 

229 maximum (6) must equal minimum (5) if n=1. 

230 

231 >>> try: 

232 ... StreamStatistics(n=1, minimum=5, maximum=5, 

233 ... mean_arith=4, stddev=None) 

234 ... except ValueError as ve: 

235 ... print(ve) 

236 mean_arith (4) must equal minimum (5) if n=1. 

237 

238 >>> try: 

239 ... StreamStatistics(n=2, minimum=5, maximum=6, 

240 ... mean_arith=4, stddev=None) 

241 ... except ValueError as ve: 

242 ... print(ve) 

243 minimum<=mean_arith<=maximum must hold, but got 5, 4, and 6. 

244 

245 >>> try: 

246 ... StreamStatistics(n=3, minimum=5, maximum=7, 

247 ... mean_arith=6, stddev=-1) 

248 ... except ValueError as ve: 

249 ... print(ve) 

250 stddev must be >= 0, but is -1. 

251 

252 >>> try: 

253 ... StreamStatistics(n=3, minimum=5, maximum=7, 

254 ... mean_arith=6, stddev=0) 

255 ... except ValueError as ve: 

256 ... print(str(ve)[:59]) 

257 If stddev (0) is 0, then minimum (5) must equal maximum (7) 

258 

259 >>> try: 

260 ... StreamStatistics(n=3, minimum=5, maximum=5, 

261 ... mean_arith=5, stddev=1) 

262 ... except ValueError as ve: 

263 ... print(str(ve)[:59]) 

264 If stddev (1) is 0, then minimum (5) must equal maximum (5) 

265 

266 >>> try: 

267 ... StreamStatistics(n=3, minimum=5, maximum=5, 

268 ... mean_arith=5, stddev=None) 

269 ... except ValueError as ve: 

270 ... print(ve) 

271 If n=1, stddev=None and vice versa, but got n=3 and stddev=None. 

272 

273 >>> try: 

274 ... StreamStatistics(n=1, minimum=5, maximum=5, 

275 ... mean_arith=5, stddev=1) 

276 ... except ValueError as ve: 

277 ... print(ve) 

278 If n=1, stddev=None and vice versa, but got n=1 and stddev=1. 

279 """ 

280 

281 #: The number of data samples over which the statistics were computed. 

282 n: int 

283 #: The minimum, i.e., the value of the smallest among the 

284 #: :attr:`~StreamStatistics.n` data samples. 

285 minimum: int | float 

286 #: The arithmetic mean value, i.e., the sum of the 

287 #: :attr:`~StreamStatistics.n` data samples divided by 

288 #: :attr:`~StreamStatistics.n`. 

289 mean_arith: int | float 

290 #: The maximum, i.e., the value of the largest among the 

291 #: :attr:`~StreamStatistics.n` data samples. 

292 maximum: int | float 

293 #: The standard deviation, if defined. This value will be `None` if there 

294 #: was only a single sample. 

295 stddev: int | float | None 

296 

    def __init__(self, n: int, minimum: int | float, mean_arith: int | float,
                 maximum: int | float, stddev: int | float | None):
        """
        Create a stream statistics record.

        All numerical arguments are passed through `try_int` before they
        are stored, which, e.g., turns `4.0` into `4`. The arguments are
        then checked for mutual consistency; the first violated rule
        determines the error message.

        :param n: the sample size, must be `n >= 1`
        :param minimum: the minimum
        :param mean_arith: the arithmetic mean, must be in
            `minimum..maximum`
        :param maximum: the maximum, must equal `minimum` if `n == 1`
        :param stddev: the standard deviation, must be `None` if and only
            if `n == 1`; must be `0` if and only if `n > 1` and
            `minimum == maximum`
        :raises ValueError: if the arguments are inconsistent with each
            other
        """
        # n must be in 1..2**63
        n = check_int_range(n, "n", 1, 9_223_372_036_854_775_808)

        # normalize the minimum
        minimum = try_int(minimum)
        # normalize the maximum; a single sample is both minimum and maximum
        maximum = try_int(maximum)
        if (n == 1) and (maximum != minimum):
            raise ValueError(f"maximum ({maximum}) must equal "
                             f"minimum ({minimum}) if n=1.")
        # normalize the arithmetic mean; it must lie within the value range
        mean_arith = try_int(mean_arith)
        if n == 1:
            if mean_arith != minimum:
                raise ValueError(f"mean_arith ({mean_arith}) must equal "
                                 f"minimum ({minimum}) if n=1.")
        elif not minimum <= mean_arith <= maximum:
            raise ValueError("minimum<=mean_arith<=maximum must hold, but "
                             f"got {minimum}, {mean_arith}, and {maximum}.")

        if stddev is not None:
            stddev = try_int(stddev)
            if stddev < 0:
                raise ValueError(f"stddev must be >= 0, but is {stddev}.")
            # for n > 1, "all samples are equal" and "stddev is zero" must
            # either both hold or both not hold (hence the exclusive or)
            if (n > 1) and ((minimum == maximum) ^ (stddev == 0)):
                raise ValueError(
                    f"If stddev ({stddev}) is 0, then minimum ({minimum}) "
                    f"must equal maximum ({maximum}) and vice versa.")
        # the standard deviation is undefined exactly for a single sample
        if (stddev is None) ^ (n == 1):
            raise ValueError("If n=1, stddev=None and vice versa, but "
                             f"got n={n} and stddev={stddev}.")

        # the dataclass is frozen, so the attributes must be set by
        # bypassing its `__setattr__`
        object.__setattr__(self, "n", n)
        object.__setattr__(self, "minimum", minimum)
        object.__setattr__(self, "maximum", maximum)
        object.__setattr__(self, "mean_arith", mean_arith)
        object.__setattr__(self, "stddev", stddev)

346 

347 def __str__(self) -> str: 

348 """ 

349 Get a string representation of this object. 

350 

351 :returns: the string 

352 

353 >>> print(StreamStatistics(1, 0, 0, 0.0, None)) 

354 1;0;0;0;None 

355 

356 >>> print(StreamStatistics(10, 1, 1.5, 2, 1.2)) 

357 10;1;1.5;2;1.2 

358 """ 

359 return CSV_SEPARATOR.join(map(str, ( 

360 self.n, self.minimum, self.mean_arith, self.maximum, 

361 self.stddev))) 

362 

363 def min_mean(self) -> int | float: 

364 """ 

365 Obtain the smallest of the mean values. 

366 

367 :returns: :attr:`~StreamStatistics.mean_arith` 

368 

369 >>> StreamStatistics(1, 0, 0, 0.0, None).min_mean() 

370 0 

371 >>> StreamStatistics(2, 1, 2, 4.0, 0.2).min_mean() 

372 2 

373 """ 

374 return self.mean_arith 

375 

376 def max_mean(self) -> int | float: 

377 """ 

378 Obtain the largest of the mean values. 

379 

380 :returns: :attr:`~StreamStatistics.mean_arith` 

381 

382 

383 >>> StreamStatistics(1, 0, 0, 0.0, None).max_mean() 

384 0 

385 >>> StreamStatistics(2, 1, 2, 4.0, 0.2).max_mean() 

386 2 

387 """ 

388 return self.mean_arith 

389 

390 def compact(self, needs_n: bool = True) \ 

391 -> "int | float | StreamStatistics": 

392 """ 

393 Try to represent this object as single number, if possible. 

394 

395 :param needs_n: if this is `True`, the default, then the object is 

396 only turned into a single number if alsp `n==1`. Otherwise, `n` 

397 is ignored 

398 :returns: an integer or float if this objects minimum equals its 

399 maximum, the object itself otherwise 

400 

401 >>> s = StreamStatistics.from_single_value(10, 1) 

402 >>> s.compact() == 10 

403 True 

404 >>> s.compact() == s.compact(True) 

405 True 

406 

407 >>> s = StreamStatistics.from_single_value(10, 2) 

408 >>> s.compact() is s 

409 True 

410 >>> s.compact() == s.compact(True) 

411 True 

412 

413 >>> s = StreamStatistics.from_single_value(10, 2) 

414 >>> s.compact(False) == 10 

415 True 

416 

417 >>> s = StreamStatistics(2, 1, 3, 5, 3) 

418 >>> s.compact() is s 

419 True 

420 

421 >>> s = StreamStatistics(2, 1, 3, 5, 3) 

422 >>> s.compact(False) is s 

423 True 

424 

425 >>> try: 

426 ... s.compact(1) 

427 ... except TypeError as te: 

428 ... print(te) 

429 needs_n should be an instance of bool but is int, namely 1. 

430 

431 >>> try: 

432 ... s.compact(None) 

433 ... except TypeError as te: 

434 ... print(te) 

435 needs_n should be an instance of bool but is None. 

436 """ 

437 if not isinstance(needs_n, bool): 

438 raise type_error(needs_n, "needs_n", bool) 

439 mi: Final[int | float] = self.minimum 

440 return self if (mi < self.maximum) or ( 

441 needs_n and (self.n > 1)) else mi 

442 

443 def _key(self) -> tuple[int | float, int | float, int | float, 

444 int | float, int | float, int | float, int, int]: 

445 r""" 

446 Get a comparison and hash key. 

447 

448 This key is composed of the values for 

449 :attr:`~StreamStatistics.minimum`, `inf` (for the geometric mean), 

450 :attr:`~StreamStatistics.mean_arith`, `inf` (for the median), 

451 :attr:`~StreamStatistics.maximum`, :attr:`~StreamStatistics.stddev`, 

452 and :attr:`~StreamStatistics.n`. Any statistics value that is 

453 undefined will be turned to `inf`. The last value is a unique 

454 identifier of the object type. This is to prevent objects of type 

455 `StreamStatistics` and `SampleStatistics` to clash. Therefore, the 

456 former gets `0` as identifier, the latter gets `1`. 

457 

458 :returns: the comparison key 

459 

460 >>> StreamStatistics(2, 1, 4.0, 6, 0.2)._key() 

461 (1, inf, 4, inf, 6, 0.2, 2, 0) 

462 

463 >>> StreamStatistics(1, 0, 0, 0, None)._key() 

464 (0, 0, 0, inf, 0, inf, 1, 0) 

465 

466 >>> StreamStatistics(2, 1, 1, 1, 0)._key() 

467 (1, 1, 1, 1, 1, 0, 2, 0) 

468 

469 >>> StreamStatistics(2, 0, 0, 0, 0)._key() 

470 (0, 0, 0, inf, 0, 0, 2, 0) 

471 """ 

472 mi: Final[int | float] = self.minimum 

473 ma: Final[int | float] = self.maximum 

474 return (mi, inf if ma > mi else mi, self.mean_arith, 

475 mi if 0 < ma <= mi else inf, ma, 

476 inf if self.stddev is None else self.stddev, self.n, 0) 

477 

478 def __lt__(self, other) -> bool: 

479 """ 

480 Check if this statistics record is less than another one. 

481 

482 :param other: the other sample statistics 

483 :returns: `True` if this object is less, `False` otherwise 

484 

485 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

486 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

487 >>> s1 < s2 

488 False 

489 

490 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

491 >>> s3 < s1 

492 True 

493 >>> s1 < s3 

494 False 

495 

496 >>> try: 

497 ... s3 < 23 

498 ... except TypeError as te: 

499 ... print(str(te)[:60]) 

500 '<' not supported between instances of 'StreamStatistics' an 

501 """ 

502 return self._key() < other._key()\ 

503 if isinstance(other, StreamStatistics) else NotImplemented 

504 

505 def __le__(self, other) -> bool: 

506 """ 

507 Check if this statistics record is less than or equal to another one. 

508 

509 :param other: the other sample statistics 

510 :returns: `True` if this object is less or equal, `False` otherwise 

511 

512 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

513 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

514 >>> s1 <= s2 

515 True 

516 

517 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

518 >>> s3 <= s1 

519 True 

520 >>> s1 <= s3 

521 False 

522 

523 >>> try: 

524 ... s3 <= 23 

525 ... except TypeError as te: 

526 ... print(str(te)[:60]) 

527 '<=' not supported between instances of 'StreamStatistics' a 

528 """ 

529 return self._key() <= other._key() \ 

530 if isinstance(other, StreamStatistics) else NotImplemented 

531 

532 def __gt__(self, other) -> bool: 

533 """ 

534 Check if this statistics record is greater than another one. 

535 

536 :param other: the other sample statistics 

537 :returns: `True` if this object is greater, `False` otherwise 

538 

539 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

540 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

541 >>> s1 > s2 

542 False 

543 

544 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

545 >>> s3 > s1 

546 False 

547 >>> s1 > s3 

548 True 

549 

550 >>> try: 

551 ... s3 > 23 

552 ... except TypeError as te: 

553 ... print(str(te)[:60]) 

554 '>' not supported between instances of 'StreamStatistics' an 

555 """ 

556 return self._key() > other._key() \ 

557 if isinstance(other, StreamStatistics) else NotImplemented 

558 

559 def __ge__(self, other) -> bool: 

560 """ 

561 Check if this object is greater than or equal to another one. 

562 

563 :param other: the other sample statistics 

564 :returns: `True` if this object is greater or equal, `False` otherwise 

565 

566 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

567 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

568 >>> s1 >= s2 

569 True 

570 

571 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

572 >>> s3 >= s1 

573 False 

574 >>> s1 >= s3 

575 True 

576 

577 >>> try: 

578 ... s3 >= 23 

579 ... except TypeError as te: 

580 ... print(str(te)[:60]) 

581 '>=' not supported between instances of 'StreamStatistics' a 

582 """ 

583 return self._key() >= other._key() \ 

584 if isinstance(other, StreamStatistics) else NotImplemented 

585 

586 def __eq__(self, other) -> bool: 

587 """ 

588 Check if this statistics record equals another object. 

589 

590 :param other: the other obect 

591 :returns: `True` if this object is equal, `False` otherwise 

592 

593 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

594 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

595 >>> s1 == s2 

596 True 

597 

598 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

599 >>> s3 == s1 

600 False 

601 

602 >>> s3 == 23 

603 False 

604 """ 

605 return (isinstance(other, StreamStatistics)) and ( 

606 self._key() == other._key()) 

607 

608 def __ne__(self, other) -> bool: 

609 """ 

610 Check if this statistics record does not equal another object. 

611 

612 :param other: the other sample statistics 

613 :returns: `True` if this object is not equal, `False` otherwise 

614 

615 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

616 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2) 

617 >>> s1 != s2 

618 False 

619 

620 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2) 

621 >>> s3 != s1 

622 True 

623 

624 >>> s3 != "x" 

625 True 

626 """ 

627 return (not isinstance(other, StreamStatistics)) or ( 

628 self._key() != other._key()) 

629 

630 def __hash__(self) -> int: 

631 """ 

632 Compute the hash code of this statistics record. 

633 

634 :returns: the hash code 

635 

636 >>> hash(StreamStatistics(2, 1, 4.0, 6, 0.2)) 

637 -997568919428664316 

638 

639 >>> hash(StreamStatistics(2, -1, 4.0, 6, 0.2)) 

640 -1901621203255131428 

641 """ 

642 return hash(self._key()) 

643 

644 def get_n(self) -> int: 

645 """ 

646 Get the number :attr:`~StreamStatistics.n` of samples. 

647 

648 :returns: the number :attr:`~StreamStatistics.n` of samples. 

649 :raises TypeError: if an object of the wrong type is passed in as self 

650 

651 >>> StreamStatistics(5, 3, 6, 7, 2).get_n() 

652 5 

653 

654 >>> try: 

655 ... StreamStatistics.get_n(None) 

656 ... except TypeError as te: 

657 ... print(str(te)[:20]) 

658 self should be an in 

659 """ 

660 if not isinstance(self, StreamStatistics): 

661 raise type_error(self, "self", StreamStatistics) 

662 return self.n 

663 

664 def get_minimum(self) -> int | float: 

665 """ 

666 Get the :attr:`~StreamStatistics.minimum` of all the samples. 

667 

668 :returns: the :attr:`~StreamStatistics.minimum` of all the samples 

669 :raises TypeError: if an object of the wrong type is passed in as self 

670 

671 >>> StreamStatistics(5, 3, 4, 6, 2).get_minimum() 

672 3 

673 

674 >>> try: 

675 ... StreamStatistics.get_minimum(None) 

676 ... except TypeError as te: 

677 ... print(str(te)[:20]) 

678 self should be an in 

679 """ 

680 if not isinstance(self, StreamStatistics): 

681 raise type_error(self, "self", StreamStatistics) 

682 return self.minimum 

683 

684 def get_maximum(self) -> int | float: 

685 """ 

686 Get the :attr:`~StreamStatistics.maximum` of all the samples. 

687 

688 :returns: the :attr:`~StreamStatistics.maximum` of all the samples 

689 :raises TypeError: if an object of the wrong type is passed in as self 

690 

691 >>> StreamStatistics(5, 3, 6, 7, 2).get_maximum() 

692 7 

693 

694 >>> try: 

695 ... StreamStatistics.get_maximum(None) 

696 ... except TypeError as te: 

697 ... print(str(te)[:20]) 

698 self should be an in 

699 """ 

700 if not isinstance(self, StreamStatistics): 

701 raise type_error(self, "self", StreamStatistics) 

702 return self.maximum 

703 

704 def get_mean_arith(self) -> int | float: 

705 """ 

706 Get the arithmetic mean (:attr:`~StreamStatistics.mean_arith`). 

707 

708 :returns: the arithmetic mean (:attr:`~StreamStatistics.mean_arith`) 

709 of all the samples. 

710 :raises TypeError: if an object of the wrong type is passed in as self 

711 

712 >>> StreamStatistics(5, 3, 6, 7, 2).get_mean_arith() 

713 6 

714 

715 >>> try: 

716 ... StreamStatistics.get_mean_arith(None) 

717 ... except TypeError as te: 

718 ... print(str(te)[:20]) 

719 self should be an in 

720 """ 

721 if not isinstance(self, StreamStatistics): 

722 raise type_error(self, "self", StreamStatistics) 

723 return self.mean_arith 

724 

725 def get_median(self) -> int | float | None: 

726 """ 

727 Get the median of all the samples. 

728 

729 :returns: This object type does not store the media. However, if 

730 the minimum is the same as the maximum, the median will have that 

731 same value, too, so it is returned. Otherwise, this method returns 

732 `None`. This method will be overridden. 

733 :raises TypeError: if an object of the wrong type is passed in as self 

734 

735 >>> print(StreamStatistics(5, 3, 6, 7, 2).get_median()) 

736 None 

737 

738 >>> print(StreamStatistics(5, -3, -3.0, -3, 0).get_median()) 

739 -3 

740 

741 >>> try: 

742 ... StreamStatistics.get_median(None) 

743 ... except TypeError as te: 

744 ... print(str(te)[:20]) 

745 self should be an in 

746 """ 

747 if not isinstance(self, StreamStatistics): 

748 raise type_error(self, "self", StreamStatistics) 

749 return self.minimum if self.minimum >= self.maximum else None 

750 

751 def get_mean_geom(self) -> int | float | None: 

752 """ 

753 Get the geometric mean of all the samples. 

754 

755 This class does not offer storing the geometric mean. This means 

756 that this method will usually return `None`. The only situation 

757 where it will not return `None` is if the geometric mean can be 

758 inferred by definition, namely if the minimum and maximum value 

759 are the same and positive. Subclasses will override this method to 

760 return meaningful values. 

761 

762 :returns: the geometric mean of all the samples, `None` if the 

763 geometric mean is not defined. 

764 :raises TypeError: if an object of the wrong type is passed in as self 

765 

766 >>> print(StreamStatistics(5, 3, 6, 7, 2).get_mean_geom()) 

767 None 

768 

769 >>> print(StreamStatistics(5, 2, 2, 2, 0).get_mean_geom()) 

770 2 

771 

772 >>> try: 

773 ... StreamStatistics.get_mean_geom(None) 

774 ... except TypeError as te: 

775 ... print(str(te)[:20]) 

776 self should be an in 

777 """ 

778 if not isinstance(self, StreamStatistics): 

779 raise type_error(self, "self", StreamStatistics) 

780 mi: Final[int | float] = self.minimum 

781 return mi if 0 < self.maximum <= mi else None 

782 

783 def get_stddev(self) -> int | float | None: 

784 """ 

785 Get the standard deviation mean (:attr:`~StreamStatistics.stddev`). 

786 

787 :returns: the standard deviation (:attr:`~StreamStatistics.stddev`) 

788 of all the samples, `None` if the standard deviation is not 

789 defined, i.e., if there is only a single sample 

790 :raises TypeError: if an object of the wrong type is passed in as self 

791 

792 >>> StreamStatistics(5, 3, 6, 7, 2).get_stddev() 

793 2 

794 

795 >>> try: 

796 ... StreamStatistics.get_stddev(None) 

797 ... except TypeError as te: 

798 ... print(str(te)[:20]) 

799 self should be an in 

800 """ 

801 if not isinstance(self, StreamStatistics): 

802 raise type_error(self, "self", StreamStatistics) 

803 return self.stddev 

804 

805 @classmethod 

806 def aggregate(cls) -> StreamStatisticsAggregate["StreamStatistics"]: 

807 """ 

808 Get an aggregate suitable for this statistics type. 

809 

810 :return: the aggregate 

811 

812 >>> ag = StreamStatistics.aggregate() 

813 >>> ag.update((1, 2, 3, 4)) 

814 >>> ag.result() 

815 StreamStatistics(n=4, minimum=1, mean_arith=2.5, maximum=4, \ 

816stddev=1.2909944487358056) 

817 >>> ag.reset() 

818 >>> ag.add(4) 

819 >>> ag.add(5) 

820 >>> ag.add(6) 

821 >>> ag.add(7) 

822 >>> ag.result() 

823 StreamStatistics(n=4, minimum=4, mean_arith=5.5, maximum=7, \ 

824stddev=1.2909944487358056) 

825 """ 

826 return _StreamStats() 

827 

828 @classmethod 

829 def from_samples(cls, source: Iterable[ 

830 int | float | None]) -> "StreamStatistics": 

831 """ 

832 Create a statistics record from a stream of samples. 

833 

834 :return: the statistics record. 

835 

836 >>> StreamStatistics.from_samples((1, 2, 3, 4)) 

837 StreamStatistics(n=4, minimum=1, mean_arith=2.5, maximum=4, \ 

838stddev=1.2909944487358056) 

839 """ 

840 agg: Final[StreamStatisticsAggregate] = cls.aggregate() 

841 agg.update(source) 

842 return agg.result() 

843 

844 @classmethod 

845 def from_single_value(cls, value: Union[ 

846 int, float, "StreamStatistics"], n: int = 1) -> "StreamStatistics": 

847 r""" 

848 Create a sample statistics from a single number. 

849 

850 :param value: the single value 

851 :param n: the number of samples, i.e., the number of times this value 

852 occurred 

853 :returns: the sample statistics 

854 

855 >>> print(str(StreamStatistics.from_single_value(23))) 

856 1;23;23;23;None 

857 

858 >>> s = StreamStatistics.from_single_value(10, 2) 

859 >>> print(s.stddev) 

860 0 

861 >>> s.minimum == s.maximum == s.mean_arith == 10 

862 True 

863 >>> s is StreamStatistics.from_single_value(s, s.n) 

864 True 

865 

866 >>> s = StreamStatistics.from_single_value(10, 1) 

867 >>> print(s.stddev) 

868 None 

869 >>> s.minimum == s.maximum == s.mean_arith == 10 

870 True 

871 >>> s is StreamStatistics.from_single_value(s, s.n) 

872 True 

873 

874 >>> s = StreamStatistics.from_single_value(-10, 2) 

875 >>> print(s.stddev) 

876 0 

877 >>> s.minimum == s.maximum == s.mean_arith == -10 

878 True 

879 >>> s is StreamStatistics.from_single_value(s, s.n) 

880 True 

881 

882 >>> s = StreamStatistics.from_single_value(-10, 1) 

883 >>> print(s.stddev) 

884 None 

885 >>> s.minimum == s.maximum == s.mean_arith == -10 

886 True 

887 >>> s is StreamStatistics.from_single_value(s, s.n) 

888 True 

889 

890 >>> s = StreamStatistics.from_single_value(10.5, 2) 

891 >>> print(s.stddev) 

892 0 

893 >>> s.minimum == s.maximum == s.mean_arith == 10.5 

894 True 

895 >>> s is StreamStatistics.from_single_value(s, s.n) 

896 True 

897 

898 >>> s = StreamStatistics.from_single_value(10.5, 1) 

899 >>> print(s.stddev) 

900 None 

901 >>> s.minimum == s.maximum == s.mean_arith == 10.5 

902 True 

903 >>> s is StreamStatistics.from_single_value(s, s.n) 

904 True 

905 

906 >>> s = StreamStatistics.from_single_value(-10.5, 2) 

907 >>> print(s.stddev) 

908 0 

909 >>> s.minimum == s.maximum == s.mean_arith == -10.5 

910 True 

911 >>> s is StreamStatistics.from_single_value(s, s.n) 

912 True 

913 

914 >>> s = StreamStatistics.from_single_value(-10.5, 1) 

915 >>> print(s.stddev) 

916 None 

917 >>> s.minimum == s.maximum == s.mean_arith == -10.5 

918 True 

919 >>> s is StreamStatistics.from_single_value(s, s.n) 

920 True 

921 

922 >>> try: 

923 ... StreamStatistics.from_single_value(None) 

924 ... except TypeError as te: 

925 ... print(str(te)[:20]) 

926 value should be an i 

927 

928 >>> try: 

929 ... StreamStatistics.from_single_value("a") 

930 ... except TypeError as te: 

931 ... print(str(te)[:20]) 

932 value should be an i 

933 

934 >>> try: 

935 ... StreamStatistics.from_single_value(1, None) 

936 ... except TypeError as te: 

937 ... print(str(te)[:20]) 

938 n should be an insta 

939 

940 >>> try: 

941 ... StreamStatistics.from_single_value(1, "a") 

942 ... except TypeError as te: 

943 ... print(str(te)[:20]) 

944 n should be an insta 

945 

946 >>> try: 

947 ... StreamStatistics.from_single_value(s, 12) 

948 ... except ValueError as ve: 

949 ... print(str(ve)[:20]) 

950 Incompatible numbers 

951 

952 >>> try: 

953 ... StreamStatistics.from_single_value(inf) 

954 ... except ValueError as ve: 

955 ... print(str(ve)[:20]) 

956 value=inf is not fin 

957 """ 

958 n = check_int_range(n, "n", 1, 1_000_000_000_000_000_000) 

959 if isinstance(value, StreamStatistics): 

960 if value.n == n: 

961 return value 

962 raise ValueError( # noqa: TRY004 

963 f"Incompatible numbers of values {n} and {value}.") 

964 if not isinstance(value, int | float): 

965 raise type_error(value, "value", (int, float, StreamStatistics)) 

966 if not isfinite(value): 

967 raise ValueError(f"value={value} is not finite.") 

968 return StreamStatistics( 

969 n=n, minimum=value, mean_arith=value, maximum=value, 

970 stddev=None if n <= 1 else 0) 

971 

@classmethod
def getter(cls, dimension: str) -> Callable[[
        "StreamStatistics"], int | float | None]:
    """
    Get a function returning the dimension from :class:`StreamStatistics`.

    The returned getter function expects that it receives a valid
    :class:`StreamStatistics` instance as parameter, or an instance of the
    subclass you called :meth:`StreamStatistics.getter` on. If you pass in
    `None`, then this will raise a `TypeError`. If you are in a situation
    where `None` is possible, use the function
    :meth:`StreamStatistics.getter_or_none` instead, which will return
    `None` in such a case.

    The table mapping dimension names to getter functions is built on the
    first call and then cached separately on each class, so that a subclass
    always resolves to its own getter methods.

    :param dimension: the dimension
    :returns: a :class:`Callable` that returns the value corresponding to
        the dimension
    :raises TypeError: if `dimension` is not a string
    :raises ValueError: if `dimension` is unknown

    >>> StreamStatistics.getter(KEY_N) is StreamStatistics.get_n
    True
    >>> (StreamStatistics.getter(KEY_MINIMUM) is
    ...     StreamStatistics.get_minimum)
    True
    >>> (StreamStatistics.getter(KEY_MEAN_ARITH) is
    ...     StreamStatistics.get_mean_arith)
    True
    >>> (StreamStatistics.getter(KEY_MEAN_GEOM) is
    ...     StreamStatistics.get_mean_geom)
    True
    >>> (StreamStatistics.getter(KEY_MAXIMUM) is
    ...     StreamStatistics.get_maximum)
    True
    >>> (StreamStatistics.getter(KEY_MEDIAN) is
    ...     StreamStatistics.get_median)
    True
    >>> (StreamStatistics.getter(KEY_STDDEV) is
    ...     StreamStatistics.get_stddev)
    True

    >>> s = StreamStatistics(5, 3, 6, 7, 2)
    >>> StreamStatistics.getter(KEY_N)(s)
    5
    >>> StreamStatistics.getter(KEY_MINIMUM)(s)
    3
    >>> StreamStatistics.getter(KEY_MEAN_ARITH)(s)
    6
    >>> print(StreamStatistics.getter(KEY_MEAN_GEOM)(s))
    None
    >>> StreamStatistics.getter(KEY_MAXIMUM)(s)
    7
    >>> StreamStatistics.getter(KEY_STDDEV)(s)
    2
    >>> print(StreamStatistics.getter(KEY_MEDIAN)(s))
    None

    >>> try:
    ...     StreamStatistics.getter(KEY_N)(None)
    ... except TypeError as te:
    ...     print(str(te)[:20])
    self should be an in

    >>> try:
    ...     StreamStatistics.getter(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'NoneType' \
object

    >>> try:
    ...     StreamStatistics.getter(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     StreamStatistics.getter("hello")
    ... except ValueError as ve:
    ...     print(str(ve)[-18:])
    dimension 'hello'.
    """
    tbl_name: Final[str] = "___cls_getters"
    # Only consult the namespace of `cls` itself. `hasattr`/`getattr` would
    # also find a table that was cached on a base class, and that table
    # holds the getters of the base class instead of those of `cls`.
    getters = cast("Callable | None", vars(cls).get(tbl_name))
    if getters is None:
        # Build the lookup table once; we store the bound `dict.get` so
        # that a lookup is a single call returning `None` if unknown.
        getters = {
            KEY_N: cls.get_n, KEY_MINIMUM: cls.get_minimum,
            "minimum": cls.get_minimum,
            KEY_MEAN_ARITH: cls.get_mean_arith,
            "mean_arith": cls.get_mean_arith,
            "arithmetic mean": cls.get_mean_arith,
            "average": cls.get_mean_arith, KEY_MEDIAN: cls.get_median,
            "median": cls.get_median, KEY_MEAN_GEOM: cls.get_mean_geom,
            "mean_geom": cls.get_mean_geom,
            "geometric mean": cls.get_mean_geom,
            "gmean": cls.get_mean_geom, KEY_MAXIMUM: cls.get_maximum,
            "maximum": cls.get_maximum, KEY_STDDEV: cls.get_stddev,
            "stddev": cls.get_stddev,
            "standard deviation": cls.get_stddev}.get
        setattr(cls, tbl_name, getters)

    # `str.strip` doubles as type check: it raises a `TypeError` if
    # `dimension` is not a string.
    result: Callable[[StreamStatistics], int | float | None] | None \
        = getters(str.strip(dimension), None)
    if result is None:
        raise ValueError(f"Unknown {cls} dimension {dimension!r}.")
    return result

1079 

@classmethod
def getter_or_none(cls, dimension: str) -> Callable[[
        Union["StreamStatistics", None]], int | float | None]:
    """
    Obtain a getter that returns `None` if the statistics is `None`.

    With this method, you can get a function which returns a value from a
    statistics object if the object is not `None`. If `None` is provided,
    then the function also returns `None`.

    This is especially useful if you work with something like
    :meth:`~StreamStatisticsAggregate.result_or_none`.

    If your data should never be `None`, then better use
    :meth:`StreamStatistics.getter` instead, which returns getter
    functions that raise `TypeError`s if their input is `None`.

    The created functions are cached per class and per (stripped)
    dimension name, so asking twice for the same dimension on the same
    class returns the same function object.

    :param dimension: the dimension
    :return: the getter

    >>> ss = StreamStatistics(10, 1, 2, 3, 4)
    >>> g = StreamStatistics.getter_or_none(KEY_MINIMUM)
    >>> g(ss)
    1
    >>> print(g(None))
    None
    >>> StreamStatistics.getter_or_none(KEY_MINIMUM) is g
    True

    >>> g = StreamStatistics.getter_or_none(KEY_MAXIMUM)
    >>> g(ss)
    3
    >>> print(g(None))
    None
    """
    tbl_name: Final[str] = "___cls_getters_or_none"
    # Only consult the namespace of `cls` itself. `hasattr`/`getattr` would
    # also find the cache of a base class, whose wrappers delegate to the
    # getters of that base class instead of those of `cls`.
    getters = cast("dict | None", vars(cls).get(tbl_name))
    if getters is None:
        getters = {}
        setattr(cls, tbl_name, getters)

    dimension = str.strip(dimension)
    if dimension in getters:
        return getters[dimension]

    # The strict getter is resolved once, when the wrapper is created, and
    # bound as default argument, so an unknown dimension fails right here.
    def __getter(x, _y=cls.getter(dimension)) -> int | float | None:
        return None if x is None else _y(x)

    getters[dimension] = __getter
    return cast("Callable", __getter)

1131 

1132 

class _StreamStats(StreamStatisticsAggregate[StreamStatistics]):
    """
    The internal aggregate producing :class:`StreamStatistics` records.

    Mean and variance are maintained incrementally with Welford's
    algorithm, i.e., only a sample counter, the running mean, a running
    sum for the variance, and the minimum and maximum are stored.

    1. Donald E. Knuth (1998). The Art of Computer Programming, volume 2:
       Seminumerical Algorithms, 3rd edn., p. 232. Boston: Addison-Wesley.
    2. B. P. Welford (1962). "Note on a method for calculating corrected sums
       of squares and products". Technometrics 4(3):419-420.

    >>> ss = _StreamStats()
    >>> data1 = [4, 7, 13, 16]
    >>> ss.update(data1)
    >>> ss.result()
    StreamStatistics(n=4, minimum=4, mean_arith=10, maximum=16, \
stddev=5.477225575051661)

    >>> data2 = [1e8 + z for z in data1]
    >>> ss.reset()
    >>> ss.update(data2)
    >>> ss.result()
    StreamStatistics(n=4, minimum=100000004, mean_arith=100000010, \
maximum=100000016, stddev=5.477225575051661)

    >>> data3 = [1e14 + z for z in data1]
    >>> ss.reset()
    >>> ss.update(data3)
    >>> ss.result()
    StreamStatistics(n=4, minimum=100000000000004, \
mean_arith=100000000000010, maximum=100000000000016, stddev=5.477225575051661)

    >>> data3 = [z for z in range(1001)]
    >>> ss.reset()
    >>> ss.update(data3)
    >>> ss.result()
    StreamStatistics(n=1001, minimum=0, mean_arith=500, maximum=1000, \
stddev=289.10811126635656)
    """

    def __init__(self) -> None:
        """Create an empty aggregate that has not seen any value yet."""
        #: the minimum
        self.__min: int | float = inf
        #: the maximum
        self.__max: int | float = -inf
        #: the number of samples seen
        self.__n: int = 0
        #: the last mean result
        self.__mean: int | float = 0
        #: the running sum for the variance
        self.__var: int | float = 0

    def reset(self) -> None:
        """Discard all collected data and return to the initial state."""
        self.__min = inf
        self.__max = -inf
        self.__n = 0
        self.__mean = 0
        self.__var = 0

    def add(self, value: int | float) -> None:
        """
        Include one more value into the running statistics.

        :param value: the value
        """
        # Convert first: nothing is modified if the value is rejected.
        value = try_int(value)  # try to sum ints, check type and non-finite

        count: Final[int] = self.__n + 1
        self.__n = count

        # One step of Welford's update: the variance sum uses the distance
        # to the mean before and after the mean has been moved.
        old_mean: Final[int | float] = self.__mean
        delta: Final[int | float] = value - old_mean
        new_mean: Final[int | float] = old_mean + (delta / count)
        self.__mean = new_mean
        self.__var += delta * (value - new_mean)

        if value < self.__min:
            self.__min = value
        if value > self.__max:
            self.__max = value

    def result(self) -> StreamStatistics:
        """
        Get the statistics record over all values added so far.

        The mean is clamped into the range spanned by minimum and maximum.
        The standard deviation is `None` for at most one sample and `0` if
        minimum and maximum coincide.

        :return: the :class:`StreamStatistics` describing the collected data
        """
        count: Final[int] = self.__n
        lowest: Final[int | float] = self.__min
        highest: Final[int | float] = self.__max
        mean: Final[int | float] = max(lowest, min(highest, self.__mean))

        spread: int | float | None
        if count <= 1:
            spread = None
        elif highest <= lowest:
            spread = 0
        else:
            spread = sqrt(try_float_int_div(self.__var, count - 1))
        return StreamStatistics(count, lowest, mean, highest, spread)

    def result_or_none(self) -> StreamStatistics | None:
        """
        Get the result if any data was collected, otherwise `None`.

        :return: The return value of :meth:`result` if any data was collected,
            otherwise `None`
        """
        if self.__n > 0:
            return self.result()
        return None

1234 

1235 

class CsvReader(CsvReaderBase[StreamStatistics]):
    """
    A csv parser turning rows into :class:`StreamStatistics` records.

    >>> from pycommons.io.csv import csv_read
    >>> csv = ["n;min;mean;max;sd",
    ...        "3;2;3;10;5", "6;2;;;0", "1;;2", "3;;;0;",
    ...        "4;5;12;33;7"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    3;2;3;10;5
    6;2;2;2;0
    1;2;2;2;None
    3;0;0;0;0
    4;5;12;33;7

    >>> csv = ["value", "1", "3", "0", "-5", "7"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;1;1;1;None
    1;3;3;3;None
    1;0;0;0;None
    1;-5;-5;-5;None
    1;7;7;7;None

    >>> csv = ["n;m;sd", "1;3;", "3;5;0"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;3;3;3;None
    3;5;5;5;0

    >>> csv = ["n;m", "1;3", "3;5"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;3;3;3;None
    3;5;5;5;0
    """

    def __init__(self, columns: dict[str, int]) -> None:
        """
        Create a CSV parser for :class:`StreamStatistics`.

        The columns for `n`, minimum, arithmetic mean, maximum, and standard
        deviation are looked up by their keys. If none of minimum, mean, and
        maximum is present but exactly one other column remains, then that
        column is used as the value column. If only one value column exists
        and there is no standard deviation column, every row is parsed as
        a single-value record.

        :param columns: the columns
        :raises ValueError: if no usable value column exists or if more
            than one unrecognized column remains

        >>> try:
        ...     CsvReader(None)
        ... except TypeError as te:
        ...     print(te)
        columns should be an instance of dict but is None.

        >>> try:
        ...     CsvReader(1)
        ... except TypeError as te:
        ...     print(te)
        columns should be an instance of dict but is int, namely 1.

        >>> try:
        ...     CsvReader(dict())
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {}.

        >>> try:
        ...     CsvReader({"a": 1, "b": 2})
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {'a': 1, 'b': 2}.

        >>> try:
        ...     CsvReader({KEY_N: 1, "b": 2, "c": 3})
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {'b': 2, 'c': 3}.

        >>> try:
        ...     CsvReader({KEY_MINIMUM: 1, "b": 2, "c": 3})
        ... except ValueError as ve:
        ...     print(ve)
        Found strange keys in {'b': 2, 'c': 3}.
        """
        super().__init__(columns)

        #: the index of the number of elements
        self.idx_n: Final[int | None] = csv_column_or_none(
            columns, KEY_N)

        # Query the value columns in a fixed order. As the doctests above
        # show, recognized keys no longer appear in `columns` afterwards.
        col_min: int | None = csv_column_or_none(columns, KEY_MINIMUM)
        col_mean: int | None = csv_column_or_none(
            columns, KEY_MEAN_ARITH)
        col_max: int | None = csv_column_or_none(columns, KEY_MAXIMUM)

        #: the index for the standard deviation
        self.__idx_sd: Final[int | None] = csv_column_or_none(
            columns, KEY_STDDEV)

        # the indices of those value columns that are actually present
        found: Final[list] = [
            col for col in (col_min, col_mean, col_max) if col is not None]
        if not found:
            # No known value column: accept one single leftover column.
            if dict.__len__(columns) != 1:
                raise ValueError(f"No useful keys remain in {columns!r}.")
            col_min = csv_column(
                columns, next(iter(columns.keys())), True)
            found.append(col_min)
        if dict.__len__(columns) > 1:
            raise ValueError(f"Found strange keys in {columns!r}.")

        #: is this a parser for single number statistics?
        self.__is_single: Final[bool] = (self.__idx_sd is None) and (
            list.__len__(found) == 1)
        if self.__is_single:
            # the one available column supplies all three values
            col_min = col_mean = col_max = found[-1]

        #: the index of the minimum
        self.__idx_min: int | None = col_min
        #: the index for the arithmetic mean
        self.__idx_mean_arith: int | None = col_mean
        #: the index for the maximum
        self.__idx_max: int | None = col_max

    def parse_row(self, data: list[str]) -> StreamStatistics:
        """
        Parse a row of data.

        Missing values are filled in from the values that are present: a
        missing minimum is taken from the mean or else the maximum, a
        missing mean or maximum from the minimum, and a missing standard
        deviation becomes `0` if `n > 1` and `None` otherwise.

        :param data: the data row
        :returns: the stream statistics
        :raises ValueError: if neither minimum, mean, nor maximum is given

        >>> cc = CsvReader({KEY_MINIMUM: 0, KEY_MEAN_ARITH: 1, KEY_MAXIMUM: 2,
        ...                 KEY_STDDEV: 3, KEY_N: 4})
        >>> try:
        ...     cc.parse_row([None, None, None, None, "5"])
        ... except ValueError as ve:
        ...     print(str(ve)[:20])
        No value defined for
        """
        # without an `n` column, every row stands for one single sample
        n: Final[int] = int(data[self.idx_n]) \
            if self.idx_n is not None else 1
        mi: int | float | None = csv_val_or_none(
            data, self.__idx_min, str_to_num)

        if self.__is_single:
            return StreamStatistics(
                n=n, minimum=mi, mean_arith=mi,
                maximum=mi, stddev=0 if n > 1 else None)

        ar: int | float | None = csv_val_or_none(
            data, self.__idx_mean_arith, str_to_num)
        ma: int | float | None = csv_val_or_none(
            data, self.__idx_max, str_to_num)
        sd: int | float | None = csv_val_or_none(
            data, self.__idx_sd, str_to_num)

        if mi is None:
            # fall back to the mean first, then to the maximum
            mi = ma if ar is None else ar
            if mi is None:
                raise ValueError(
                    f"No value defined for min@{self.__idx_min}={mi}, mean@"
                    f"{self.__idx_mean_arith}={ar}, max@"
                    f"{self.__idx_max}={ma} defined in {data!r}.")
        if ar is None:
            ar = mi
        if ma is None:
            ma = mi
        if sd is None:
            sd = 0 if n > 1 else None
        return StreamStatistics(
            n=n, minimum=mi, mean_arith=ar, maximum=ma, stddev=sd)

    def parse_optional_row(self, data: list[str] | None) \
            -> StreamStatistics | None:
        """
        Parse a row of data that may be empty.

        The row counts as empty if the cells of all available minimum, mean,
        and maximum columns are empty strings.

        :param data: the row of data that may be empty
        :returns: the stream statistic, if the row contains data, else `None`

        >>> print(CsvReader.parse_optional_row(None, ["1"]))
        None
        >>> print(CsvReader.parse_optional_row(CsvReader({"v": 0}), ["1"]))
        1;1;1;1;None
        >>> print(CsvReader.parse_optional_row(CsvReader({"v": 0}), [""]))
        None
        """
        if (self is None) or (data is None):
            return None  # trick to make this method usable pseudo-static
        for idx in (self.__idx_min, self.__idx_mean_arith, self.__idx_max):
            if (idx is not None) and (str.__len__(data[idx]) > 0):
                return self.parse_row(data)
        return None

1437 return None 

1438 

1439 

1440class CsvWriter(CsvWriterBase[T]): 

1441 """A class for CSV writing of :class:`StreamStatistics`.""" 

1442 

def __init__(self,
             data: Iterable[T],
             scope: str | None = None,
             n_not_needed: bool = False,
             what_short: str | None = None,
             what_long: str | None = None,
             clazz: type[T] = cast("type[T]", StreamStatistics)) -> None:
    """
    Initialize the csv writer.

    The data is scanned once to decide which columns are written: if no
    record has `minimum < maximum`, a single value column replaces all
    statistics columns; median and geometric mean columns are only used
    if at least one record provides them and `clazz` is not
    :class:`StreamStatistics` itself.

    :param data: the data to use
    :param scope: the prefix to be pre-pended to all columns
    :param n_not_needed: should we omit the `n` column?
    :param what_short: the short description of what the statistics is
        about
    :param what_long: the long description of what the statistics is about
    :param clazz: the stream statistics type
    :raises TypeError: if `clazz`, `n_not_needed`, or a data element has
        the wrong type
    :raises ValueError: if `data` is empty

    >>> try:
    ...     CsvWriter([], None, n_not_needed=None)
    ... except TypeError as te:
    ...     print(te)
    n_not_needed should be an instance of bool but is None.

    >>> try:
    ...     CsvWriter([], clazz=str)
    ... except TypeError as te:
    ...     print(str(te)[:20])
    clazz should be an i

    >>> try:
    ...     CsvWriter([])
    ... except ValueError as ve:
    ...     s = str(ve)
    ...     print(s[s.index(' ') + 1:])
    CsvWriter did not see any data.

    >>> try:
    ...     CsvWriter([1])
    ... except TypeError as te:
    ...     print(str(te)[:29])
    data[0] should be an instance
    """
    super().__init__(data, scope)

    if not issubclass(clazz, StreamStatistics):
        raise type_error(clazz, "clazz", type[StreamStatistics])
    #: the internal type
    self.__cls: Final[type[StreamStatistics]] = clazz

    if not isinstance(n_not_needed, bool):
        raise type_error(n_not_needed, "n_not_needed", bool)

    # Each flag below is an assumption that permits a more compact output.
    # A flag is dropped as soon as one record contradicts it.
    # 1. All records have minimum == maximum: one value column suffices.
    all_same: bool = True
    # 2. No record has a geometric mean: that column can be omitted.
    no_geom: bool = True
    # 3. No record has a median: that column can be omitted.
    no_median: bool = True
    # 4. All records have n == 1; only examined if `n_not_needed` is set.
    # NOTE(review): this flag starts as `n_not_needed` and is OR-ed with
    # `n_not_needed` after the scan, so the scan of `n` cannot change
    # whether the `n` column is written - confirm that this is intended.
    only_n1: bool = n_not_needed
    # the number of samples seen
    seen: int = 0

    for index, record in enumerate(data):  # Iterate over the data.
        if not isinstance(record, clazz):
            raise type_error(record, f"data[{index}]", clazz)
        seen += 1
        if only_n1 and (record.n != 1):
            only_n1 = False
        if all_same and (record.minimum < record.maximum):
            all_same = False
        if no_geom and (record.get_mean_geom() is not None):
            no_geom = False
        if no_median and (record.get_median() is not None):
            no_median = False
        # Once every assumption is refuted, nothing more can be learned.
        if not (only_n1 or all_same or no_geom or no_median):
            break

    if seen <= 0:
        raise ValueError(
            f"{type_name(self.__cls)} CsvWriter did not see any data.")

    # stream statistics do not have geometric means or medians
    if self.__cls is StreamStatistics:
        no_geom = no_median = True

    omit_n: Final[bool] = only_n1 or n_not_needed
    #: do we have a geometric mean?
    with_geom: Final[bool] = not (no_geom or all_same)
    #: do we have a median?
    with_median: Final[bool] = not (no_median or all_same)

    #: the key for `n` is `None` if `n` is not printed, else it is the key
    self.__key_n: Final[str | None] = \
        csv_scope(scope, KEY_N) if not omit_n else None

    key_all: str | None = None
    key_min: str | None = None
    key_mean_arith: str | None = None
    key_med: str | None = None
    key_max: str | None = None
    key_mean_geom: str | None = None
    key_sd: str | None = None

    if not all_same:
        key_min = csv_scope(scope, KEY_MINIMUM)
        key_mean_arith = csv_scope(scope, KEY_MEAN_ARITH)
        if with_median:
            key_med = csv_scope(scope, KEY_MEDIAN)
        key_max = csv_scope(scope, KEY_MAXIMUM)
        if with_geom:
            key_mean_geom = csv_scope(scope, KEY_MEAN_GEOM)
        key_sd = csv_scope(scope, KEY_STDDEV)
    elif scope is None:
        key_all = KEY_VALUE
    else:
        # without an `n` column, no key is passed on to `csv_scope`
        key_all = csv_scope(
            scope, KEY_VALUE if self.__key_n is not None else None)

    #: the key for single values
    self.__key_all: Final[str | None] = key_all
    #: the key for minimum values
    self.__key_min: Final[str | None] = key_min
    #: the key for the arithmetic mean
    self.__key_mean_arith: Final[str | None] = key_mean_arith
    #: the key for the median
    self.__key_med: Final[str | None] = key_med
    #: the key for the geometric mean
    self.__key_mean_geom: Final[str | None] = key_mean_geom
    #: the key for the maximum value
    self.__key_max: Final[str | None] = key_max
    #: the key for the standard deviation
    self.__key_sd: Final[str | None] = key_sd

    # Normalize the two descriptions: if only one is given, it serves as
    # both; if both are given, the long one gets the short one appended.
    long_name: str | None = \
        str.strip(what_long) if what_long is not None else None
    short_name: str | None = \
        str.strip(what_short) if what_short is not None else None
    if (long_name is not None) and (short_name is not None):
        long_name = f"{long_name} ({short_name})"
    elif long_name is None:
        long_name = short_name
    else:
        short_name = long_name

    #: the short description of what the statistics are about
    self.__short_name: Final[str | None] = short_name
    #: the long description of what the statistics are about
    self.__long_name: Final[str | None] = long_name

1610 

def get_column_titles(self) -> Iterable[str]:
    """
    Get the column titles.

    The title of the `n` column comes first, if that column is written.
    It is followed either by the single value column or by the columns
    for minimum, arithmetic mean, median (optional), geometric mean
    (optional), maximum, and standard deviation.

    :returns: the column titles
    """
    if self.__key_n is not None:
        yield self.__key_n

    if self.__key_all is not None:
        # compact form: one column holds the value of all statistics
        yield self.__key_all
        return

    yield self.__key_min
    yield self.__key_mean_arith
    for optional_key in (self.__key_med, self.__key_mean_geom):
        if optional_key is not None:
            yield optional_key
    yield self.__key_max
    yield self.__key_sd

1631 

def get_optional_row(self,
                     data: int | float | T | None,
                     n: int | None = None) -> Iterable[str]:
    """
    Render a row for a record that may be missing.

    This function may be needed in cases where the statistics are part of
    other records that sometimes do not contain the record. If `data` is
    `None`, one empty string per column is produced. A single number is
    first converted to a statistics record over `n` samples (one sample
    if `n` is `None`).

    :param data: the data item
    :param n: the number of samples
    :returns: the optional row data
    :raises TypeError: if `data` is neither a number, a statistics record,
        nor `None`
    :raises ValueError: if `n` is given but differs from `data.n`

    >>> try:
    ...     list(CsvWriter([StreamStatistics.from_single_value(
    ...         1)]).get_optional_row("x"))
    ... except TypeError as te:
    ...     print(str(te)[:53])
    data should be an instance of any in {None, float, in
    """
    if data is None:
        # count the columns that a regular row would have
        if self.__key_all is not None:
            width: int = 1
        else:
            width = 4  # minimum, arithmetic mean, maximum, std. deviation
            if self.__key_mean_geom is not None:
                width += 1
            if self.__key_med is not None:
                width += 1
        if self.__key_n is not None:
            width += 1
        yield from [""] * width
        return

    if isinstance(data, int | float):  # convert single value
        samples: Final[int] = n if n is not None else 1
        data = cast("T", self.__cls.from_single_value(data, samples))
    elif not isinstance(data, StreamStatistics):  # huh?
        raise type_error(data, "data", (
            int, float, StreamStatistics, None))
    elif (n is not None) and (n != data.n):  # sanity check
        raise ValueError(f"data.n={data.n} but n={n}.")
    yield from self.get_row(data)

1669 

def get_row(self, data: T) -> Iterable[str]:
    """
    Render a single statistics record to a CSV row.

    The cells are produced in the order of the titles returned by
    :meth:`get_column_titles`.

    :param data: the statistics record
    :returns: the row iterator
    :raises ValueError: if only a single value column is written but the
        minimum and maximum of `data` differ
    """
    if self.__key_n is not None:
        yield str(data.n)

    if self.__key_all is not None:
        # compact form: only valid if the record really is one value
        if data.minimum != data.maximum:
            raise ValueError(f"Inconsistent data {data}.")
        yield num_to_str(data.minimum)
        return

    yield num_to_str(data.minimum)
    yield num_to_str(data.mean_arith)
    if self.__key_med is not None:
        yield num_to_str(data.get_median())
    if self.__key_mean_geom is not None:
        yield num_or_none_to_str(data.get_mean_geom())
    yield num_to_str(data.maximum)
    yield num_or_none_to_str(data.stddev)

1692 

def get_header_comments(self) -> Iterable[str]:
    """
    Get any possible header comments.

    A single comment line is produced only if both a scope and a long
    name are available; otherwise there are no header comments.

    :returns: the iterable of header comments
    """
    if (self.scope is None) or (self.__long_name is None):
        return ()
    return [f"Statistics about {self.__long_name}."]

1702 

def get_footer_comments(self) -> Iterable[str]:
    """
    Get any possible footer comments.

    The comments begin with one empty line, followed by one line per
    written column that explains the meaning of the column. The long name
    of the statistics is used in the first explanation only, all later
    ones use the short name.

    :returns: the footer comments
    """
    # Both names get a leading space (or are empty), so that they can be
    # inserted directly after a word without creating double spaces.
    long_name: str | None = self.__long_name
    long_name = "" if long_name is None else f" {long_name}"
    short_name: str | None = self.__short_name
    short_name = "" if short_name is None else f" {short_name}"
    name: str = long_name
    # `first` is True until the separating empty line has been emitted
    first: bool = True

    scope: Final[str | None] = self.scope
    if (scope is not None) and (
            (self.__key_n is not None) or (
            self.__key_all is not None)):
        if first:
            yield ""
            first = False
        yield (f"All{name} sample statistics start with "
               f"{(scope + SCOPE_SEPARATOR)!r}.")
        name = short_name

    if self.__key_n is not None:
        if first:
            yield ""
            first = False
        yield f"{self.__key_n}: the number of{name} samples"
        name = short_name

    if self.__key_all is None:
        if first:
            yield ""
        # the explanations refer to `n` even if its column is not written
        n_name: str | None = self.__key_n
        if n_name is None:
            n_name = KEY_N
        yield f"{self.__key_min}: the smallest encountered{name} value"
        name = short_name
        yield (f"{self.__key_mean_arith}: the arithmetic mean of all the"
               f"{name} values, i.e., the sum of the values divided by "
               f"their number {n_name}")
        if self.__key_med is not None:
            yield (f"{self.__key_med}: the median of all the{name} "
                   "values, which can be computed by sorting the values "
                   "and then picking the value in the middle of the "
                   f"sorted list (in case of an odd number {n_name} of "
                   "values) or the arithmetic mean (half the sum) of the "
                   "two values in the middle (in case of an even number "
                   f"{n_name})")
        if self.__key_mean_geom is not None:
            # `name` already carries its own leading space
            yield (f"{self.__key_mean_geom}: the geometric mean of all the"
                   f"{name} values, i.e., the {n_name}-th root of the "
                   f"product of all values, which is only defined if all "
                   f"values are > 0")
        yield f"{self.__key_max}: the largest encountered{name} value"
        # the standard deviation is the square root of the sample variance
        yield (f"{self.__key_sd}: the standard deviation of the{name} "
               "values, which is a measure of spread: the larger it "
               "is, the farther are the values distributed away from "
               f"the arithmetic mean {self.__key_mean_arith}. It can be "
               "computed as the square root of ((sum of squares) - "
               f"(square of the sum) / {n_name}) / ({n_name} - 1) of "
               f"all{name} values.")
    else:
        if first:
            yield ""
        yield f"{self.__key_all}: all{name} samples have this value"

1768 

def get_footer_bottom_comments(self) -> Iterable[str] | None:
    """
    Get the bottom footer comments.

    These are the comments generated by
    `pycommons_footer_bottom_comments`, extended by a note naming the
    statistics type that this writer works with.

    :returns: an iterator with the bottom comments

    >>> for p in CsvWriter([StreamStatistics(
    ...         1, 1, 1, 1, None)]).get_footer_bottom_comments():
    ...     print(p[:30])
    This CSV output has been creat
    Statistics were computed using
    You can find pycommons at http
    """
    mode: Final[str] = type_name(self.__cls)
    note: Final[str] = (
        f"Statistics were computed using pycommons.math in mode {mode}.")
    yield from pycommons_footer_bottom_comments(self, note)