Coverage for pycommons / math / stream_statistics.py: 99%
401 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""
2An immutable record for statistics computed over a stream of data.
4Stream statistics, represented by class
5:class:`~pycommons.math.stream_statistics.StreamStatistics` are statistics
6that are computed over a stream of data. During the computation, only a
7minimal amount of data is actually kept in memory, such as a running sum,
8the overall minimum and maximum, etc.
9This makes these statistics suitable
11- if the amount of data is large and
12- the required accuracy is not very high and/or
13- the available computational budget or memory are limited.
15In this case, the
16:class:`~pycommons.math.stream_statistics.StreamStatistics` routines are
17very suitable.
18You could, e.g., use the method
19:meth:`~pycommons.math.stream_statistics.StreamStatistics.aggregate` to
20obtain an aggregation object. This object allows you to iteratively append
21data to the current statistics computation via its `add` method and to obtain
22the current (or final) statistics result via the `result` method.
24Such a result is an instance of the class
25:class:`~pycommons.math.stream_statistics.StreamStatistics`.
26It stores the
27:attr:`~pycommons.math.stream_statistics.StreamStatistics.minimum` and the
28:attr:`~pycommons.math.stream_statistics.StreamStatistics.maximum` of the
29data, as well as the number
30:attr:`~pycommons.math.stream_statistics.StreamStatistics.n` of observed data
31samples.
32It also offers the approximations of the arithmetic mean as attribute
33:attr:`~pycommons.math.stream_statistics.StreamStatistics.mean_arith` and
34the approximation of the standard deviation as attribute
35:attr:`~pycommons.math.stream_statistics.StreamStatistics.stddev`.
37There is an absolute order defined upon these records.
38They are hashable and immutable.
39We provide methods to store them to CSV format via the class
40:class:`~pycommons.math.stream_statistics.CsvWriter`
41and to load them from CSV data via the class
42:class:`~pycommons.math.stream_statistics.CsvReader`.
43Functions that access attributes can be obtained via
44:meth:`~pycommons.math.stream_statistics.StreamStatistics.getter`.
46If you require high-accuracy statistics or values such as the median, you
47should use
48:class:`~pycommons.math.sample_statistics.SampleStatistics` instead.
50>>> ag = StreamStatistics.aggregate()
51>>> ag.update((1, 2, 3))
52>>> ag.add(4)
53>>> ag.add(5)
54>>> r1 = ag.result()
55>>> repr(r1)
56'StreamStatistics(n=5, minimum=1, mean_arith=3, maximum=5, \
57stddev=1.5811388300841898)'
58>>> str(r1)
59'5;1;3;5;1.5811388300841898'
61>>> r2 = StreamStatistics.from_samples((1, 2, 3, 4, 5))
62>>> r1 == r2
63True
65>>> ag.reset()
66>>> try:
67... ag.result()
68... except ValueError as ve:
69... print(ve)
70n=0 is invalid, must be in 1..9223372036854775808.
72>>> print(ag.result_or_none())
73None
74"""
76from dataclasses import dataclass
77from math import inf, isfinite, sqrt
78from typing import Callable, Final, Iterable, TypeVar, Union, cast
80from pycommons.io.csv import (
81 CSV_SEPARATOR,
82 SCOPE_SEPARATOR,
83 csv_column,
84 csv_column_or_none,
85 csv_scope,
86 csv_val_or_none,
87 pycommons_footer_bottom_comments,
88)
89from pycommons.io.csv import CsvReader as CsvReaderBase
90from pycommons.io.csv import CsvWriter as CsvWriterBase
91from pycommons.math.int_math import (
92 try_float_int_div,
93 try_int,
94)
95from pycommons.math.streams import StreamAggregate
96from pycommons.strings.string_conv import (
97 num_or_none_to_str,
98 num_to_str,
99 str_to_num,
100)
101from pycommons.types import check_int_range, type_error, type_name
#: The key (dimension name) of the minimum value.
KEY_MINIMUM: Final[str] = "min"
#: The key (dimension name) of the median value.
KEY_MEDIAN: Final[str] = "med"
#: The key (dimension name) of the arithmetic mean value.
KEY_MEAN_ARITH: Final[str] = "mean"
#: The key (dimension name) of the geometric mean value.
KEY_MEAN_GEOM: Final[str] = "geom"
#: The key (dimension name) of the maximum value.
KEY_MAXIMUM: Final[str] = "max"
#: The key (dimension name) of the standard deviation value.
KEY_STDDEV: Final[str] = "sd"
#: The key (dimension name) of the number `n` of samples.
KEY_N: Final[str] = "n"
#: The key for a single value.
KEY_VALUE: Final[str] = "value"

#: The type variable for data to be written to CSV or to be read from CSV.
#: It is bound to :class:`StreamStatistics`.
T = TypeVar("T", bound="StreamStatistics")
class StreamStatisticsAggregate[T](StreamAggregate):
    """
    An Aggregate producing stream statistics.

    Both :meth:`result` and :meth:`result_or_none` are placeholders in this
    class: they raise `NotImplementedError` and are meant to be provided by
    concrete implementations.
    """

    def result(self) -> T:
        """
        Get the stream statistics result.

        The result is guaranteed to be a valid instance of
        :class:`~pycommons.math.stream_statistics.StreamStatistics`.
        It has :attr:`~pycommons.math.stream_statistics.StreamStatistics.n`
        greater than zero.

        If no data was collected, a `ValueError` is raised.
        If you want to get `None` if no data was collected, use
        :meth:`~StreamStatisticsAggregate.result_or_none` instead.

        :return: the result
        :raises ValueError: if no data was collected
        :raises NotImplementedError: always, in this base class

        >>> try:
        ...     StreamStatisticsAggregate().result()
        ... except NotImplementedError:
        ...     print("Not implemented!")
        Not implemented!
        """
        raise NotImplementedError

    def result_or_none(self) -> T | None:
        """
        Get the result if any data was collected, otherwise `None`.

        This method returns the same result as
        :meth:`~StreamStatisticsAggregate.result`, with the exception of the
        case where no data was collected at all. In this case,
        :meth:`~StreamStatisticsAggregate.result` will raise a `ValueError`,
        whereas this method here just returns `None`.

        :return: the result, or `None` if no data was collected.
        :raises NotImplementedError: always, in this base class

        >>> try:
        ...     StreamStatisticsAggregate().result_or_none()
        ... except NotImplementedError:
        ...     print("Not implemented!")
        Not implemented!
        """
        raise NotImplementedError
172@dataclass(frozen=True, init=False, order=False, eq=False)
173class StreamStatistics:
174 """
175 An immutable record with stream statistics of one quantity.
177 Stream statistics are statistics records that can be computed over a
178 stream of data. They do not require us to have the complete data sample
179 in memory at any point in time.
181 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
182 >>> s1.n
183 2
184 >>> s1.minimum
185 1
186 >>> s1.mean_arith
187 4
188 >>> s1.maximum
189 6
190 >>> s1.stddev
191 0.2
192 >>> hash(s1)
193 -997568919428664316
195 >>> s2 = StreamStatistics(1, 0, 0, 0.0, None)
196 >>> s2.n
197 1
198 >>> s2.minimum
199 0
200 >>> s2.mean_arith
201 0
202 >>> s2.maximum
203 0
204 >>> print(s2.stddev)
205 None
206 >>> hash(s2) == hash((0, 0, 0, inf, 0, inf, 1, 0))
207 True
209 >>> s3 = StreamStatistics(n=3, minimum=5, maximum=5,
210 ... mean_arith=5, stddev=0.0)
211 >>> s3.stddev
212 0
213 >>> hash(s3)
214 -5331876985145994286
216 >>> sset = {s1, s1, s2, s1, s3, s3, s2, s1}
217 >>> len(sset)
218 3
219 >>> print(list(sss.n for sss in sorted(sset)))
220 [1, 2, 3]
221 >>> print(list(sss.minimum for sss in sorted(sset)))
222 [0, 1, 5]
224 >>> try:
225 ... StreamStatistics(n=1, minimum=5, maximum=6,
226 ... mean_arith=5, stddev=None)
227 ... except ValueError as ve:
228 ... print(ve)
229 maximum (6) must equal minimum (5) if n=1.
231 >>> try:
232 ... StreamStatistics(n=1, minimum=5, maximum=5,
233 ... mean_arith=4, stddev=None)
234 ... except ValueError as ve:
235 ... print(ve)
236 mean_arith (4) must equal minimum (5) if n=1.
238 >>> try:
239 ... StreamStatistics(n=2, minimum=5, maximum=6,
240 ... mean_arith=4, stddev=None)
241 ... except ValueError as ve:
242 ... print(ve)
243 minimum<=mean_arith<=maximum must hold, but got 5, 4, and 6.
245 >>> try:
246 ... StreamStatistics(n=3, minimum=5, maximum=7,
247 ... mean_arith=6, stddev=-1)
248 ... except ValueError as ve:
249 ... print(ve)
250 stddev must be >= 0, but is -1.
252 >>> try:
253 ... StreamStatistics(n=3, minimum=5, maximum=7,
254 ... mean_arith=6, stddev=0)
255 ... except ValueError as ve:
256 ... print(str(ve)[:59])
257 If stddev (0) is 0, then minimum (5) must equal maximum (7)
259 >>> try:
260 ... StreamStatistics(n=3, minimum=5, maximum=5,
261 ... mean_arith=5, stddev=1)
262 ... except ValueError as ve:
263 ... print(str(ve)[:59])
264 If stddev (1) is 0, then minimum (5) must equal maximum (5)
266 >>> try:
267 ... StreamStatistics(n=3, minimum=5, maximum=5,
268 ... mean_arith=5, stddev=None)
269 ... except ValueError as ve:
270 ... print(ve)
271 If n=1, stddev=None and vice versa, but got n=3 and stddev=None.
273 >>> try:
274 ... StreamStatistics(n=1, minimum=5, maximum=5,
275 ... mean_arith=5, stddev=1)
276 ... except ValueError as ve:
277 ... print(ve)
278 If n=1, stddev=None and vice versa, but got n=1 and stddev=1.
279 """
281 #: The number of data samples over which the statistics were computed.
282 n: int
283 #: The minimum, i.e., the value of the smallest among the
284 #: :attr:`~StreamStatistics.n` data samples.
285 minimum: int | float
286 #: The arithmetic mean value, i.e., the sum of the
287 #: :attr:`~StreamStatistics.n` data samples divided by
288 #: :attr:`~StreamStatistics.n`.
289 mean_arith: int | float
290 #: The maximum, i.e., the value of the largest among the
291 #: :attr:`~StreamStatistics.n` data samples.
292 maximum: int | float
293 #: The standard deviation, if defined. This value will be `None` if there
294 #: was only a single sample.
295 stddev: int | float | None
    def __init__(self, n: int, minimum: int | float, mean_arith: int | float,
                 maximum: int | float, stddev: int | float | None):
        """
        Create a stream statistics record.

        All values are validated against each other before they are stored.
        The numerical values are passed through `try_int` first; as the
        class-level examples show, integer-valued floats such as `4.0` end up
        being stored as `4`.

        :param n: the sample size, must be in `1..9223372036854775808`
        :param minimum: the minimum
        :param mean_arith: the arithmetic mean, must satisfy
            `minimum <= mean_arith <= maximum` and must equal `minimum` if
            `n == 1`
        :param maximum: the maximum, must equal `minimum` if `n == 1`
        :param stddev: the standard deviation, must be `None` if and only if
            `n == 1`; otherwise it must be `>= 0` and it must be `0` if and
            only if `minimum == maximum`
        :raises ValueError: if any of the above constraints is violated
        """
        n = check_int_range(n, "n", 1, 9_223_372_036_854_775_808)

        # check minimum
        minimum = try_int(minimum)
        # check maximum
        maximum = try_int(maximum)
        if (n == 1) and (maximum != minimum):
            raise ValueError(f"maximum ({maximum}) must equal "
                             f"minimum ({minimum}) if n=1.")
        # check arithmetic mean
        mean_arith = try_int(mean_arith)
        if n == 1:
            if mean_arith != minimum:
                raise ValueError(f"mean_arith ({mean_arith}) must equal "
                                 f"minimum ({minimum}) if n=1.")
        elif not minimum <= mean_arith <= maximum:
            raise ValueError("minimum<=mean_arith<=maximum must hold, but "
                             f"got {minimum}, {mean_arith}, and {maximum}.")

        # check the standard deviation: sign first, then consistency with
        # the range of the data (zero spread <=> minimum == maximum)
        if stddev is not None:
            stddev = try_int(stddev)
            if stddev < 0:
                raise ValueError(f"stddev must be >= 0, but is {stddev}.")
            if (n > 1) and ((minimum == maximum) ^ (stddev == 0)):
                raise ValueError(
                    f"If stddev ({stddev}) is 0, then minimum ({minimum}) "
                    f"must equal maximum ({maximum}) and vice versa.")
        # stddev is undefined (None) exactly for a single sample
        if (stddev is None) ^ (n == 1):
            raise ValueError("If n=1, stddev=None and vice versa, but "
                             f"got n={n} and stddev={stddev}.")

        # the dataclass is frozen, so we must bypass its __setattr__
        object.__setattr__(self, "n", n)
        object.__setattr__(self, "minimum", minimum)
        object.__setattr__(self, "maximum", maximum)
        object.__setattr__(self, "mean_arith", mean_arith)
        object.__setattr__(self, "stddev", stddev)
347 def __str__(self) -> str:
348 """
349 Get a string representation of this object.
351 :returns: the string
353 >>> print(StreamStatistics(1, 0, 0, 0.0, None))
354 1;0;0;0;None
356 >>> print(StreamStatistics(10, 1, 1.5, 2, 1.2))
357 10;1;1.5;2;1.2
358 """
359 return CSV_SEPARATOR.join(map(str, (
360 self.n, self.minimum, self.mean_arith, self.maximum,
361 self.stddev)))
363 def min_mean(self) -> int | float:
364 """
365 Obtain the smallest of the mean values.
367 :returns: :attr:`~StreamStatistics.mean_arith`
369 >>> StreamStatistics(1, 0, 0, 0.0, None).min_mean()
370 0
371 >>> StreamStatistics(2, 1, 2, 4.0, 0.2).min_mean()
372 2
373 """
374 return self.mean_arith
376 def max_mean(self) -> int | float:
377 """
378 Obtain the largest of the mean values.
380 :returns: :attr:`~StreamStatistics.mean_arith`
383 >>> StreamStatistics(1, 0, 0, 0.0, None).max_mean()
384 0
385 >>> StreamStatistics(2, 1, 2, 4.0, 0.2).max_mean()
386 2
387 """
388 return self.mean_arith
390 def compact(self, needs_n: bool = True) \
391 -> "int | float | StreamStatistics":
392 """
393 Try to represent this object as single number, if possible.
395 :param needs_n: if this is `True`, the default, then the object is
396 only turned into a single number if alsp `n==1`. Otherwise, `n`
397 is ignored
398 :returns: an integer or float if this objects minimum equals its
399 maximum, the object itself otherwise
401 >>> s = StreamStatistics.from_single_value(10, 1)
402 >>> s.compact() == 10
403 True
404 >>> s.compact() == s.compact(True)
405 True
407 >>> s = StreamStatistics.from_single_value(10, 2)
408 >>> s.compact() is s
409 True
410 >>> s.compact() == s.compact(True)
411 True
413 >>> s = StreamStatistics.from_single_value(10, 2)
414 >>> s.compact(False) == 10
415 True
417 >>> s = StreamStatistics(2, 1, 3, 5, 3)
418 >>> s.compact() is s
419 True
421 >>> s = StreamStatistics(2, 1, 3, 5, 3)
422 >>> s.compact(False) is s
423 True
425 >>> try:
426 ... s.compact(1)
427 ... except TypeError as te:
428 ... print(te)
429 needs_n should be an instance of bool but is int, namely 1.
431 >>> try:
432 ... s.compact(None)
433 ... except TypeError as te:
434 ... print(te)
435 needs_n should be an instance of bool but is None.
436 """
437 if not isinstance(needs_n, bool):
438 raise type_error(needs_n, "needs_n", bool)
439 mi: Final[int | float] = self.minimum
440 return self if (mi < self.maximum) or (
441 needs_n and (self.n > 1)) else mi
443 def _key(self) -> tuple[int | float, int | float, int | float,
444 int | float, int | float, int | float, int, int]:
445 r"""
446 Get a comparison and hash key.
448 This key is composed of the values for
449 :attr:`~StreamStatistics.minimum`, `inf` (for the geometric mean),
450 :attr:`~StreamStatistics.mean_arith`, `inf` (for the median),
451 :attr:`~StreamStatistics.maximum`, :attr:`~StreamStatistics.stddev`,
452 and :attr:`~StreamStatistics.n`. Any statistics value that is
453 undefined will be turned to `inf`. The last value is a unique
454 identifier of the object type. This is to prevent objects of type
455 `StreamStatistics` and `SampleStatistics` to clash. Therefore, the
456 former gets `0` as identifier, the latter gets `1`.
458 :returns: the comparison key
460 >>> StreamStatistics(2, 1, 4.0, 6, 0.2)._key()
461 (1, inf, 4, inf, 6, 0.2, 2, 0)
463 >>> StreamStatistics(1, 0, 0, 0, None)._key()
464 (0, 0, 0, inf, 0, inf, 1, 0)
466 >>> StreamStatistics(2, 1, 1, 1, 0)._key()
467 (1, 1, 1, 1, 1, 0, 2, 0)
469 >>> StreamStatistics(2, 0, 0, 0, 0)._key()
470 (0, 0, 0, inf, 0, 0, 2, 0)
471 """
472 mi: Final[int | float] = self.minimum
473 ma: Final[int | float] = self.maximum
474 return (mi, inf if ma > mi else mi, self.mean_arith,
475 mi if 0 < ma <= mi else inf, ma,
476 inf if self.stddev is None else self.stddev, self.n, 0)
478 def __lt__(self, other) -> bool:
479 """
480 Check if this statistics record is less than another one.
482 :param other: the other sample statistics
483 :returns: `True` if this object is less, `False` otherwise
485 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
486 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
487 >>> s1 < s2
488 False
490 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
491 >>> s3 < s1
492 True
493 >>> s1 < s3
494 False
496 >>> try:
497 ... s3 < 23
498 ... except TypeError as te:
499 ... print(str(te)[:60])
500 '<' not supported between instances of 'StreamStatistics' an
501 """
502 return self._key() < other._key()\
503 if isinstance(other, StreamStatistics) else NotImplemented
505 def __le__(self, other) -> bool:
506 """
507 Check if this statistics record is less than or equal to another one.
509 :param other: the other sample statistics
510 :returns: `True` if this object is less or equal, `False` otherwise
512 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
513 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
514 >>> s1 <= s2
515 True
517 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
518 >>> s3 <= s1
519 True
520 >>> s1 <= s3
521 False
523 >>> try:
524 ... s3 <= 23
525 ... except TypeError as te:
526 ... print(str(te)[:60])
527 '<=' not supported between instances of 'StreamStatistics' a
528 """
529 return self._key() <= other._key() \
530 if isinstance(other, StreamStatistics) else NotImplemented
532 def __gt__(self, other) -> bool:
533 """
534 Check if this statistics record is greater than another one.
536 :param other: the other sample statistics
537 :returns: `True` if this object is greater, `False` otherwise
539 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
540 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
541 >>> s1 > s2
542 False
544 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
545 >>> s3 > s1
546 False
547 >>> s1 > s3
548 True
550 >>> try:
551 ... s3 > 23
552 ... except TypeError as te:
553 ... print(str(te)[:60])
554 '>' not supported between instances of 'StreamStatistics' an
555 """
556 return self._key() > other._key() \
557 if isinstance(other, StreamStatistics) else NotImplemented
559 def __ge__(self, other) -> bool:
560 """
561 Check if this object is greater than or equal to another one.
563 :param other: the other sample statistics
564 :returns: `True` if this object is greater or equal, `False` otherwise
566 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
567 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
568 >>> s1 >= s2
569 True
571 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
572 >>> s3 >= s1
573 False
574 >>> s1 >= s3
575 True
577 >>> try:
578 ... s3 >= 23
579 ... except TypeError as te:
580 ... print(str(te)[:60])
581 '>=' not supported between instances of 'StreamStatistics' a
582 """
583 return self._key() >= other._key() \
584 if isinstance(other, StreamStatistics) else NotImplemented
586 def __eq__(self, other) -> bool:
587 """
588 Check if this statistics record equals another object.
590 :param other: the other obect
591 :returns: `True` if this object is equal, `False` otherwise
593 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
594 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
595 >>> s1 == s2
596 True
598 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
599 >>> s3 == s1
600 False
602 >>> s3 == 23
603 False
604 """
605 return (isinstance(other, StreamStatistics)) and (
606 self._key() == other._key())
608 def __ne__(self, other) -> bool:
609 """
610 Check if this statistics record does not equal another object.
612 :param other: the other sample statistics
613 :returns: `True` if this object is not equal, `False` otherwise
615 >>> s1 = StreamStatistics(2, 1, 4.0, 6, 0.2)
616 >>> s2 = StreamStatistics(2, 1, 4.0, 6, 0.2)
617 >>> s1 != s2
618 False
620 >>> s3 = StreamStatistics(2, 0.5, 4.0, 6, 0.2)
621 >>> s3 != s1
622 True
624 >>> s3 != "x"
625 True
626 """
627 return (not isinstance(other, StreamStatistics)) or (
628 self._key() != other._key())
630 def __hash__(self) -> int:
631 """
632 Compute the hash code of this statistics record.
634 :returns: the hash code
636 >>> hash(StreamStatistics(2, 1, 4.0, 6, 0.2))
637 -997568919428664316
639 >>> hash(StreamStatistics(2, -1, 4.0, 6, 0.2))
640 -1901621203255131428
641 """
642 return hash(self._key())
644 def get_n(self) -> int:
645 """
646 Get the number :attr:`~StreamStatistics.n` of samples.
648 :returns: the number :attr:`~StreamStatistics.n` of samples.
649 :raises TypeError: if an object of the wrong type is passed in as self
651 >>> StreamStatistics(5, 3, 6, 7, 2).get_n()
652 5
654 >>> try:
655 ... StreamStatistics.get_n(None)
656 ... except TypeError as te:
657 ... print(str(te)[:20])
658 self should be an in
659 """
660 if not isinstance(self, StreamStatistics):
661 raise type_error(self, "self", StreamStatistics)
662 return self.n
664 def get_minimum(self) -> int | float:
665 """
666 Get the :attr:`~StreamStatistics.minimum` of all the samples.
668 :returns: the :attr:`~StreamStatistics.minimum` of all the samples
669 :raises TypeError: if an object of the wrong type is passed in as self
671 >>> StreamStatistics(5, 3, 4, 6, 2).get_minimum()
672 3
674 >>> try:
675 ... StreamStatistics.get_minimum(None)
676 ... except TypeError as te:
677 ... print(str(te)[:20])
678 self should be an in
679 """
680 if not isinstance(self, StreamStatistics):
681 raise type_error(self, "self", StreamStatistics)
682 return self.minimum
684 def get_maximum(self) -> int | float:
685 """
686 Get the :attr:`~StreamStatistics.maximum` of all the samples.
688 :returns: the :attr:`~StreamStatistics.maximum` of all the samples
689 :raises TypeError: if an object of the wrong type is passed in as self
691 >>> StreamStatistics(5, 3, 6, 7, 2).get_maximum()
692 7
694 >>> try:
695 ... StreamStatistics.get_maximum(None)
696 ... except TypeError as te:
697 ... print(str(te)[:20])
698 self should be an in
699 """
700 if not isinstance(self, StreamStatistics):
701 raise type_error(self, "self", StreamStatistics)
702 return self.maximum
704 def get_mean_arith(self) -> int | float:
705 """
706 Get the arithmetic mean (:attr:`~StreamStatistics.mean_arith`).
708 :returns: the arithmetic mean (:attr:`~StreamStatistics.mean_arith`)
709 of all the samples.
710 :raises TypeError: if an object of the wrong type is passed in as self
712 >>> StreamStatistics(5, 3, 6, 7, 2).get_mean_arith()
713 6
715 >>> try:
716 ... StreamStatistics.get_mean_arith(None)
717 ... except TypeError as te:
718 ... print(str(te)[:20])
719 self should be an in
720 """
721 if not isinstance(self, StreamStatistics):
722 raise type_error(self, "self", StreamStatistics)
723 return self.mean_arith
725 def get_median(self) -> int | float | None:
726 """
727 Get the median of all the samples.
729 :returns: This object type does not store the media. However, if
730 the minimum is the same as the maximum, the median will have that
731 same value, too, so it is returned. Otherwise, this method returns
732 `None`. This method will be overridden.
733 :raises TypeError: if an object of the wrong type is passed in as self
735 >>> print(StreamStatistics(5, 3, 6, 7, 2).get_median())
736 None
738 >>> print(StreamStatistics(5, -3, -3.0, -3, 0).get_median())
739 -3
741 >>> try:
742 ... StreamStatistics.get_median(None)
743 ... except TypeError as te:
744 ... print(str(te)[:20])
745 self should be an in
746 """
747 if not isinstance(self, StreamStatistics):
748 raise type_error(self, "self", StreamStatistics)
749 return self.minimum if self.minimum >= self.maximum else None
751 def get_mean_geom(self) -> int | float | None:
752 """
753 Get the geometric mean of all the samples.
755 This class does not offer storing the geometric mean. This means
756 that this method will usually return `None`. The only situation
757 where it will not return `None` is if the geometric mean can be
758 inferred by definition, namely if the minimum and maximum value
759 are the same and positive. Subclasses will override this method to
760 return meaningful values.
762 :returns: the geometric mean of all the samples, `None` if the
763 geometric mean is not defined.
764 :raises TypeError: if an object of the wrong type is passed in as self
766 >>> print(StreamStatistics(5, 3, 6, 7, 2).get_mean_geom())
767 None
769 >>> print(StreamStatistics(5, 2, 2, 2, 0).get_mean_geom())
770 2
772 >>> try:
773 ... StreamStatistics.get_mean_geom(None)
774 ... except TypeError as te:
775 ... print(str(te)[:20])
776 self should be an in
777 """
778 if not isinstance(self, StreamStatistics):
779 raise type_error(self, "self", StreamStatistics)
780 mi: Final[int | float] = self.minimum
781 return mi if 0 < self.maximum <= mi else None
783 def get_stddev(self) -> int | float | None:
784 """
785 Get the standard deviation mean (:attr:`~StreamStatistics.stddev`).
787 :returns: the standard deviation (:attr:`~StreamStatistics.stddev`)
788 of all the samples, `None` if the standard deviation is not
789 defined, i.e., if there is only a single sample
790 :raises TypeError: if an object of the wrong type is passed in as self
792 >>> StreamStatistics(5, 3, 6, 7, 2).get_stddev()
793 2
795 >>> try:
796 ... StreamStatistics.get_stddev(None)
797 ... except TypeError as te:
798 ... print(str(te)[:20])
799 self should be an in
800 """
801 if not isinstance(self, StreamStatistics):
802 raise type_error(self, "self", StreamStatistics)
803 return self.stddev
    @classmethod
    def aggregate(cls) -> StreamStatisticsAggregate["StreamStatistics"]:
        """
        Get an aggregate suitable for this statistics type.

        The returned object collects data via its `add` and `update` methods
        and produces a :class:`StreamStatistics` record via `result`. It can
        be emptied and reused via `reset`. This implementation returns a new
        `_StreamStats` instance.

        :return: the aggregate

        >>> ag = StreamStatistics.aggregate()
        >>> ag.update((1, 2, 3, 4))
        >>> ag.result()
        StreamStatistics(n=4, minimum=1, mean_arith=2.5, maximum=4, \
stddev=1.2909944487358056)
        >>> ag.reset()
        >>> ag.add(4)
        >>> ag.add(5)
        >>> ag.add(6)
        >>> ag.add(7)
        >>> ag.result()
        StreamStatistics(n=4, minimum=4, mean_arith=5.5, maximum=7, \
stddev=1.2909944487358056)
        """
        return _StreamStats()
828 @classmethod
829 def from_samples(cls, source: Iterable[
830 int | float | None]) -> "StreamStatistics":
831 """
832 Create a statistics record from a stream of samples.
834 :return: the statistics record.
836 >>> StreamStatistics.from_samples((1, 2, 3, 4))
837 StreamStatistics(n=4, minimum=1, mean_arith=2.5, maximum=4, \
838stddev=1.2909944487358056)
839 """
840 agg: Final[StreamStatisticsAggregate] = cls.aggregate()
841 agg.update(source)
842 return agg.result()
844 @classmethod
845 def from_single_value(cls, value: Union[
846 int, float, "StreamStatistics"], n: int = 1) -> "StreamStatistics":
847 r"""
848 Create a sample statistics from a single number.
850 :param value: the single value
851 :param n: the number of samples, i.e., the number of times this value
852 occurred
853 :returns: the sample statistics
855 >>> print(str(StreamStatistics.from_single_value(23)))
856 1;23;23;23;None
858 >>> s = StreamStatistics.from_single_value(10, 2)
859 >>> print(s.stddev)
860 0
861 >>> s.minimum == s.maximum == s.mean_arith == 10
862 True
863 >>> s is StreamStatistics.from_single_value(s, s.n)
864 True
866 >>> s = StreamStatistics.from_single_value(10, 1)
867 >>> print(s.stddev)
868 None
869 >>> s.minimum == s.maximum == s.mean_arith == 10
870 True
871 >>> s is StreamStatistics.from_single_value(s, s.n)
872 True
874 >>> s = StreamStatistics.from_single_value(-10, 2)
875 >>> print(s.stddev)
876 0
877 >>> s.minimum == s.maximum == s.mean_arith == -10
878 True
879 >>> s is StreamStatistics.from_single_value(s, s.n)
880 True
882 >>> s = StreamStatistics.from_single_value(-10, 1)
883 >>> print(s.stddev)
884 None
885 >>> s.minimum == s.maximum == s.mean_arith == -10
886 True
887 >>> s is StreamStatistics.from_single_value(s, s.n)
888 True
890 >>> s = StreamStatistics.from_single_value(10.5, 2)
891 >>> print(s.stddev)
892 0
893 >>> s.minimum == s.maximum == s.mean_arith == 10.5
894 True
895 >>> s is StreamStatistics.from_single_value(s, s.n)
896 True
898 >>> s = StreamStatistics.from_single_value(10.5, 1)
899 >>> print(s.stddev)
900 None
901 >>> s.minimum == s.maximum == s.mean_arith == 10.5
902 True
903 >>> s is StreamStatistics.from_single_value(s, s.n)
904 True
906 >>> s = StreamStatistics.from_single_value(-10.5, 2)
907 >>> print(s.stddev)
908 0
909 >>> s.minimum == s.maximum == s.mean_arith == -10.5
910 True
911 >>> s is StreamStatistics.from_single_value(s, s.n)
912 True
914 >>> s = StreamStatistics.from_single_value(-10.5, 1)
915 >>> print(s.stddev)
916 None
917 >>> s.minimum == s.maximum == s.mean_arith == -10.5
918 True
919 >>> s is StreamStatistics.from_single_value(s, s.n)
920 True
922 >>> try:
923 ... StreamStatistics.from_single_value(None)
924 ... except TypeError as te:
925 ... print(str(te)[:20])
926 value should be an i
928 >>> try:
929 ... StreamStatistics.from_single_value("a")
930 ... except TypeError as te:
931 ... print(str(te)[:20])
932 value should be an i
934 >>> try:
935 ... StreamStatistics.from_single_value(1, None)
936 ... except TypeError as te:
937 ... print(str(te)[:20])
938 n should be an insta
940 >>> try:
941 ... StreamStatistics.from_single_value(1, "a")
942 ... except TypeError as te:
943 ... print(str(te)[:20])
944 n should be an insta
946 >>> try:
947 ... StreamStatistics.from_single_value(s, 12)
948 ... except ValueError as ve:
949 ... print(str(ve)[:20])
950 Incompatible numbers
952 >>> try:
953 ... StreamStatistics.from_single_value(inf)
954 ... except ValueError as ve:
955 ... print(str(ve)[:20])
956 value=inf is not fin
957 """
958 n = check_int_range(n, "n", 1, 1_000_000_000_000_000_000)
959 if isinstance(value, StreamStatistics):
960 if value.n == n:
961 return value
962 raise ValueError( # noqa: TRY004
963 f"Incompatible numbers of values {n} and {value}.")
964 if not isinstance(value, int | float):
965 raise type_error(value, "value", (int, float, StreamStatistics))
966 if not isfinite(value):
967 raise ValueError(f"value={value} is not finite.")
968 return StreamStatistics(
969 n=n, minimum=value, mean_arith=value, maximum=value,
970 stddev=None if n <= 1 else 0)
972 @classmethod
973 def getter(cls, dimension: str) -> Callable[[
974 "StreamStatistics"], int | float | None]:
975 """
976 Get a function returning the dimension from :class:`StreamStatistics`.
978 The returned getter function expects that it receives a valid
979 :class:`StreamStatistics` instance as parameter, or an instance of the
980 subclass you called :meth:`StreamStatistics.getter` on. If you pass in
981 `None`, then ths will raise a `TypeError`. If you are in a situation
982 where `None` is possible, use the function
983 :meth:`StreamStatistics.getter_or_none` instead, which will return
984 `None` in such a case.
986 :param dimension: the dimension
987 :returns: a :class:`Callable` that returns the value corresponding to
988 the dimension
989 :raises TypeError: if `dimension` is not a string
990 :raises ValueError: if `dimension` is unknown
992 >>> StreamStatistics.getter(KEY_N) is StreamStatistics.get_n
993 True
994 >>> (StreamStatistics.getter(KEY_MINIMUM) is
995 ... StreamStatistics.get_minimum)
996 True
997 >>> (StreamStatistics.getter(KEY_MEAN_ARITH) is
998 ... StreamStatistics.get_mean_arith)
999 True
1000 >>> (StreamStatistics.getter(KEY_MEAN_GEOM) is
1001 ... StreamStatistics.get_mean_geom)
1002 True
1003 >>> (StreamStatistics.getter(KEY_MAXIMUM) is
1004 ... StreamStatistics.get_maximum)
1005 True
1006 >>> (StreamStatistics.getter(KEY_MEDIAN) is
1007 ... StreamStatistics.get_median)
1008 True
1009 >>> (StreamStatistics.getter(KEY_STDDEV) is
1010 ... StreamStatistics.get_stddev)
1011 True
1013 >>> s = StreamStatistics(5, 3, 6, 7, 2)
1014 >>> StreamStatistics.getter(KEY_N)(s)
1015 5
1016 >>> StreamStatistics.getter(KEY_MINIMUM)(s)
1017 3
1018 >>> StreamStatistics.getter(KEY_MEAN_ARITH)(s)
1019 6
1020 >>> print(StreamStatistics.getter(KEY_MEAN_GEOM)(s))
1021 None
1022 >>> StreamStatistics.getter(KEY_MAXIMUM)(s)
1023 7
1024 >>> StreamStatistics.getter(KEY_STDDEV)(s)
1025 2
1026 >>> print(StreamStatistics.getter(KEY_MEDIAN)(s))
1027 None
1029 >>> try:
1030 ... StreamStatistics.getter(KEY_N)(None)
1031 ... except TypeError as te:
1032 ... print(str(te)[:20])
1033 self should be an in
1035 >>> try:
1036 ... StreamStatistics.getter(None)
1037 ... except TypeError as te:
1038 ... print(te)
1039 descriptor 'strip' for 'str' objects doesn't apply to a 'NoneType' \
1040object
1042 >>> try:
1043 ... StreamStatistics.getter(1)
1044 ... except TypeError as te:
1045 ... print(te)
1046 descriptor 'strip' for 'str' objects doesn't apply to a 'int' object
1048 >>> try:
1049 ... StreamStatistics.getter("hello")
1050 ... except ValueError as ve:
1051 ... print(str(ve)[-18:])
1052 dimension 'hello'.
1053 """
1054 tbl_name: Final[str] = "___cls_getters"
1055 if hasattr(cls, tbl_name):
1056 getters = cast("Callable", getattr(cls, tbl_name))
1057 else:
1058 getters = {
1059 KEY_N: cls.get_n, KEY_MINIMUM: cls.get_minimum,
1060 "minimum": cls.get_minimum,
1061 KEY_MEAN_ARITH: cls.get_mean_arith,
1062 "mean_arith": cls.get_mean_arith,
1063 "arithmetic mean": cls.get_mean_arith,
1064 "average": cls.get_mean_arith, KEY_MEDIAN: cls.get_median,
1065 "median": cls.get_median, KEY_MEAN_GEOM: cls.get_mean_geom,
1066 "mean_geom": cls.get_mean_geom,
1067 "geometric mean": cls.get_mean_geom,
1068 "gmean": cls.get_mean_geom, KEY_MAXIMUM: cls.get_maximum,
1069 "maximum": cls.get_maximum, KEY_STDDEV: cls.get_stddev,
1070 "stddev": cls.get_stddev,
1071 "standard deviation": cls.get_stddev}.get
1072 setattr(cls, tbl_name, getters)
1074 result: Callable[[StreamStatistics], int | float | None] | None \
1075 = getters(str.strip(dimension), None)
1076 if result is None:
1077 raise ValueError(f"Unknown {cls} dimension {dimension!r}.")
1078 return result
1080 @classmethod
1081 def getter_or_none(cls, dimension: str) -> Callable[[
1082 Union["StreamStatistics", None]], int | float | None]:
1083 """
1084 Obtain a getter that returns `None` if the statistics is `None`.
1086 With this method, you can get a function which returns a value from a
1087 statistics object if the object is not `None`. If `None` is provided,
1088 then the function also returns `None`.
1090 This is especially useful if you work with something like
1091 :meth:`~StreamStatisticsAggregate.result_or_none`.
1093 If your data should never be `None`, the better use
1094 :meth:`StreamStatistics.getter` instead, which returns getter
1095 functions that raise `TypeError`s if their input is `None`.
1097 :param dimension: the dimension
1098 :return: the getter
1100 >>> ss = StreamStatistics(10, 1, 2, 3, 4)
1101 >>> g = StreamStatistics.getter_or_none(KEY_MINIMUM)
1102 >>> g(ss)
1103 1
1104 >>> print(g(None))
1105 None
1106 >>> StreamStatistics.getter_or_none(KEY_MINIMUM) is g
1107 True
1109 >>> g = StreamStatistics.getter_or_none(KEY_MAXIMUM)
1110 >>> g(ss)
1111 3
1112 >>> print(g(None))
1113 None
1114 """
1115 tbl_name: Final[str] = "___cls_getters_or_none"
1116 if hasattr(cls, tbl_name):
1117 getters = cast("dict", getattr(cls, tbl_name))
1118 else:
1119 getters = {}
1120 setattr(cls, tbl_name, getters)
1122 dimension = str.strip(dimension)
1123 if dimension in getters:
1124 return getters[dimension]
1126 def __getter(x, _y=cls.getter(dimension)) -> int | float | None:
1127 return None if x is None else _y(x)
1129 getters[dimension] = __getter
1130 return cast("Callable", __getter)
class _StreamStats(StreamStatisticsAggregate[StreamStatistics]):
    """
    The internal aggregate that produces :class:`StreamStatistics` records.

    Mean and variance are maintained incrementally with Welford's
    algorithm, so only a handful of running values is kept in memory.

    1. Donald E. Knuth (1998). The Art of Computer Programming, volume 2:
       Seminumerical Algorithms, 3rd edn., p. 232. Boston: Addison-Wesley.
    2. B. P. Welford (1962). "Note on a method for calculating corrected sums
       of squares and products". Technometrics 4(3):419-420.

    >>> ss = _StreamStats()
    >>> data1 = [4, 7, 13, 16]
    >>> ss.update(data1)
    >>> ss.result()
    StreamStatistics(n=4, minimum=4, mean_arith=10, maximum=16, \
stddev=5.477225575051661)

    >>> data2 = [1e8 + z for z in data1]
    >>> ss.reset()
    >>> ss.update(data2)
    >>> ss.result()
    StreamStatistics(n=4, minimum=100000004, mean_arith=100000010, \
maximum=100000016, stddev=5.477225575051661)

    >>> data3 = [1e14 + z for z in data1]
    >>> ss.reset()
    >>> ss.update(data3)
    >>> ss.result()
    StreamStatistics(n=4, minimum=100000000000004, \
mean_arith=100000000000010, maximum=100000000000016, stddev=5.477225575051661)

    >>> data3 = [z for z in range(1001)]
    >>> ss.reset()
    >>> ss.update(data3)
    >>> ss.result()
    StreamStatistics(n=1001, minimum=0, mean_arith=500, maximum=1000, \
stddev=289.10811126635656)
    """

    def __init__(self) -> None:
        """Create an aggregate that has not yet seen any sample."""
        #: the number of samples seen
        self.__n: int = 0
        #: the last mean result
        self.__mean: int | float = 0
        #: the running sum for the variance
        self.__var: int | float = 0
        #: the minimum
        self.__min: int | float = inf
        #: the maximum
        self.__max: int | float = -inf

    def reset(self) -> None:
        """Forget all collected samples and return to the initial state."""
        self.__n = self.__mean = self.__var = 0
        self.__min, self.__max = inf, -inf

    def add(self, value: int | float) -> None:
        """
        Feed one more sample into the running statistics.

        :param value: the value
        """
        # `try_int` is a project helper; according to the original comment
        # it tries to keep integers, checks the type and rejects
        # non-finite values.
        value = try_int(value)

        count: Final[int] = self.__n + 1
        self.__n = count

        # Welford update: move the mean towards the new value and
        # accumulate the product of the deviations before and after.
        old_mean: Final[int | float] = self.__mean
        diff: Final[int | float] = value - old_mean
        new_mean: Final[int | float] = old_mean + (diff / count)
        self.__mean = new_mean
        self.__var += diff * (value - new_mean)

        if value < self.__min:
            self.__min = value
        if value > self.__max:
            self.__max = value

    def result(self) -> StreamStatistics:
        """
        Build the statistics record for the data seen so far.

        The mean is clamped into the range `[minimum, maximum]`. The standard
        deviation is `None` for at most one sample, `0` if minimum and
        maximum coincide, and otherwise the square root of the accumulated
        variance sum divided by `n - 1`.

        :return: the :class:`StreamStatistics` record
        """
        count: Final[int] = self.__n
        lowest: Final[int | float] = self.__min
        highest: Final[int | float] = self.__max
        mean: Final[int | float] = max(lowest, min(highest, self.__mean))

        spread: int | float | None
        if count <= 1:
            spread = None
        elif highest <= lowest:
            spread = 0
        else:
            spread = sqrt(try_float_int_div(self.__var, count - 1))
        return StreamStatistics(count, lowest, mean, highest, spread)

    def result_or_none(self) -> StreamStatistics | None:
        """
        Get the result if any data was collected, otherwise `None`.

        :return: The return value of :meth:`result` if any data was collected,
            otherwise `None`
        """
        if self.__n > 0:
            return self.result()
        return None
class CsvReader(CsvReaderBase[StreamStatistics]):
    """
    A csv parser that turns rows into :class:`StreamStatistics` records.

    >>> from pycommons.io.csv import csv_read
    >>> csv = ["n;min;mean;max;sd",
    ...        "3;2;3;10;5", "6;2;;;0", "1;;2", "3;;;0;",
    ...        "4;5;12;33;7"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    3;2;3;10;5
    6;2;2;2;0
    1;2;2;2;None
    3;0;0;0;0
    4;5;12;33;7

    >>> csv = ["value", "1", "3", "0", "-5", "7"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;1;1;1;None
    1;3;3;3;None
    1;0;0;0;None
    1;-5;-5;-5;None
    1;7;7;7;None

    >>> csv = ["n;m;sd", "1;3;", "3;5;0"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;3;3;3;None
    3;5;5;5;0

    >>> csv = ["n;m", "1;3", "3;5"]
    >>> for p in csv_read(csv, CsvReader, CsvReader.parse_row):
    ...     print(p)
    1;3;3;3;None
    3;5;5;5;0
    """

    def __init__(self, columns: dict[str, int]) -> None:
        """
        Create a CSV parser for :class:`StreamStatistics`.

        The columns for `n`, minimum, arithmetic mean, maximum, and standard
        deviation are looked up (in this order). If none of minimum, mean,
        and maximum is present but exactly one column remains, that column is
        used as the single value column.

        :param columns: the columns
        :raises ValueError: if no usable column is found or if more than one
            unrecognized column remains

        >>> try:
        ...     CsvReader(None)
        ... except TypeError as te:
        ...     print(te)
        columns should be an instance of dict but is None.

        >>> try:
        ...     CsvReader(1)
        ... except TypeError as te:
        ...     print(te)
        columns should be an instance of dict but is int, namely 1.

        >>> try:
        ...     CsvReader(dict())
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {}.

        >>> try:
        ...     CsvReader({"a": 1, "b": 2})
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {'a': 1, 'b': 2}.

        >>> try:
        ...     CsvReader({KEY_N: 1, "b": 2, "c": 3})
        ... except ValueError as ve:
        ...     print(ve)
        No useful keys remain in {'b': 2, 'c': 3}.

        >>> try:
        ...     CsvReader({KEY_MINIMUM: 1, "b": 2, "c": 3})
        ... except ValueError as ve:
        ...     print(ve)
        Found strange keys in {'b': 2, 'c': 3}.
        """
        super().__init__(columns)

        # The lookups below are done in a fixed order; judging from the
        # doctests above, the helper removes the keys it recognizes from
        # `columns`, so the remaining keys are those nobody claimed.
        #: the index of the number of elements
        self.idx_n: Final[int | None] = csv_column_or_none(
            columns, KEY_N)
        #: the index of the minimum
        self.__idx_min: int | None = csv_column_or_none(
            columns, KEY_MINIMUM)
        #: the index for the arithmetic mean
        self.__idx_mean_arith: int | None = csv_column_or_none(
            columns, KEY_MEAN_ARITH)
        #: the index for the maximum
        self.__idx_max: int | None = csv_column_or_none(
            columns, KEY_MAXIMUM)
        #: the index for the standard deviation
        self.__idx_sd: Final[int | None] = csv_column_or_none(
            columns, KEY_STDDEV)

        # Collect the value columns that are present; the last one found
        # serves as the single value column if only one exists.
        present: Final[list[int]] = [
            idx for idx in (self.__idx_min, self.__idx_mean_arith,
                            self.__idx_max) if idx is not None]
        has: int = list.__len__(present)
        has_idx: int = present[-1] if present else -1

        if has <= 0:
            if dict.__len__(columns) != 1:
                raise ValueError(f"No useful keys remain in {columns!r}.")
            # exactly one unknown column is left: take it as the value
            self.__idx_min = has_idx = csv_column(
                columns, next(iter(columns.keys())), True)
            has = 1
        if dict.__len__(columns) > 1:
            raise ValueError(f"Found strange keys in {columns!r}.")

        #: is this a parser for single number statistics?
        self.__is_single: Final[bool] = (self.__idx_sd is None) and (has == 1)

        if self.__is_single:
            self.__idx_min = self.__idx_max = self.__idx_mean_arith = has_idx

    def parse_row(self, data: list[str]) -> StreamStatistics:
        """
        Convert one row of CSV cells into a statistics record.

        Missing values are filled in from the available ones: a missing
        minimum falls back to the mean and then the maximum, a missing mean
        or maximum falls back to the minimum, and a missing standard
        deviation becomes `0` for `n > 1` and `None` otherwise.

        :param data: the data row
        :returns: the sample statistics
        :raises ValueError: if none of minimum, mean, and maximum is defined

        >>> cc = CsvReader({KEY_MINIMUM: 0, KEY_MEAN_ARITH: 1, KEY_MAXIMUM: 2,
        ...                 KEY_STDDEV: 3, KEY_N: 4})
        >>> try:
        ...     cc.parse_row([None, None, None, None, "5"])
        ... except ValueError as ve:
        ...     print(str(ve)[:20])
        No value defined for
        """
        n: Final[int] = 1 if self.idx_n is None else int(data[self.idx_n])
        mi: int | float | None = csv_val_or_none(
            data, self.__idx_min, str_to_num)

        if self.__is_single:  # one value column describes everything
            single_sd: Final[int | None] = None if n <= 1 else 0
            return StreamStatistics(
                n=n, minimum=mi, mean_arith=mi, maximum=mi, stddev=single_sd)

        ar: int | float | None = csv_val_or_none(
            data, self.__idx_mean_arith, str_to_num)
        ma: int | float | None = csv_val_or_none(
            data, self.__idx_max, str_to_num)
        sd: int | float | None = csv_val_or_none(
            data, self.__idx_sd, str_to_num)

        if mi is None:
            fallback: Final[int | float | None] = \
                ar if ar is not None else ma
            if fallback is None:
                raise ValueError(
                    f"No value defined for min@{self.__idx_min}={mi}, mean@"
                    f"{self.__idx_mean_arith}={ar}, max@"
                    f"{self.__idx_max}={ma} defined in {data!r}.")
            mi = fallback

        if ar is None:
            ar = mi
        if ma is None:
            ma = mi
        if sd is None:
            sd = 0 if (n > 1) else None
        return StreamStatistics(
            n=n, minimum=mi, mean_arith=ar, maximum=ma, stddev=sd)

    def parse_optional_row(self, data: list[str] | None) \
            -> StreamStatistics | None:
        """
        Parse a row of data that may be empty.

        The row counts as non-empty as soon as one of the minimum, mean, or
        maximum cells holds a non-empty string.

        :param data: the row of data that may be empty
        :returns: the sample statistic, if the row contains data, else `None`

        >>> print(CsvReader.parse_optional_row(None, ["1"]))
        None
        >>> print(CsvReader.parse_optional_row(CsvReader({"v": 0}), ["1"]))
        1;1;1;1;None
        >>> print(CsvReader.parse_optional_row(CsvReader({"v": 0}), [""]))
        None
        """
        if (self is None) or (data is None):
            return None  # trick to make this method usable pseudo-static
        for idx in (self.__idx_min, self.__idx_mean_arith, self.__idx_max):
            if (idx is not None) and (str.__len__(data[idx]) > 0):
                return self.parse_row(data)
        return None
class CsvWriter(CsvWriterBase[T]):
    """A class for CSV writing of :class:`StreamStatistics`."""

    def __init__(self,
                 data: Iterable[T],
                 scope: str | None = None,
                 n_not_needed: bool = False,
                 what_short: str | None = None,
                 what_long: str | None = None,
                 clazz: type[T] = cast("type[T]", StreamStatistics)) -> None:
        """
        Initialize the csv writer.

        The data is scanned once to decide which columns are needed: if
        minimum and maximum are the same in all records, a single value
        column is written; median and geometric mean columns are only
        written if at least one record provides them.

        :param data: the data to use
        :param scope: the prefix to be pre-pended to all columns
        :param n_not_needed: should we omit the `n` column?
        :param what_short: the short description of what the statistics is
            about
        :param what_long: the long description of what the statistics is about
        :param clazz: the stream statistics type
        :raises TypeError: if `clazz` is not a subclass of
            :class:`StreamStatistics`, if `n_not_needed` is not a `bool`, or
            if a scanned data element is not an instance of `clazz`
        :raises ValueError: if `data` is empty

        >>> try:
        ...     CsvWriter([], None, n_not_needed=None)
        ... except TypeError as te:
        ...     print(te)
        n_not_needed should be an instance of bool but is None.

        >>> try:
        ...     CsvWriter([], clazz=str)
        ... except TypeError as te:
        ...     print(str(te)[:20])
        clazz should be an i

        >>> try:
        ...     CsvWriter([])
        ... except ValueError as ve:
        ...     s = str(ve)
        ...     print(s[s.index(' ') + 1:])
        CsvWriter did not see any data.

        >>> try:
        ...     CsvWriter([1])
        ... except TypeError as te:
        ...     print(str(te)[:29])
        data[0] should be an instance
        """
        super().__init__(data, scope)

        if not issubclass(clazz, StreamStatistics):
            raise type_error(clazz, "clazz", type[StreamStatistics])
        #: the internal type
        self.__cls: Final[type[StreamStatistics]] = clazz

        if not isinstance(n_not_needed, bool):
            raise type_error(n_not_needed, "n_not_needed", bool)
        # We need to check at most four conditions to see whether we can
        # compact the output:
        # 1. If all minimum, mean, median, maximum (and geometric mean, if
        # defined) are the same, then we can collapse this column.
        all_same: bool = True
        # 2. If no geometric mean is found, then we can also omit this column.
        has_no_geom: bool = True
        # 3. If no median is found, then we can also omit this column.
        has_no_median: bool = True
        # 4. If the `n` column is not needed or if all `n=1`, then we can omit
        # it. n_really_not_needed will become False if we find one situation
        # where we actually need n.
        # NOTE(review): as written, this flag can only be True if the caller
        # passed n_not_needed=True, and the final decision below then is True
        # regardless of the scan. The scan over `d.n` therefore cannot change
        # whether the `n` column is written - confirm whether that is
        # intended.
        n_really_not_needed: bool = n_not_needed
        # So if n_really_not_needed is True, we need to do 4 checks.
        # Otherwise, we only need 3 checks.
        checks_needed: int = 4 if n_really_not_needed else 3
        # the number of samples seen
        seen: int = 0

        # Iterate over the data; stop as soon as every check has failed,
        # because then no further compaction is possible.
        for i, d in enumerate(data):
            if not isinstance(d, clazz):
                raise type_error(d, f"data[{i}]", clazz)
            seen += 1
            if n_really_not_needed and (d.n != 1):
                n_really_not_needed = False
                checks_needed -= 1
                if checks_needed <= 0:
                    break
            if all_same and (d.minimum < d.maximum):
                all_same = False
                checks_needed -= 1
                if checks_needed <= 0:
                    break
            if has_no_geom and (d.get_mean_geom() is not None):
                has_no_geom = False
                checks_needed -= 1
                if checks_needed <= 0:
                    break
            if has_no_median and (d.get_median() is not None):
                has_no_median = False
                checks_needed -= 1
                if checks_needed <= 0:
                    break

        if seen <= 0:
            raise ValueError(
                f"{type_name(self.__cls)} CsvWriter did not see any data.")

        # stream statistics do not have geometric means or medians
        if self.__cls is StreamStatistics:
            has_no_geom = has_no_median = True

        n_not_needed = n_really_not_needed or n_not_needed
        #: do we have a geometric mean?
        has_geo_mean: Final[bool] = (not has_no_geom) and (not all_same)
        #: do we have a median?
        has_median: Final[bool] = (not has_no_median) and (not all_same)

        #: the key for `n` is `None` if `n` is not printed, else it is the key
        self.__key_n: Final[str | None] = None if n_not_needed \
            else csv_scope(scope, KEY_N)

        key_all: str | None = None
        key_min: str | None = None
        key_mean_arith: str | None = None
        key_med: str | None = None
        key_max: str | None = None
        key_mean_geom: str | None = None
        key_sd: str | None = None

        if all_same:
            # A single value column: without scope it is named KEY_VALUE;
            # with a scope, KEY_VALUE is only appended if `n` is written.
            key_all = KEY_VALUE if scope is None else (
                csv_scope(scope, None if self.__key_n is None else KEY_VALUE))
        else:
            key_min = csv_scope(scope, KEY_MINIMUM)
            key_mean_arith = csv_scope(scope, KEY_MEAN_ARITH)
            if has_median:
                key_med = csv_scope(scope, KEY_MEDIAN)
            key_max = csv_scope(scope, KEY_MAXIMUM)
            if has_geo_mean:
                key_mean_geom = csv_scope(scope, KEY_MEAN_GEOM)
            key_sd = csv_scope(scope, KEY_STDDEV)

        #: the key for single values
        self.__key_all: Final[str | None] = key_all
        #: the key for minimum values
        self.__key_min: Final[str | None] = key_min
        #: the key for the arithmetic mean
        self.__key_mean_arith: Final[str | None] = key_mean_arith
        #: the key for the median
        self.__key_med: Final[str | None] = key_med
        #: the key for the geometric mean
        self.__key_mean_geom: Final[str | None] = key_mean_geom
        #: the key for the maximum value
        self.__key_max: Final[str | None] = key_max
        #: the key for the standard deviation
        self.__key_sd: Final[str | None] = key_sd

        long_name: str | None = \
            None if what_long is None else str.strip(what_long)
        short_name: str | None = \
            None if what_short is None else str.strip(what_short)
        # If only one description is given, it is used for both; if both are
        # given, the long one is extended by the short one in parentheses.
        if long_name is None:
            long_name = short_name
        elif short_name is None:
            short_name = long_name
        else:
            long_name = f"{long_name} ({short_name})"

        #: the short description of what the statistics are about
        self.__short_name: Final[str | None] = short_name
        #: the long description of what the statistics are about
        self.__long_name: Final[str | None] = long_name

    def get_column_titles(self) -> Iterable[str]:
        """
        Get the column titles.

        The order is: `n` (if written), then either the single value column
        or minimum, arithmetic mean, median (if any), geometric mean (if
        any), maximum, and standard deviation.

        :returns: the column titles
        """
        if self.__key_n is not None:
            yield self.__key_n

        if self.__key_all is None:
            yield self.__key_min
            yield self.__key_mean_arith
            if self.__key_med is not None:
                yield self.__key_med
            if self.__key_mean_geom is not None:
                yield self.__key_mean_geom
            yield self.__key_max
            yield self.__key_sd
        else:
            yield self.__key_all

    def get_optional_row(self,
                         data: int | float | T | None,
                         n: int | None = None) -> Iterable[str]:
        """
        Render a row for a record that may be missing or a plain number.

        This function may be needed in cases where the statistics are part of
        other records that sometimes do not contain the record. For `None`,
        one empty cell per column is produced. A plain number is first
        converted to a record via `from_single_value` of the configured type.

        :param data: the data item
        :param n: the number of samples
        :returns: the optional row data
        :raises TypeError: if `data` is neither a number, a
            :class:`StreamStatistics`, nor `None`
        :raises ValueError: if `n` is given and differs from `data.n`

        >>> try:
        ...     list(CsvWriter([StreamStatistics.from_single_value(
        ...         1)]).get_optional_row("x"))
        ... except TypeError as te:
        ...     print(str(te)[:53])
        data should be an instance of any in {None, float, in
        """
        if data is None:
            # attach an empty row: one cell for `n` (if written) plus either
            # the single value cell or min, mean, max, sd and the optional
            # median and geometric mean cells
            for _ in range((0 if self.__key_n is None else 1) + (
                    (4 if self.__key_mean_geom is None else 5)
                    + (0 if self.__key_med is None else 1)
                    if self.__key_all is None else 1)):
                yield ""
            return
        if isinstance(data, int | float):  # convert single value
            data = cast("T", self.__cls.from_single_value(
                data, 1 if n is None else n))
        elif not isinstance(data, StreamStatistics):  # huh?
            raise type_error(data, "data", (
                int, float, StreamStatistics, None))
        elif (n is not None) and (n != data.n):  # sanity check
            raise ValueError(f"data.n={data.n} but n={n}.")
        yield from self.get_row(data)

    def get_row(self, data: T) -> Iterable[str]:
        """
        Render a single sample statistics to a CSV row.

        The cells follow the same order as the titles produced by
        :meth:`get_column_titles`.

        :param data: the data sample statistics
        :returns: the row iterator
        :raises ValueError: if only a single value column is written but
            minimum and maximum of `data` differ
        """
        if self.__key_n is not None:
            yield str(data.n)
        if self.__key_all is None:
            yield num_to_str(data.minimum)
            yield num_to_str(data.mean_arith)
            if self.__key_med is not None:
                yield num_to_str(data.get_median())
            if self.__key_mean_geom is not None:
                yield num_or_none_to_str(data.get_mean_geom())
            yield num_to_str(data.maximum)
            yield num_or_none_to_str(data.stddev)
        else:
            if data.minimum != data.maximum:
                raise ValueError(f"Inconsistent data {data}.")
            yield num_to_str(data.minimum)

    def get_header_comments(self) -> Iterable[str]:
        """
        Get any possible header comments.

        A single comment line is produced if both a scope and a description
        are available, otherwise nothing.

        :returns: the iterable of header comments
        """
        return [f"Statistics about {self.__long_name}."]\
            if (self.scope is not None) and (self.__long_name is not None)\
            else ()

    def get_footer_comments(self) -> Iterable[str]:
        """
        Get any possible footer comments.

        The footer explains every column that is written. The long
        description is used in the first explanation only; all later ones
        use the short description.

        :returns: the footer comments
        """
        # Both names are either empty or start with a single space, so they
        # can be spliced directly behind a word in the texts below.
        long_name: str | None = self.__long_name
        long_name = "" if long_name is None else f" {long_name}"
        short_name: str | None = self.__short_name
        short_name = "" if short_name is None else f" {short_name}"
        name: str = long_name
        first: bool = True

        scope: Final[str | None] = self.scope
        if (scope is not None) and (
                (self.__key_n is not None) or (
                self.__key_all is not None)):
            if first:
                yield ""
                first = False
            yield (f"All{name} sample statistics start with "
                   f"{(scope + SCOPE_SEPARATOR)!r}.")
            name = short_name

        if self.__key_n is not None:
            if first:
                yield ""
                first = False
            yield f"{self.__key_n}: the number of{name} samples"
            name = short_name
        if self.__key_all is None:
            if first:
                yield ""
            n_name: str | None = self.__key_n
            if n_name is None:
                n_name = KEY_N
            yield f"{self.__key_min}: the smallest encountered{name} value"
            name = short_name
            yield (f"{self.__key_mean_arith}: the arithmetic mean of all the"
                   f"{name} values, i.e., the sum of the values divided by "
                   f"their number {n_name}")
            if self.__key_med is not None:
                yield (f"{self.__key_med}: the median of all the{name} "
                       "values, which can be computed by sorting the values "
                       "and then picking the value in the middle of the "
                       f"sorted list (in case of an odd number {n_name} of "
                       "values) or the arithmetic mean (half the sum) of the "
                       "two values in the middle (in case of an even number "
                       f"{n_name})")
            if self.__key_mean_geom is not None:
                # `name` already carries its leading space; no extra space
                # may be inserted before it.
                yield (f"{self.__key_mean_geom}: the geometric mean of all the"
                       f"{name} values, i.e., the {n_name}-th root of the "
                       f"product of all values, which is only defined if all "
                       f"values are > 0")
            yield f"{self.__key_max}: the largest encountered{name} value"
            # NOTE(review): the formula in the text below has no square
            # root, i.e., it looks like the variance rather than the
            # standard deviation - confirm and adjust the wording if so.
            yield (f"{self.__key_sd}: the standard deviation of the{name} "
                   "values, which is a measure of spread: the larger it "
                   "is, the farther are the values distributed away from "
                   f"the arithmetic mean {self.__key_mean_arith}. It can be "
                   "computed as the ((sum of squares) - (square of the sum)"
                   f" / {n_name}) / ({n_name} - 1) of all{name} values.")
        else:
            if first:
                yield ""
            yield f"{self.__key_all}: all{name} samples have this value"

    def get_footer_bottom_comments(self) -> Iterable[str] | None:
        """
        Get the bottom footer comments.

        The comments are delegated to `pycommons_footer_bottom_comments`,
        with an additional note that names the statistics type in use.

        :returns: an iterator with the bottom comments

        >>> for p in CsvWriter([StreamStatistics(
        ...         1, 1, 1, 1, None)]).get_footer_bottom_comments():
        ...     print(p[:30])
        This CSV output has been creat
        Statistics were computed using
        You can find pycommons at http
        """
        yield from pycommons_footer_bottom_comments(
            self, ("Statistics were computed using pycommons."
                   f"math in mode {type_name(self.__cls)}."))