metrics_util/summary.rs
1use sketches_ddsketch::{Config, DDSketch};
2use std::fmt;
3
4/// A quantile sketch with relative-error guarantees.
5///
6/// Based on [DDSketch][ddsketch], `Summary` provides quantiles over an arbitrary distribution of
7/// floating-point numbers, including for negative numbers, using a space-efficient sketch that
8/// provides relative-error guarantees, regardless of the absolute range between the smallest and
9/// larger values.
10///
11/// `Summary` is similiar to [HDRHistogram][hdrhistogram] in practice, but supports an arbitrary
12/// range of values, and supports floating-point numbers.
13///
14/// Numbers with an absolute value smaller than given `min_value` will be recognized as zeroes.
15///
16/// Memory usage for `Summary` should be nearly identical to `DDSketch`.
17/// [`Summary::estimated_size`] provides a rough estimate of summary size based on the current
18/// values that have been added to it.
19///
20/// As mentioned above, this sketch provides relative-error guarantees across quantiles falling
21/// within 0 <= q <= 1, but trades some accuracy at the lowest quantiles as part of the collapsing
22/// scheme that allows for automatically handling arbitrary ranges of values, even when the
23/// maximum number of bins has been allocated. Typically, q=0.05 and below is where this error will
24/// be noticed, if present.
25///
26/// For cases when all values are positive, you can simply use [`Summary::min`] in lieu of checking
27/// these quantiles, as the minimum value will be closer to the true value. For cases when values
28/// range from negative to positive, the aforementioned collapsing will perturb the estimated true
29/// value for quantiles that conceptually fall within this collapsed band.
30///
31/// For example, for a distribution that spans from -25 to 75, we would intuitively expect q=0 to be
32/// -25, q=0.25 to be 0, q=0.5 to be 25, and so on. Internally, negative numbers and positive
33/// numbers are handled in two separate containers. Based on this example, one container would
34/// handle -25 to 0, and another would handle the 0 to 75 range. As the containers are mapped "back
35/// to back", q=0.25 for this hypothetical summary would actually be q=0 within the negative
36/// container, which may return an estimated true value that exceeds the relative error guarantees.
37///
38/// Of course, as these problems are related to the estimation aspect of this data structure, users
39/// can allow the summary to allocate more bins to compensate for these edge cases, if desired.
40///
41/// [ddsketch]: https://arxiv.org/abs/1908.10693
42/// [hdrhistogram]: https://docs.rs/hdrhistogram
43#[derive(Clone)]
44pub struct Summary {
45 sketch: DDSketch,
46}
47
48impl Summary {
49 /// Creates a new [`Summary`].
50 ///
51 /// `alpha` represents the desired relative error for this summary. If `alpha` was 0.0001, that
52 /// would represent a desired relative error of 0.01%. For example, if the true value at
53 /// quantile q0 was 1, the estimated value at that quantile would be a value within 0.01% of the
54 /// true value, or a value between 0.9999 and 1.0001.
55 ///
56 /// `max_buckets` controls how many subbuckets are created, which directly influences memory usage.
57 /// Each bucket "costs" eight bytes, so a summary with 2048 buckets would consume a maximum of
58 /// around 16 KiB. Depending on how many samples have been added to the summary, the number of
59 /// subbuckets allocated may be far below `max_buckets`, and the summary will allocate more as
60 /// needed to fulfill the relative error guarantee.
61 ///
62 /// `min_value` controls the smallest value that will be recognized distinctly from zero. Said
63 /// another way, any value between `-min_value` and `min_value` will be counted as zero.
64 pub fn new(alpha: f64, max_buckets: u32, min_value: f64) -> Summary {
65 let config = Config::new(alpha, max_buckets, min_value.abs());
66
67 Summary { sketch: DDSketch::new(config) }
68 }
69
70 /// Creates a new [`Summary`] with default values.
71 ///
72 /// `alpha` is 0.0001, `max_buckets` is 32,768, and `min_value` is 1.0e-9.
73 ///
74 /// This will yield a summary that is roughly equivalent in memory usage to an HDRHistogram with
75 /// 3 significant digits, and will support values down to a single nanosecond.
76 ///
77 /// In practice, when using only positive values, maximum memory usage can be expected to hover
78 /// around 200KiB, while usage of negative values can lead to an average maximum size of around
79 /// 400KiB.
80 pub fn with_defaults() -> Summary {
81 Summary::new(0.0001, 32_768, 1.0e-9)
82 }
83
84 /// Adds a sample to the summary.
85 ///
86 /// If the absolute value of `value` is smaller than given `min_value`, it will be added as a zero.
87 pub fn add(&mut self, value: f64) {
88 if value.is_infinite() {
89 return;
90 }
91
92 self.sketch.add(value);
93 }
94
95 /// Gets the estimated value at the given quantile.
96 ///
97 /// If the sketch is empty, or if the quantile is less than 0.0 or greater than 1.0, then the
98 /// result will be `None`.
99 ///
100 /// If the 0.0 or 1.0 quantile is requested, this function will return self.min() or self.max()
101 /// instead of the estimated value.
102 pub fn quantile(&self, q: f64) -> Option<f64> {
103 if !(0.0..=1.0).contains(&q) || self.count() == 0 {
104 return None;
105 }
106
107 self.sketch.quantile(q).expect("quantile should be valid at this point")
108 }
109
110 /// Merge another Summary into this one.
111 ///
112 /// # Errors
113 ///
114 /// This function will return an error if the other Summary was not created with the same
115 /// parameters.
116 pub fn merge(&mut self, other: &Summary) -> Result<(), MergeError> {
117 self.sketch.merge(&other.sketch).map_err(|_| MergeError {})?;
118 Ok(())
119 }
120
121 /// Gets the minimum value this summary has seen so far.
122 pub fn min(&self) -> f64 {
123 self.sketch.min().unwrap_or(f64::INFINITY)
124 }
125
126 /// Gets the maximum value this summary has seen so far.
127 pub fn max(&self) -> f64 {
128 self.sketch.max().unwrap_or(f64::NEG_INFINITY)
129 }
130
131 /// Whether or not this summary is empty.
132 pub fn is_empty(&self) -> bool {
133 self.count() == 0
134 }
135
136 /// Gets the number of samples in this summary.
137 pub fn count(&self) -> usize {
138 self.sketch.count()
139 }
140
141 /// Gets the estimized size of this summary, in bytes.
142 ///
143 /// In practice, this value should be very close to the actual size, but will not be entirely
144 /// precise.
145 pub fn estimated_size(&self) -> usize {
146 std::mem::size_of::<Self>() + (self.sketch.length() * 8)
147 }
148}
149
150#[derive(Copy, Clone, Debug)]
151pub struct MergeError {}
152
153impl fmt::Display for MergeError {
154 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
155 write!(f, "merge error")
156 }
157}
158
159impl std::error::Error for MergeError {}
160
161#[cfg(test)]
162mod tests {
163 use super::Summary;
164
165 use quickcheck_macros::quickcheck;
166
167 // Need this, because without the relative_eq/abs_diff_eq imports, we get weird IDE errors.
168 #[allow(unused_imports)]
169 use approx::{abs_diff_eq, assert_abs_diff_eq, assert_relative_eq, relative_eq};
170
171 use ndarray::{Array1, Axis};
172 use ndarray_stats::{interpolate::Linear, QuantileExt};
173 use noisy_float::types::n64;
174 use ordered_float::NotNan;
175 use rand::{distributions::Distribution, thread_rng};
176 use rand_distr::Uniform;
177
178 #[test]
179 fn test_basics() {
180 let alpha = 0.0001;
181 let max_bins = 32_768;
182 let min_value = 1.0e-9;
183 let mut summary = Summary::new(alpha, max_bins, min_value);
184 assert!(summary.is_empty());
185
186 // Stretch the legs with a single value.
187 summary.add(-420.42);
188 assert_eq!(summary.count(), 1);
189 assert_relative_eq!(summary.min(), -420.42);
190 assert_relative_eq!(summary.max(), -420.42);
191
192 let test_cases = vec![(0.1, -420.42), (0.5, -420.42), (0.9, -420.42)];
193 for (q, val) in test_cases {
194 assert_relative_eq!(
195 summary.quantile(q).expect("value should exist"),
196 val,
197 max_relative = alpha
198 );
199 }
200
201 summary.add(420.42);
202
203 assert_eq!(summary.count(), 2);
204 assert_relative_eq!(summary.min(), -420.42);
205 assert_relative_eq!(summary.max(), 420.42);
206 assert_relative_eq!(
207 summary.quantile(0.5).expect("value should exist"),
208 -420.42,
209 max_relative = alpha
210 );
211 assert_relative_eq!(
212 summary.quantile(0.51).expect("value should exist"),
213 -420.42,
214 max_relative = alpha
215 );
216
217 summary.add(42.42);
218 assert_eq!(summary.count(), 3);
219 assert_relative_eq!(summary.min(), -420.42);
220 assert_relative_eq!(summary.max(), 420.42);
221
222 let test_cases = vec![
223 (0.333333, -420.42),
224 (0.333334, -420.42),
225 (0.666666, 42.42),
226 (0.666667, 42.42),
227 (0.999999, 42.42),
228 ];
229 for (q, val) in test_cases {
230 assert_relative_eq!(
231 summary.quantile(q).expect("value should exist"),
232 val,
233 max_relative = alpha
234 );
235 }
236 }
237
238 #[test]
239 fn test_positive_uniform() {
240 let alpha = 0.0001;
241 let max_bins = 32_768;
242 let min_value = 1.0e-9;
243
244 let mut rng = thread_rng();
245 let dist = Uniform::new(0.0, 100.0);
246
247 let mut summary = Summary::new(alpha, max_bins, min_value);
248 let mut uniform = Vec::new();
249 for _ in 0..100_000 {
250 let value = dist.sample(&mut rng);
251 uniform.push(NotNan::new(value).unwrap());
252 summary.add(value);
253 }
254
255 uniform.sort();
256 let mut true_histogram = Array1::from(uniform);
257
258 let quantiles = &[0.25, 0.5, 0.75, 0.99];
259 for quantile in quantiles {
260 let aval_raw = true_histogram
261 .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
262 .expect("quantile should be in range");
263 let aval = aval_raw.get(()).expect("quantile value should be present").into_inner();
264 let sval = summary.quantile(*quantile).expect("quantile value should be present");
265
266 // Multiply the true value by α, and double it to account from the -α/α swing.
267 let distance = (aval * alpha) * 2.0;
268
269 assert_relative_eq!(aval, sval, max_relative = distance);
270 }
271 }
272
273 #[test]
274 fn test_negative_positive_uniform() {
275 let alpha = 0.0001;
276 let max_bins = 65_536;
277 let min_value = 1.0e-9;
278
279 let mut rng = thread_rng();
280 let dist = Uniform::new(-100.0, 100.0);
281
282 let mut summary = Summary::new(alpha, max_bins, min_value);
283 let mut uniform = Vec::new();
284 for _ in 0..100_000 {
285 let value = dist.sample(&mut rng);
286 uniform.push(NotNan::new(value).unwrap());
287 summary.add(value);
288 }
289
290 uniform.sort();
291 let mut true_histogram = Array1::from(uniform);
292
293 // We explicitly skirt q=0.5 here to avoid the edge case quantiles as best as possible
294 // while asserting tightly to our relative error bound for everything else.
295 let quantiles = &[0.25, 0.47, 0.75, 0.99];
296 for quantile in quantiles {
297 let aval_raw = true_histogram
298 .quantile_axis_mut(Axis(0), n64(*quantile), &Linear)
299 .expect("quantile should be in range");
300 let aval = aval_raw.get(()).expect("quantile value should be present").into_inner();
301 let sval = summary.quantile(*quantile).expect("quantile value should be present");
302
303 // Multiply the true value by α, and quadruple it to account from the -α/α swing,
304 // but also to account for the values sitting at the edge case quantiles.
305 let distance = (aval.abs() * alpha) * 2.0;
306
307 assert_relative_eq!(aval, sval, max_relative = distance);
308 }
309 }
310
311 #[test]
312 fn test_zeroes() {
313 let mut summary = Summary::with_defaults();
314 summary.add(0.0);
315 assert_eq!(summary.quantile(0.5), Some(0.0));
316 }
317
318 #[test]
319 fn test_infinities() {
320 let mut summary = Summary::with_defaults();
321 summary.add(f64::INFINITY);
322 assert_eq!(summary.quantile(0.5), None);
323 summary.add(f64::NEG_INFINITY);
324 assert_eq!(summary.quantile(0.5), None);
325 }
326
327 #[quickcheck]
328 fn quantile_validity(inputs: Vec<f64>) -> bool {
329 let mut had_non_inf = false;
330
331 let mut summary = Summary::with_defaults();
332 for input in &inputs {
333 if !input.is_infinite() {
334 had_non_inf = true;
335 }
336 summary.add(*input);
337 }
338
339 let qs = &[0.0, 0.5, 0.9, 0.95, 0.99, 0.999, 1.0];
340 for q in qs {
341 let result = summary.quantile(*q);
342 if had_non_inf {
343 assert!(result.is_some());
344 } else {
345 assert!(result.is_none());
346 }
347 }
348
349 true
350 }
351}