001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkState;
018import static com.google.common.math.DoubleUtils.ensureNonNegative;
019import static com.google.common.primitives.Doubles.isFinite;
020import static java.lang.Double.NaN;
021import static java.lang.Double.isNaN;
022
023import com.google.common.annotations.GwtIncompatible;
024import com.google.common.annotations.J2ktIncompatible;
025import java.util.Iterator;
026import java.util.stream.DoubleStream;
027import java.util.stream.IntStream;
028import java.util.stream.LongStream;
029
030/**
031 * A mutable object which accumulates double values and tracks some basic statistics over all the
032 * values added so far. The values may be added singly or in groups. This class is not thread safe.
033 *
034 * @author Pete Gillin
035 * @author Kevin Bourrillion
036 * @since 20.0
037 */
038@J2ktIncompatible
039@GwtIncompatible
040@ElementTypesAreNonnullByDefault
041public final class StatsAccumulator {
042
043  // These fields must satisfy the requirements of Stats' constructor as well as those of the stat
044  // methods of this class.
045  private long count = 0;
046  private double mean = 0.0; // any finite value will do, we only use it to multiply by zero for sum
047  private double sumOfSquaresOfDeltas = 0.0;
048  private double min = NaN; // any value will do
049  private double max = NaN; // any value will do
050
051  /** Adds the given value to the dataset. */
052  public void add(double value) {
053    if (count == 0) {
054      count = 1;
055      mean = value;
056      min = value;
057      max = value;
058      if (!isFinite(value)) {
059        sumOfSquaresOfDeltas = NaN;
060      }
061    } else {
062      count++;
063      if (isFinite(value) && isFinite(mean)) {
064        // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) and (16)
065        double delta = value - mean;
066        mean += delta / count;
067        sumOfSquaresOfDeltas += delta * (value - mean);
068      } else {
069        mean = calculateNewMeanNonFinite(mean, value);
070        sumOfSquaresOfDeltas = NaN;
071      }
072      min = Math.min(min, value);
073      max = Math.max(max, value);
074    }
075  }
076
077  /**
078   * Adds the given values to the dataset.
079   *
080   * @param values a series of values, which will be converted to {@code double} values (this may
081   *     cause loss of precision)
082   */
083  public void addAll(Iterable<? extends Number> values) {
084    for (Number value : values) {
085      add(value.doubleValue());
086    }
087  }
088
089  /**
090   * Adds the given values to the dataset.
091   *
092   * @param values a series of values, which will be converted to {@code double} values (this may
093   *     cause loss of precision)
094   */
095  public void addAll(Iterator<? extends Number> values) {
096    while (values.hasNext()) {
097      add(values.next().doubleValue());
098    }
099  }
100
101  /**
102   * Adds the given values to the dataset.
103   *
104   * @param values a series of values
105   */
106  public void addAll(double... values) {
107    for (double value : values) {
108      add(value);
109    }
110  }
111
112  /**
113   * Adds the given values to the dataset.
114   *
115   * @param values a series of values
116   */
117  public void addAll(int... values) {
118    for (int value : values) {
119      add(value);
120    }
121  }
122
123  /**
124   * Adds the given values to the dataset.
125   *
126   * @param values a series of values, which will be converted to {@code double} values (this may
127   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
128   */
129  public void addAll(long... values) {
130    for (long value : values) {
131      add(value);
132    }
133  }
134
135  /**
136   * Adds the given values to the dataset. The stream will be completely consumed by this method.
137   *
138   * @param values a series of values
139   * @since 28.2
140   */
141  public void addAll(DoubleStream values) {
142    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
143  }
144
145  /**
146   * Adds the given values to the dataset. The stream will be completely consumed by this method.
147   *
148   * @param values a series of values
149   * @since 28.2
150   */
151  public void addAll(IntStream values) {
152    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
153  }
154
155  /**
156   * Adds the given values to the dataset. The stream will be completely consumed by this method.
157   *
158   * @param values a series of values, which will be converted to {@code double} values (this may
159   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
160   * @since 28.2
161   */
162  public void addAll(LongStream values) {
163    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
164  }
165
166  /**
167   * Adds the given statistics to the dataset, as if the individual values used to compute the
168   * statistics had been added directly.
169   */
170  public void addAll(Stats values) {
171    if (values.count() == 0) {
172      return;
173    }
174    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
175  }
176
177  /**
178   * Adds the given statistics to the dataset, as if the individual values used to compute the
179   * statistics had been added directly.
180   *
181   * @since 28.2
182   */
183  public void addAll(StatsAccumulator values) {
184    if (values.count() == 0) {
185      return;
186    }
187    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
188  }
189
190  private void merge(
191      long otherCount,
192      double otherMean,
193      double otherSumOfSquaresOfDeltas,
194      double otherMin,
195      double otherMax) {
196    if (count == 0) {
197      count = otherCount;
198      mean = otherMean;
199      sumOfSquaresOfDeltas = otherSumOfSquaresOfDeltas;
200      min = otherMin;
201      max = otherMax;
202    } else {
203      count += otherCount;
204      if (isFinite(mean) && isFinite(otherMean)) {
205        // This is a generalized version of the calculation in add(double) above.
206        double delta = otherMean - mean;
207        mean += delta * otherCount / count;
208        sumOfSquaresOfDeltas += otherSumOfSquaresOfDeltas + delta * (otherMean - mean) * otherCount;
209      } else {
210        mean = calculateNewMeanNonFinite(mean, otherMean);
211        sumOfSquaresOfDeltas = NaN;
212      }
213      min = Math.min(min, otherMin);
214      max = Math.max(max, otherMax);
215    }
216  }
217
218  /** Returns an immutable snapshot of the current statistics. */
219  public Stats snapshot() {
220    return new Stats(count, mean, sumOfSquaresOfDeltas, min, max);
221  }
222
223  /** Returns the number of values. */
224  public long count() {
225    return count;
226  }
227
228  /**
229   * Returns the <a href="http://en.wikipedia.org/wiki/Arithmetic_mean">arithmetic mean</a> of the
230   * values. The count must be non-zero.
231   *
232   * <p>If these values are a sample drawn from a population, this is also an unbiased estimator of
233   * the arithmetic mean of the population.
234   *
235   * <h3>Non-finite values</h3>
236   *
237   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
238   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
239   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
240   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
241   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
242   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
243   *
244   * @throws IllegalStateException if the dataset is empty
245   */
246  public double mean() {
247    checkState(count != 0);
248    return mean;
249  }
250
251  /**
252   * Returns the sum of the values.
253   *
254   * <h3>Non-finite values</h3>
255   *
256   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
257   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
258   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
259   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
260   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
261   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
262   */
263  public final double sum() {
264    return mean * count;
265  }
266
267  /**
268   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Population_variance">population
269   * variance</a> of the values. The count must be non-zero.
270   *
271   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
272   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
273   * due to numerical errors. However, it is guaranteed never to return a negative result.
274   *
275   * <h3>Non-finite values</h3>
276   *
277   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
278   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
279   *
280   * @throws IllegalStateException if the dataset is empty
281   */
282  public final double populationVariance() {
283    checkState(count != 0);
284    if (isNaN(sumOfSquaresOfDeltas)) {
285      return NaN;
286    }
287    if (count == 1) {
288      return 0.0;
289    }
290    return ensureNonNegative(sumOfSquaresOfDeltas) / count;
291  }
292
293  /**
294   * Returns the <a
295   * href="http://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values">
296   * population standard deviation</a> of the values. The count must be non-zero.
297   *
298   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
299   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
300   * due to numerical errors. However, it is guaranteed never to return a negative result.
301   *
302   * <h3>Non-finite values</h3>
303   *
304   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
305   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
306   *
307   * @throws IllegalStateException if the dataset is empty
308   */
309  public final double populationStandardDeviation() {
310    return Math.sqrt(populationVariance());
311  }
312
313  /**
314   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Sample_variance">unbiased sample
315   * variance</a> of the values. If this dataset is a sample drawn from a population, this is an
316   * unbiased estimator of the population variance of the population. The count must be greater than
317   * one.
318   *
319   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
320   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
321   *
322   * <h3>Non-finite values</h3>
323   *
324   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
325   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
326   *
327   * @throws IllegalStateException if the dataset is empty or contains a single value
328   */
329  public final double sampleVariance() {
330    checkState(count > 1);
331    if (isNaN(sumOfSquaresOfDeltas)) {
332      return NaN;
333    }
334    return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1);
335  }
336
337  /**
338   * Returns the <a
339   * href="http://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">
340   * corrected sample standard deviation</a> of the values. If this dataset is a sample drawn from a
341   * population, this is an estimator of the population standard deviation of the population which
342   * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on
343   * the distribution). The count must be greater than one.
344   *
345   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
346   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
347   *
348   * <h3>Non-finite values</h3>
349   *
350   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
351   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
352   *
353   * @throws IllegalStateException if the dataset is empty or contains a single value
354   */
355  public final double sampleStandardDeviation() {
356    return Math.sqrt(sampleVariance());
357  }
358
359  /**
360   * Returns the lowest value in the dataset. The count must be non-zero.
361   *
362   * <h3>Non-finite values</h3>
363   *
364   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
365   * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
366   * Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
367   * only then the result is the lowest finite value. If it contains {@link
368   * Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}.
369   *
370   * @throws IllegalStateException if the dataset is empty
371   */
372  public double min() {
373    checkState(count != 0);
374    return min;
375  }
376
377  /**
378   * Returns the highest value in the dataset. The count must be non-zero.
379   *
380   * <h3>Non-finite values</h3>
381   *
382   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
383   * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
384   * Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite values
385   * only then the result is the highest finite value. If it contains {@link
386   * Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}.
387   *
388   * @throws IllegalStateException if the dataset is empty
389   */
390  public double max() {
391    checkState(count != 0);
392    return max;
393  }
394
395  double sumOfSquaresOfDeltas() {
396    return sumOfSquaresOfDeltas;
397  }
398
399  /**
400   * Calculates the new value for the accumulated mean when a value is added, in the case where at
401   * least one of the previous mean and the value is non-finite.
402   */
403  static double calculateNewMeanNonFinite(double previousMean, double value) {
404    /*
405     * Desired behaviour is to match the results of applying the naive mean formula. In particular,
406     * the update formula can subtract infinities in cases where the naive formula would add them.
407     *
408     * Consequently:
409     * 1. If the previous mean is finite and the new value is non-finite then the new mean is that
410     *    value (whether it is NaN or infinity).
411     * 2. If the new value is finite and the previous mean is non-finite then the mean is unchanged
412     *    (whether it is NaN or infinity).
413     * 3. If both the previous mean and the new value are non-finite and...
414     * 3a. ...either or both is NaN (so mean != value) then the new mean is NaN.
415     * 3b. ...they are both the same infinities (so mean == value) then the mean is unchanged.
416     * 3c. ...they are different infinities (so mean != value) then the new mean is NaN.
417     */
418    if (isFinite(previousMean)) {
419      // This is case 1.
420      return value;
421    } else if (isFinite(value) || previousMean == value) {
422      // This is case 2. or 3b.
423      return previousMean;
424    } else {
425      // This is case 3a. or 3c.
426      return NaN;
427    }
428  }
429}