001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkState;
018import static com.google.common.math.DoubleUtils.ensureNonNegative;
019import static com.google.common.primitives.Doubles.isFinite;
020import static java.lang.Double.NaN;
021import static java.lang.Double.isNaN;
022
023import com.google.common.annotations.Beta;
024import com.google.common.annotations.GwtIncompatible;
025import java.util.Iterator;
026import java.util.stream.DoubleStream;
027import java.util.stream.IntStream;
028import java.util.stream.LongStream;
029
030/**
031 * A mutable object which accumulates double values and tracks some basic statistics over all the
032 * values added so far. The values may be added singly or in groups. This class is not thread safe.
033 *
034 * @author Pete Gillin
035 * @author Kevin Bourrillion
036 * @since 20.0
037 */
038@Beta
039@GwtIncompatible
040public final class StatsAccumulator {
041
042  // These fields must satisfy the requirements of Stats' constructor as well as those of the stat
043  // methods of this class.
044  private long count = 0;
045  private double mean = 0.0; // any finite value will do, we only use it to multiply by zero for sum
046  private double sumOfSquaresOfDeltas = 0.0;
047  private double min = NaN; // any value will do
048  private double max = NaN; // any value will do
049
050  /** Adds the given value to the dataset. */
051  public void add(double value) {
052    if (count == 0) {
053      count = 1;
054      mean = value;
055      min = value;
056      max = value;
057      if (!isFinite(value)) {
058        sumOfSquaresOfDeltas = NaN;
059      }
060    } else {
061      count++;
062      if (isFinite(value) && isFinite(mean)) {
063        // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) and (16)
064        double delta = value - mean;
065        mean += delta / count;
066        sumOfSquaresOfDeltas += delta * (value - mean);
067      } else {
068        mean = calculateNewMeanNonFinite(mean, value);
069        sumOfSquaresOfDeltas = NaN;
070      }
071      min = Math.min(min, value);
072      max = Math.max(max, value);
073    }
074  }
075
076  /**
077   * Adds the given values to the dataset.
078   *
079   * @param values a series of values, which will be converted to {@code double} values (this may
080   *     cause loss of precision)
081   */
082  public void addAll(Iterable<? extends Number> values) {
083    for (Number value : values) {
084      add(value.doubleValue());
085    }
086  }
087
088  /**
089   * Adds the given values to the dataset.
090   *
091   * @param values a series of values, which will be converted to {@code double} values (this may
092   *     cause loss of precision)
093   */
094  public void addAll(Iterator<? extends Number> values) {
095    while (values.hasNext()) {
096      add(values.next().doubleValue());
097    }
098  }
099
100  /**
101   * Adds the given values to the dataset.
102   *
103   * @param values a series of values
104   */
105  public void addAll(double... values) {
106    for (double value : values) {
107      add(value);
108    }
109  }
110
111  /**
112   * Adds the given values to the dataset.
113   *
114   * @param values a series of values
115   */
116  public void addAll(int... values) {
117    for (int value : values) {
118      add(value);
119    }
120  }
121
122  /**
123   * Adds the given values to the dataset.
124   *
125   * @param values a series of values, which will be converted to {@code double} values (this may
126   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
127   */
128  public void addAll(long... values) {
129    for (long value : values) {
130      add(value);
131    }
132  }
133
134  /**
135   * Adds the given values to the dataset. The stream will be completely consumed by this method.
136   *
137   * @param values a series of values
138   * @since 28.2
139   */
140  public void addAll(DoubleStream values) {
141    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
142  }
143
144  /**
145   * Adds the given values to the dataset. The stream will be completely consumed by this method.
146   *
147   * @param values a series of values
148   * @since 28.2
149   */
150  public void addAll(IntStream values) {
151    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
152  }
153
154  /**
155   * Adds the given values to the dataset. The stream will be completely consumed by this method.
156   *
157   * @param values a series of values, which will be converted to {@code double} values (this may
158   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
159   * @since 28.2
160   */
161  public void addAll(LongStream values) {
162    addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll));
163  }
164
165  /**
166   * Adds the given statistics to the dataset, as if the individual values used to compute the
167   * statistics had been added directly.
168   */
169  public void addAll(Stats values) {
170    if (values.count() == 0) {
171      return;
172    }
173    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
174  }
175
176  /**
177   * Adds the given statistics to the dataset, as if the individual values used to compute the
178   * statistics had been added directly.
179   *
180   * @since 28.2
181   */
182  public void addAll(StatsAccumulator values) {
183    if (values.count() == 0) {
184      return;
185    }
186    merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max());
187  }
188
189  private void merge(
190      long otherCount,
191      double otherMean,
192      double otherSumOfSquaresOfDeltas,
193      double otherMin,
194      double otherMax) {
195    if (count == 0) {
196      count = otherCount;
197      mean = otherMean;
198      sumOfSquaresOfDeltas = otherSumOfSquaresOfDeltas;
199      min = otherMin;
200      max = otherMax;
201    } else {
202      count += otherCount;
203      if (isFinite(mean) && isFinite(otherMean)) {
204        // This is a generalized version of the calculation in add(double) above.
205        double delta = otherMean - mean;
206        mean += delta * otherCount / count;
207        sumOfSquaresOfDeltas += otherSumOfSquaresOfDeltas + delta * (otherMean - mean) * otherCount;
208      } else {
209        mean = calculateNewMeanNonFinite(mean, otherMean);
210        sumOfSquaresOfDeltas = NaN;
211      }
212      min = Math.min(min, otherMin);
213      max = Math.max(max, otherMax);
214    }
215  }
216
217  /** Returns an immutable snapshot of the current statistics. */
218  public Stats snapshot() {
219    return new Stats(count, mean, sumOfSquaresOfDeltas, min, max);
220  }
221
222  /** Returns the number of values. */
223  public long count() {
224    return count;
225  }
226
227  /**
228   * Returns the <a href="http://en.wikipedia.org/wiki/Arithmetic_mean">arithmetic mean</a> of the
229   * values. The count must be non-zero.
230   *
231   * <p>If these values are a sample drawn from a population, this is also an unbiased estimator of
232   * the arithmetic mean of the population.
233   *
234   * <h3>Non-finite values</h3>
235   *
236   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
237   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
238   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
239   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
240   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
241   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
242   *
243   * @throws IllegalStateException if the dataset is empty
244   */
245  public double mean() {
246    checkState(count != 0);
247    return mean;
248  }
249
250  /**
251   * Returns the sum of the values.
252   *
253   * <h3>Non-finite values</h3>
254   *
255   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
256   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
257   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
258   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
259   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link
260   * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
261   */
262  public final double sum() {
263    return mean * count;
264  }
265
266  /**
267   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Population_variance">population
268   * variance</a> of the values. The count must be non-zero.
269   *
270   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
271   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
272   * due to numerical errors. However, it is guaranteed never to return a negative result.
273   *
274   * <h3>Non-finite values</h3>
275   *
276   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
277   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
278   *
279   * @throws IllegalStateException if the dataset is empty
280   */
281  public final double populationVariance() {
282    checkState(count != 0);
283    if (isNaN(sumOfSquaresOfDeltas)) {
284      return NaN;
285    }
286    if (count == 1) {
287      return 0.0;
288    }
289    return ensureNonNegative(sumOfSquaresOfDeltas) / count;
290  }
291
292  /**
293   * Returns the <a
294   * href="http://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values">
295   * population standard deviation</a> of the values. The count must be non-zero.
296   *
297   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It
298   * is not guaranteed to return zero when the dataset consists of the same value multiple times,
299   * due to numerical errors. However, it is guaranteed never to return a negative result.
300   *
301   * <h3>Non-finite values</h3>
302   *
303   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
304   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
305   *
306   * @throws IllegalStateException if the dataset is empty
307   */
308  public final double populationStandardDeviation() {
309    return Math.sqrt(populationVariance());
310  }
311
312  /**
313   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Sample_variance">unbiased sample
314   * variance</a> of the values. If this dataset is a sample drawn from a population, this is an
315   * unbiased estimator of the population variance of the population. The count must be greater than
316   * one.
317   *
318   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
319   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
320   *
321   * <h3>Non-finite values</h3>
322   *
323   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
324   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
325   *
326   * @throws IllegalStateException if the dataset is empty or contains a single value
327   */
328  public final double sampleVariance() {
329    checkState(count > 1);
330    if (isNaN(sumOfSquaresOfDeltas)) {
331      return NaN;
332    }
333    return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1);
334  }
335
336  /**
337   * Returns the <a
338   * href="http://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">
339   * corrected sample standard deviation</a> of the values. If this dataset is a sample drawn from a
340   * population, this is an estimator of the population standard deviation of the population which
341   * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on
342   * the distribution). The count must be greater than one.
343   *
344   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
345   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
346   *
347   * <h3>Non-finite values</h3>
348   *
349   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
350   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
351   *
352   * @throws IllegalStateException if the dataset is empty or contains a single value
353   */
354  public final double sampleStandardDeviation() {
355    return Math.sqrt(sampleVariance());
356  }
357
358  /**
359   * Returns the lowest value in the dataset. The count must be non-zero.
360   *
361   * <h3>Non-finite values</h3>
362   *
363   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
364   * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
365   * Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
366   * only then the result is the lowest finite value. If it contains {@link
367   * Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}.
368   *
369   * @throws IllegalStateException if the dataset is empty
370   */
371  public double min() {
372    checkState(count != 0);
373    return min;
374  }
375
376  /**
377   * Returns the highest value in the dataset. The count must be non-zero.
378   *
379   * <h3>Non-finite values</h3>
380   *
381   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
382   * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is {@link
383   * Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite values
384   * only then the result is the highest finite value. If it contains {@link
385   * Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}.
386   *
387   * @throws IllegalStateException if the dataset is empty
388   */
389  public double max() {
390    checkState(count != 0);
391    return max;
392  }
393
394  double sumOfSquaresOfDeltas() {
395    return sumOfSquaresOfDeltas;
396  }
397
398  /**
399   * Calculates the new value for the accumulated mean when a value is added, in the case where at
400   * least one of the previous mean and the value is non-finite.
401   */
402  static double calculateNewMeanNonFinite(double previousMean, double value) {
403    /*
404     * Desired behaviour is to match the results of applying the naive mean formula. In particular,
405     * the update formula can subtract infinities in cases where the naive formula would add them.
406     *
407     * Consequently:
408     * 1. If the previous mean is finite and the new value is non-finite then the new mean is that
409     *    value (whether it is NaN or infinity).
410     * 2. If the new value is finite and the previous mean is non-finite then the mean is unchanged
411     *    (whether it is NaN or infinity).
412     * 3. If both the previous mean and the new value are non-finite and...
413     * 3a. ...either or both is NaN (so mean != value) then the new mean is NaN.
414     * 3b. ...they are both the same infinities (so mean == value) then the mean is unchanged.
415     * 3c. ...they are different infinities (so mean != value) then the new mean is NaN.
416     */
417    if (isFinite(previousMean)) {
418      // This is case 1.
419      return value;
420    } else if (isFinite(value) || previousMean == value) {
421      // This is case 2. or 3b.
422      return previousMean;
423    } else {
424      // This is case 3a. or 3c.
425      return NaN;
426    }
427  }
428}