001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkState;
018import static com.google.common.math.DoubleUtils.ensureNonNegative;
019import static com.google.common.primitives.Doubles.isFinite;
020import static java.lang.Double.NaN;
021import static java.lang.Double.isNaN;
022
023import com.google.common.annotations.Beta;
024import com.google.common.annotations.GwtIncompatible;
025import java.util.Iterator;
026
027/**
028 * A mutable object which accumulates double values and tracks some basic statistics over all the
029 * values added so far. The values may be added singly or in groups. This class is not thread safe.
030 *
031 * @author Pete Gillin
032 * @author Kevin Bourrillion
033 * @since 20.0
034 */
035@Beta
036@GwtIncompatible
037public final class StatsAccumulator {
038
039  // These fields must satisfy the requirements of Stats' constructor as well as those of the stat
040  // methods of this class.
041  private long count = 0;
042  private double mean = 0.0; // any finite value will do, we only use it to multiply by zero for sum
043  private double sumOfSquaresOfDeltas = 0.0;
044  private double min = NaN; // any value will do
045  private double max = NaN; // any value will do
046
047  /**
048   * Adds the given value to the dataset.
049   */
050  public void add(double value) {
051    if (count == 0) {
052      count = 1;
053      mean = value;
054      min = value;
055      max = value;
056      if (!isFinite(value)) {
057        sumOfSquaresOfDeltas = NaN;
058      }
059    } else {
060      count++;
061      if (isFinite(value) && isFinite(mean)) {
062        // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) and (16)
063        double delta = value - mean;
064        mean += delta / count;
065        sumOfSquaresOfDeltas += delta * (value - mean);
066      } else {
067        mean = calculateNewMeanNonFinite(mean, value);
068        sumOfSquaresOfDeltas = NaN;
069      }
070      min = Math.min(min, value);
071      max = Math.max(max, value);
072    }
073  }
074
075  /**
076   * Adds the given values to the dataset.
077   *
078   * @param values a series of values, which will be converted to {@code double} values (this may
079   *     cause loss of precision)
080   */
081  public void addAll(Iterable<? extends Number> values) {
082    for (Number value : values) {
083      add(value.doubleValue());
084    }
085  }
086
087  /**
088   * Adds the given values to the dataset.
089   *
090   * @param values a series of values, which will be converted to {@code double} values (this may
091   *     cause loss of precision)
092   */
093  public void addAll(Iterator<? extends Number> values) {
094    while (values.hasNext()) {
095      add(values.next().doubleValue());
096    }
097  }
098
099  /**
100   * Adds the given values to the dataset.
101   *
102   * @param values a series of values
103   */
104  public void addAll(double... values) {
105    for (double value : values) {
106      add(value);
107    }
108  }
109
110  /**
111   * Adds the given values to the dataset.
112   *
113   * @param values a series of values
114   */
115  public void addAll(int... values) {
116    for (int value : values) {
117      add(value);
118    }
119  }
120
121  /**
122   * Adds the given values to the dataset.
123   *
124   * @param values a series of values, which will be converted to {@code double} values (this may
125   *     cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15))
126   */
127  public void addAll(long... values) {
128    for (long value : values) {
129      add(value);
130    }
131  }
132
133  /**
134   * Adds the given statistics to the dataset, as if the individual values used to compute the
135   * statistics had been added directly.
136   */
137  public void addAll(Stats values) {
138    if (values.count() == 0) {
139      return;
140    }
141
142    if (count == 0) {
143      count = values.count();
144      mean = values.mean();
145      sumOfSquaresOfDeltas = values.sumOfSquaresOfDeltas();
146      min = values.min();
147      max = values.max();
148    } else {
149      count += values.count();
150      if (isFinite(mean) && isFinite(values.mean())) {
151        // This is a generalized version of the calculation in add(double) above.
152        double delta = values.mean() - mean;
153        mean += delta * values.count() / count;
154        sumOfSquaresOfDeltas +=
155            values.sumOfSquaresOfDeltas() + delta * (values.mean() - mean) * values.count();
156      } else {
157        mean = calculateNewMeanNonFinite(mean, values.mean());
158        sumOfSquaresOfDeltas = NaN;
159      }
160      min = Math.min(min, values.min());
161      max = Math.max(max, values.max());
162    }
163  }
164
165  /**
166   * Returns an immutable snapshot of the current statistics.
167   */
168  public Stats snapshot() {
169    return new Stats(count, mean, sumOfSquaresOfDeltas, min, max);
170  }
171
172  /**
173   * Returns the number of values.
174   */
175  public long count() {
176    return count;
177  }
178
179  /**
180   * Returns the <a href="http://en.wikipedia.org/wiki/Arithmetic_mean">arithmetic mean</a> of the
181   * values. The count must be non-zero.
182   *
183   * <p>If these values are a sample drawn from a population, this is also an unbiased estimator of
184   * the arithmetic mean of the population.
185   *
186   * <h3>Non-finite values</h3>
187   *
188   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
189   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
190   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
191   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
192   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or
193   * {@link Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
194   *
195   * @throws IllegalStateException if the dataset is empty
196   */
197  public double mean() {
198    checkState(count != 0);
199    return mean;
200  }
201
202  /**
203   * Returns the sum of the values.
204   *
205   * <h3>Non-finite values</h3>
206   *
207   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
208   * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the
209   * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values
210   * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}.
211   * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or
212   * {@link Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}.
213   */
214  public final double sum() {
215    return mean * count;
216  }
217
218  /**
219   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Population_variance">population
220   * variance</a> of the values. The count must be non-zero.
221   *
222   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value.
223   * It is not guaranteed to return zero when the dataset consists of the same value multiple times,
224   * due to numerical errors. However, it is guaranteed never to return a negative result.
225   *
226   * <h3>Non-finite values</h3>
227   *
228   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
229   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
230   *
231   * @throws IllegalStateException if the dataset is empty
232   */
233  public final double populationVariance() {
234    checkState(count != 0);
235    if (isNaN(sumOfSquaresOfDeltas)) {
236      return NaN;
237    }
238    if (count == 1) {
239      return 0.0;
240    }
241    return ensureNonNegative(sumOfSquaresOfDeltas) / count;
242  }
243
244  /**
245   * Returns the
246   * <a href="http://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values">
247   * population standard deviation</a> of the values. The count must be non-zero.
248   *
249   * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value.
250   * It is not guaranteed to return zero when the dataset consists of the same value multiple times,
251   * due to numerical errors. However, it is guaranteed never to return a negative result.
252   *
253   * <h3>Non-finite values</h3>
254   *
255   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
256   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
257   *
258   * @throws IllegalStateException if the dataset is empty
259   */
260  public final double populationStandardDeviation() {
261    return Math.sqrt(populationVariance());
262  }
263
264  /**
265   * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Sample_variance">unbiased sample
266   * variance</a> of the values. If this dataset is a sample drawn from a population, this is an
267   * unbiased estimator of the population variance of the population. The count must be greater than
268   * one.
269   *
270   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
271   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
272   *
273   * <h3>Non-finite values</h3>
274   *
275   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
276   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
277   *
278   * @throws IllegalStateException if the dataset is empty or contains a single value
279   */
280  public final double sampleVariance() {
281    checkState(count > 1);
282    if (isNaN(sumOfSquaresOfDeltas)) {
283      return NaN;
284    }
285    return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1);
286  }
287
288  /**
289   * Returns the
290   * <a href="http://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">
291   * corrected sample standard deviation</a> of the values. If this dataset is a sample drawn from a
292   * population, this is an estimator of the population standard deviation of the population which
293   * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on
294   * the distribution). The count must be greater than one.
295   *
296   * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple
297   * times, due to numerical errors. However, it is guaranteed never to return a negative result.
298   *
299   * <h3>Non-finite values</h3>
300   *
301   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
302   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
303   *
304   * @throws IllegalStateException if the dataset is empty or contains a single value
305   */
306  public final double sampleStandardDeviation() {
307    return Math.sqrt(sampleVariance());
308  }
309
310  /**
311   * Returns the lowest value in the dataset. The count must be non-zero.
312   *
313   * <h3>Non-finite values</h3>
314   *
315   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
316   * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is
317   * {@link Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite
318   * values only then the result is the lowest finite value. If it contains
319   * {@link Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}.
320   *
321   * @throws IllegalStateException if the dataset is empty
322   */
323  public double min() {
324    checkState(count != 0);
325    return min;
326  }
327
328  /**
329   * Returns the highest value in the dataset. The count must be non-zero.
330   *
331   * <h3>Non-finite values</h3>
332   *
333   * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it
334   * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is
335   * {@link Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite
336   * values only then the result is the highest finite value. If it contains
337   * {@link Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}.
338   *
339   * @throws IllegalStateException if the dataset is empty
340   */
341  public double max() {
342    checkState(count != 0);
343    return max;
344  }
345
346  double sumOfSquaresOfDeltas() {
347    return sumOfSquaresOfDeltas;
348  }
349
350  /**
351   * Calculates the new value for the accumulated mean when a value is added, in the case where at
352   * least one of the previous mean and the value is non-finite.
353   */
354  static double calculateNewMeanNonFinite(double previousMean, double value) {
355    /*
356     * Desired behaviour is to match the results of applying the naive mean formula. In particular,
357     * the update formula can subtract infinities in cases where the naive formula would add them.
358     *
359     * Consequently:
360     * 1. If the previous mean is finite and the new value is non-finite then the new mean is that
361     *    value (whether it is NaN or infinity).
362     * 2. If the new value is finite and the previous mean is non-finite then the mean is unchanged
363     *    (whether it is NaN or infinity).
364     * 3. If both the previous mean and the new value are non-finite and...
365     * 3a. ...either or both is NaN (so mean != value) then the new mean is NaN.
366     * 3b. ...they are both the same infinities (so mean == value) then the mean is unchanged.
367     * 3c. ...they are different infinities (so mean != value) then the new mean is NaN.
368     */
369    if (isFinite(previousMean)) {
370      // This is case 1.
371      return value;
372    } else if (isFinite(value) || previousMean == value) {
373      // This is case 2. or 3b.
374      return previousMean;
375    } else {
376      // This is case 3a. or 3c.
377      return NaN;
378    }
379  }
380}