001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkState; 018import static com.google.common.math.DoubleUtils.ensureNonNegative; 019import static com.google.common.primitives.Doubles.isFinite; 020import static java.lang.Double.NaN; 021import static java.lang.Double.isNaN; 022 023import com.google.common.annotations.GwtIncompatible; 024import com.google.common.annotations.J2ktIncompatible; 025import java.util.Iterator; 026import java.util.stream.DoubleStream; 027import java.util.stream.IntStream; 028import java.util.stream.LongStream; 029 030/** 031 * A mutable object which accumulates double values and tracks some basic statistics over all the 032 * values added so far. The values may be added singly or in groups. This class is not thread safe. 033 * 034 * @author Pete Gillin 035 * @author Kevin Bourrillion 036 * @since 20.0 037 */ 038@J2ktIncompatible 039@GwtIncompatible 040@ElementTypesAreNonnullByDefault 041public final class StatsAccumulator { 042 043 // These fields must satisfy the requirements of Stats' constructor as well as those of the stat 044 // methods of this class. 045 private long count = 0; 046 private double mean = 0.0; // any finite value will do, we only use it to multiply by zero for sum 047 private double sumOfSquaresOfDeltas = 0.0; 048 private double min = NaN; // any value will do 049 private double max = NaN; // any value will do 050 051 /** Adds the given value to the dataset. */ 052 public void add(double value) { 053 if (count == 0) { 054 count = 1; 055 mean = value; 056 min = value; 057 max = value; 058 if (!isFinite(value)) { 059 sumOfSquaresOfDeltas = NaN; 060 } 061 } else { 062 count++; 063 if (isFinite(value) && isFinite(mean)) { 064 // Art of Computer Programming vol. 2, Knuth, 4.2.2, (15) and (16) 065 double delta = value - mean; 066 mean += delta / count; 067 sumOfSquaresOfDeltas += delta * (value - mean); 068 } else { 069 mean = calculateNewMeanNonFinite(mean, value); 070 sumOfSquaresOfDeltas = NaN; 071 } 072 min = Math.min(min, value); 073 max = Math.max(max, value); 074 } 075 } 076 077 /** 078 * Adds the given values to the dataset. 079 * 080 * @param values a series of values, which will be converted to {@code double} values (this may 081 * cause loss of precision) 082 */ 083 public void addAll(Iterable<? extends Number> values) { 084 for (Number value : values) { 085 add(value.doubleValue()); 086 } 087 } 088 089 /** 090 * Adds the given values to the dataset. 091 * 092 * @param values a series of values, which will be converted to {@code double} values (this may 093 * cause loss of precision) 094 */ 095 public void addAll(Iterator<? extends Number> values) { 096 while (values.hasNext()) { 097 add(values.next().doubleValue()); 098 } 099 } 100 101 /** 102 * Adds the given values to the dataset. 103 * 104 * @param values a series of values 105 */ 106 public void addAll(double... values) { 107 for (double value : values) { 108 add(value); 109 } 110 } 111 112 /** 113 * Adds the given values to the dataset. 114 * 115 * @param values a series of values 116 */ 117 public void addAll(int... values) { 118 for (int value : values) { 119 add(value); 120 } 121 } 122 123 /** 124 * Adds the given values to the dataset. 125 * 126 * @param values a series of values, which will be converted to {@code double} values (this may 127 * cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15)) 128 */ 129 public void addAll(long... values) { 130 for (long value : values) { 131 add(value); 132 } 133 } 134 135 /** 136 * Adds the given values to the dataset. The stream will be completely consumed by this method. 137 * 138 * @param values a series of values 139 * @since 28.2 140 */ 141 public void addAll(DoubleStream values) { 142 addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll)); 143 } 144 145 /** 146 * Adds the given values to the dataset. The stream will be completely consumed by this method. 147 * 148 * @param values a series of values 149 * @since 28.2 150 */ 151 public void addAll(IntStream values) { 152 addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll)); 153 } 154 155 /** 156 * Adds the given values to the dataset. The stream will be completely consumed by this method. 157 * 158 * @param values a series of values, which will be converted to {@code double} values (this may 159 * cause loss of precision for longs of magnitude over 2^53 (slightly over 9e15)) 160 * @since 28.2 161 */ 162 public void addAll(LongStream values) { 163 addAll(values.collect(StatsAccumulator::new, StatsAccumulator::add, StatsAccumulator::addAll)); 164 } 165 166 /** 167 * Adds the given statistics to the dataset, as if the individual values used to compute the 168 * statistics had been added directly. 169 */ 170 public void addAll(Stats values) { 171 if (values.count() == 0) { 172 return; 173 } 174 merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max()); 175 } 176 177 /** 178 * Adds the given statistics to the dataset, as if the individual values used to compute the 179 * statistics had been added directly. 180 * 181 * @since 28.2 182 */ 183 public void addAll(StatsAccumulator values) { 184 if (values.count() == 0) { 185 return; 186 } 187 merge(values.count(), values.mean(), values.sumOfSquaresOfDeltas(), values.min(), values.max()); 188 } 189 190 private void merge( 191 long otherCount, 192 double otherMean, 193 double otherSumOfSquaresOfDeltas, 194 double otherMin, 195 double otherMax) { 196 if (count == 0) { 197 count = otherCount; 198 mean = otherMean; 199 sumOfSquaresOfDeltas = otherSumOfSquaresOfDeltas; 200 min = otherMin; 201 max = otherMax; 202 } else { 203 count += otherCount; 204 if (isFinite(mean) && isFinite(otherMean)) { 205 // This is a generalized version of the calculation in add(double) above. 206 double delta = otherMean - mean; 207 mean += delta * otherCount / count; 208 sumOfSquaresOfDeltas += otherSumOfSquaresOfDeltas + delta * (otherMean - mean) * otherCount; 209 } else { 210 mean = calculateNewMeanNonFinite(mean, otherMean); 211 sumOfSquaresOfDeltas = NaN; 212 } 213 min = Math.min(min, otherMin); 214 max = Math.max(max, otherMax); 215 } 216 } 217 218 /** Returns an immutable snapshot of the current statistics. */ 219 public Stats snapshot() { 220 return new Stats(count, mean, sumOfSquaresOfDeltas, min, max); 221 } 222 223 /** Returns the number of values. */ 224 public long count() { 225 return count; 226 } 227 228 /** 229 * Returns the <a href="http://en.wikipedia.org/wiki/Arithmetic_mean">arithmetic mean</a> of the 230 * values. The count must be non-zero. 231 * 232 * <p>If these values are a sample drawn from a population, this is also an unbiased estimator of 233 * the arithmetic mean of the population. 234 * 235 * <h3>Non-finite values</h3> 236 * 237 * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it 238 * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the 239 * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values 240 * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}. 241 * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link 242 * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}. 243 * 244 * @throws IllegalStateException if the dataset is empty 245 */ 246 public double mean() { 247 checkState(count != 0); 248 return mean; 249 } 250 251 /** 252 * Returns the sum of the values. 253 * 254 * <h3>Non-finite values</h3> 255 * 256 * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it 257 * contains both {@link Double#POSITIVE_INFINITY} and {@link Double#NEGATIVE_INFINITY} then the 258 * result is {@link Double#NaN}. If it contains {@link Double#POSITIVE_INFINITY} and finite values 259 * only or {@link Double#POSITIVE_INFINITY} only, the result is {@link Double#POSITIVE_INFINITY}. 260 * If it contains {@link Double#NEGATIVE_INFINITY} and finite values only or {@link 261 * Double#NEGATIVE_INFINITY} only, the result is {@link Double#NEGATIVE_INFINITY}. 262 */ 263 public final double sum() { 264 return mean * count; 265 } 266 267 /** 268 * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Population_variance">population 269 * variance</a> of the values. The count must be non-zero. 270 * 271 * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It 272 * is not guaranteed to return zero when the dataset consists of the same value multiple times, 273 * due to numerical errors. However, it is guaranteed never to return a negative result. 274 * 275 * <h3>Non-finite values</h3> 276 * 277 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 278 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 279 * 280 * @throws IllegalStateException if the dataset is empty 281 */ 282 public final double populationVariance() { 283 checkState(count != 0); 284 if (isNaN(sumOfSquaresOfDeltas)) { 285 return NaN; 286 } 287 if (count == 1) { 288 return 0.0; 289 } 290 return ensureNonNegative(sumOfSquaresOfDeltas) / count; 291 } 292 293 /** 294 * Returns the <a 295 * href="http://en.wikipedia.org/wiki/Standard_deviation#Definition_of_population_values"> 296 * population standard deviation</a> of the values. The count must be non-zero. 297 * 298 * <p>This is guaranteed to return zero if the dataset contains only exactly one finite value. It 299 * is not guaranteed to return zero when the dataset consists of the same value multiple times, 300 * due to numerical errors. However, it is guaranteed never to return a negative result. 301 * 302 * <h3>Non-finite values</h3> 303 * 304 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 305 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 306 * 307 * @throws IllegalStateException if the dataset is empty 308 */ 309 public final double populationStandardDeviation() { 310 return Math.sqrt(populationVariance()); 311 } 312 313 /** 314 * Returns the <a href="http://en.wikipedia.org/wiki/Variance#Sample_variance">unbiased sample 315 * variance</a> of the values. If this dataset is a sample drawn from a population, this is an 316 * unbiased estimator of the population variance of the population. The count must be greater than 317 * one. 318 * 319 * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple 320 * times, due to numerical errors. However, it is guaranteed never to return a negative result. 321 * 322 * <h3>Non-finite values</h3> 323 * 324 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 325 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 326 * 327 * @throws IllegalStateException if the dataset is empty or contains a single value 328 */ 329 public final double sampleVariance() { 330 checkState(count > 1); 331 if (isNaN(sumOfSquaresOfDeltas)) { 332 return NaN; 333 } 334 return ensureNonNegative(sumOfSquaresOfDeltas) / (count - 1); 335 } 336 337 /** 338 * Returns the <a 339 * href="http://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation"> 340 * corrected sample standard deviation</a> of the values. If this dataset is a sample drawn from a 341 * population, this is an estimator of the population standard deviation of the population which 342 * is less biased than {@link #populationStandardDeviation()} (the unbiased estimator depends on 343 * the distribution). The count must be greater than one. 344 * 345 * <p>This is not guaranteed to return zero when the dataset consists of the same value multiple 346 * times, due to numerical errors. However, it is guaranteed never to return a negative result. 347 * 348 * <h3>Non-finite values</h3> 349 * 350 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 351 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 352 * 353 * @throws IllegalStateException if the dataset is empty or contains a single value 354 */ 355 public final double sampleStandardDeviation() { 356 return Math.sqrt(sampleVariance()); 357 } 358 359 /** 360 * Returns the lowest value in the dataset. The count must be non-zero. 361 * 362 * <h3>Non-finite values</h3> 363 * 364 * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it 365 * contains {@link Double#NEGATIVE_INFINITY} and not {@link Double#NaN} then the result is {@link 366 * Double#NEGATIVE_INFINITY}. If it contains {@link Double#POSITIVE_INFINITY} and finite values 367 * only then the result is the lowest finite value. If it contains {@link 368 * Double#POSITIVE_INFINITY} only then the result is {@link Double#POSITIVE_INFINITY}. 369 * 370 * @throws IllegalStateException if the dataset is empty 371 */ 372 public double min() { 373 checkState(count != 0); 374 return min; 375 } 376 377 /** 378 * Returns the highest value in the dataset. The count must be non-zero. 379 * 380 * <h3>Non-finite values</h3> 381 * 382 * <p>If the dataset contains {@link Double#NaN} then the result is {@link Double#NaN}. If it 383 * contains {@link Double#POSITIVE_INFINITY} and not {@link Double#NaN} then the result is {@link 384 * Double#POSITIVE_INFINITY}. If it contains {@link Double#NEGATIVE_INFINITY} and finite values 385 * only then the result is the highest finite value. If it contains {@link 386 * Double#NEGATIVE_INFINITY} only then the result is {@link Double#NEGATIVE_INFINITY}. 387 * 388 * @throws IllegalStateException if the dataset is empty 389 */ 390 public double max() { 391 checkState(count != 0); 392 return max; 393 } 394 395 double sumOfSquaresOfDeltas() { 396 return sumOfSquaresOfDeltas; 397 } 398 399 /** 400 * Calculates the new value for the accumulated mean when a value is added, in the case where at 401 * least one of the previous mean and the value is non-finite. 402 */ 403 static double calculateNewMeanNonFinite(double previousMean, double value) { 404 /* 405 * Desired behaviour is to match the results of applying the naive mean formula. In particular, 406 * the update formula can subtract infinities in cases where the naive formula would add them. 407 * 408 * Consequently: 409 * 1. If the previous mean is finite and the new value is non-finite then the new mean is that 410 * value (whether it is NaN or infinity). 411 * 2. If the new value is finite and the previous mean is non-finite then the mean is unchanged 412 * (whether it is NaN or infinity). 413 * 3. If both the previous mean and the new value are non-finite and... 414 * 3a. ...either or both is NaN (so mean != value) then the new mean is NaN. 415 * 3b. ...they are both the same infinities (so mean == value) then the mean is unchanged. 416 * 3c. ...they are different infinities (so mean != value) then the new mean is NaN. 417 */ 418 if (isFinite(previousMean)) { 419 // This is case 1. 420 return value; 421 } else if (isFinite(value) || previousMean == value) { 422 // This is case 2. or 3b. 423 return previousMean; 424 } else { 425 // This is case 3a. or 3c. 426 return NaN; 427 } 428 } 429}