001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkState; 018import static com.google.common.primitives.Doubles.isFinite; 019import static java.lang.Double.NaN; 020import static java.lang.Double.isNaN; 021 022import com.google.common.annotations.GwtIncompatible; 023import com.google.common.annotations.J2ktIncompatible; 024import com.google.common.primitives.Doubles; 025 026/** 027 * A mutable object which accumulates paired double values (e.g. points on a plane) and tracks some 028 * basic statistics over all the values added so far. This class is not thread safe. 029 * 030 * @author Pete Gillin 031 * @since 20.0 032 */ 033@J2ktIncompatible 034@GwtIncompatible 035public final class PairedStatsAccumulator { 036 /** Creates a new accumulator. */ 037 public PairedStatsAccumulator() {} 038 039 // These fields must satisfy the requirements of PairedStats' constructor as well as those of the 040 // stat methods of this class. 041 private final StatsAccumulator xStats = new StatsAccumulator(); 042 private final StatsAccumulator yStats = new StatsAccumulator(); 043 private double sumOfProductsOfDeltas = 0.0; 044 045 /** Adds the given pair of values to the dataset. */ 046 public void add(double x, double y) { 047 // We extend the recursive expression for the one-variable case at Art of Computer Programming 048 // vol. 2, Knuth, 4.2.2, (16) to the two-variable case. We have two value series x_i and y_i. 049 // We define the arithmetic means X_n = 1/n \sum_{i=1}^n x_i, and Y_n = 1/n \sum_{i=1}^n y_i. 050 // We also define the sum of the products of the differences from the means 051 // C_n = \sum_{i=1}^n x_i y_i - n X_n Y_n 052 // for all n >= 1. Then for all n > 1: 053 // C_{n-1} = \sum_{i=1}^{n-1} x_i y_i - (n-1) X_{n-1} Y_{n-1} 054 // C_n - C_{n-1} = x_n y_n - n X_n Y_n + (n-1) X_{n-1} Y_{n-1} 055 // = x_n y_n - X_n [ y_n + (n-1) Y_{n-1} ] + [ n X_n - x_n ] Y_{n-1} 056 // = x_n y_n - X_n y_n - x_n Y_{n-1} + X_n Y_{n-1} 057 // = (x_n - X_n) (y_n - Y_{n-1}) 058 xStats.add(x); 059 if (isFinite(x) && isFinite(y)) { 060 if (xStats.count() > 1) { 061 sumOfProductsOfDeltas += (x - xStats.mean()) * (y - yStats.mean()); 062 } 063 } else { 064 sumOfProductsOfDeltas = NaN; 065 } 066 yStats.add(y); 067 } 068 069 /** 070 * Adds the given statistics to the dataset, as if the individual values used to compute the 071 * statistics had been added directly. 072 */ 073 public void addAll(PairedStats values) { 074 if (values.count() == 0) { 075 return; 076 } 077 078 xStats.addAll(values.xStats()); 079 if (yStats.count() == 0) { 080 sumOfProductsOfDeltas = values.sumOfProductsOfDeltas(); 081 } else { 082 // This is a generalized version of the calculation in add(double, double) above. Note that 083 // non-finite inputs will have sumOfProductsOfDeltas = NaN, so non-finite values will result 084 // in NaN naturally. 085 sumOfProductsOfDeltas += 086 values.sumOfProductsOfDeltas() 087 + (values.xStats().mean() - xStats.mean()) 088 * (values.yStats().mean() - yStats.mean()) 089 * values.count(); 090 } 091 yStats.addAll(values.yStats()); 092 } 093 094 /** Returns an immutable snapshot of the current statistics. */ 095 public PairedStats snapshot() { 096 return new PairedStats(xStats.snapshot(), yStats.snapshot(), sumOfProductsOfDeltas); 097 } 098 099 /** Returns the number of pairs in the dataset. */ 100 public long count() { 101 return xStats.count(); 102 } 103 104 /** Returns an immutable snapshot of the statistics on the {@code x} values alone. */ 105 public Stats xStats() { 106 return xStats.snapshot(); 107 } 108 109 /** Returns an immutable snapshot of the statistics on the {@code y} values alone. */ 110 public Stats yStats() { 111 return yStats.snapshot(); 112 } 113 114 /** 115 * Returns the population covariance of the values. The count must be non-zero. 116 * 117 * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It 118 * is not guaranteed to return zero when the dataset consists of the same pair of values multiple 119 * times, due to numerical errors. 120 * 121 * <h3>Non-finite values</h3> 122 * 123 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 124 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 125 * 126 * @throws IllegalStateException if the dataset is empty 127 */ 128 public double populationCovariance() { 129 checkState(count() != 0); 130 return sumOfProductsOfDeltas / count(); 131 } 132 133 /** 134 * Returns the sample covariance of the values. The count must be greater than one. 135 * 136 * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values 137 * multiple times, due to numerical errors. 138 * 139 * <h3>Non-finite values</h3> 140 * 141 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 142 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 143 * 144 * @throws IllegalStateException if the dataset is empty or contains a single pair of values 145 */ 146 public final double sampleCovariance() { 147 checkState(count() > 1); 148 return sumOfProductsOfDeltas / (count() - 1); 149 } 150 151 /** 152 * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or 153 * product-moment correlation coefficient</a> of the values. The count must greater than one, and 154 * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code 155 * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not 156 * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to 157 * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. 158 * 159 * <h3>Non-finite values</h3> 160 * 161 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 162 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 163 * 164 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 165 * either the {@code x} and {@code y} dataset has zero population variance 166 */ 167 public final double pearsonsCorrelationCoefficient() { 168 checkState(count() > 1); 169 if (isNaN(sumOfProductsOfDeltas)) { 170 return NaN; 171 } 172 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 173 double ySumOfSquaresOfDeltas = yStats.sumOfSquaresOfDeltas(); 174 checkState(xSumOfSquaresOfDeltas > 0.0); 175 checkState(ySumOfSquaresOfDeltas > 0.0); 176 // The product of two positive numbers can be zero if the multiplication underflowed. We 177 // force a positive value by effectively rounding up to MIN_VALUE. 178 double productOfSumsOfSquaresOfDeltas = 179 ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); 180 return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); 181 } 182 183 /** 184 * Returns a linear transformation giving the best fit to the data according to <a 185 * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear 186 * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and 187 * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code 188 * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is 189 * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} 190 * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. 191 * 192 * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This 193 * error is defined as the square root of the mean of the squares of the differences between the 194 * actual {@code y} values of the data and the values predicted by the fit for the {@code x} 195 * values (i.e. it is the square root of the mean of the squares of the vertical distances between 196 * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 - 197 * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's 198 * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). 199 * 200 * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a 201 * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit 202 * does not normally minimize that error: to do that, you should swap the roles of {@code x} and 203 * {@code y}. 204 * 205 * <h3>Non-finite values</h3> 206 * 207 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 208 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link 209 * LinearTransformation#forNaN()}. 210 * 211 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 212 * both the {@code x} and {@code y} dataset have zero population variance 213 */ 214 public final LinearTransformation leastSquaresFit() { 215 checkState(count() > 1); 216 if (isNaN(sumOfProductsOfDeltas)) { 217 return LinearTransformation.forNaN(); 218 } 219 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 220 if (xSumOfSquaresOfDeltas > 0.0) { 221 if (yStats.sumOfSquaresOfDeltas() > 0.0) { 222 return LinearTransformation.mapping(xStats.mean(), yStats.mean()) 223 .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); 224 } else { 225 return LinearTransformation.horizontal(yStats.mean()); 226 } 227 } else { 228 checkState(yStats.sumOfSquaresOfDeltas() > 0.0); 229 return LinearTransformation.vertical(xStats.mean()); 230 } 231 } 232 233 private double ensurePositive(double value) { 234 if (value > 0.0) { 235 return value; 236 } else { 237 return Double.MIN_VALUE; 238 } 239 } 240 241 private static double ensureInUnitRange(double value) { 242 return Doubles.constrainToRange(value, -1.0, 1.0); 243 } 244}