001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkState; 018import static com.google.common.primitives.Doubles.isFinite; 019import static java.lang.Double.NaN; 020import static java.lang.Double.isNaN; 021 022import com.google.common.annotations.Beta; 023import com.google.common.annotations.GwtIncompatible; 024import com.google.common.primitives.Doubles; 025 026/** 027 * A mutable object which accumulates paired double values (e.g. points on a plane) and tracks some 028 * basic statistics over all the values added so far. This class is not thread safe. 029 * 030 * @author Pete Gillin 031 * @since 20.0 032 */ 033@Beta 034@GwtIncompatible 035public final class PairedStatsAccumulator { 036 037 // These fields must satisfy the requirements of PairedStats' constructor as well as those of the 038 // stat methods of this class. 039 private final StatsAccumulator xStats = new StatsAccumulator(); 040 private final StatsAccumulator yStats = new StatsAccumulator(); 041 private double sumOfProductsOfDeltas = 0.0; 042 043 /** Adds the given pair of values to the dataset. */ 044 public void add(double x, double y) { 045 // We extend the recursive expression for the one-variable case at Art of Computer Programming 046 // vol. 2, Knuth, 4.2.2, (16) to the two-variable case. We have two value series x_i and y_i. 047 // We define the arithmetic means X_n = 1/n \sum_{i=1}^n x_i, and Y_n = 1/n \sum_{i=1}^n y_i. 048 // We also define the sum of the products of the differences from the means 049 // C_n = \sum_{i=1}^n x_i y_i - n X_n Y_n 050 // for all n >= 1. Then for all n > 1: 051 // C_{n-1} = \sum_{i=1}^{n-1} x_i y_i - (n-1) X_{n-1} Y_{n-1} 052 // C_n - C_{n-1} = x_n y_n - n X_n Y_n + (n-1) X_{n-1} Y_{n-1} 053 // = x_n y_n - X_n [ y_n + (n-1) Y_{n-1} ] + [ n X_n - x_n ] Y_{n-1} 054 // = x_n y_n - X_n y_n - x_n Y_{n-1} + X_n Y_{n-1} 055 // = (x_n - X_n) (y_n - Y_{n-1}) 056 xStats.add(x); 057 if (isFinite(x) && isFinite(y)) { 058 if (xStats.count() > 1) { 059 sumOfProductsOfDeltas += (x - xStats.mean()) * (y - yStats.mean()); 060 } 061 } else { 062 sumOfProductsOfDeltas = NaN; 063 } 064 yStats.add(y); 065 } 066 067 /** 068 * Adds the given statistics to the dataset, as if the individual values used to compute the 069 * statistics had been added directly. 070 */ 071 public void addAll(PairedStats values) { 072 if (values.count() == 0) { 073 return; 074 } 075 076 xStats.addAll(values.xStats()); 077 if (yStats.count() == 0) { 078 sumOfProductsOfDeltas = values.sumOfProductsOfDeltas(); 079 } else { 080 // This is a generalized version of the calculation in add(double, double) above. Note that 081 // non-finite inputs will have sumOfProductsOfDeltas = NaN, so non-finite values will result 082 // in NaN naturally. 083 sumOfProductsOfDeltas += 084 values.sumOfProductsOfDeltas() 085 + (values.xStats().mean() - xStats.mean()) 086 * (values.yStats().mean() - yStats.mean()) 087 * values.count(); 088 } 089 yStats.addAll(values.yStats()); 090 } 091 092 /** Returns an immutable snapshot of the current statistics. */ 093 public PairedStats snapshot() { 094 return new PairedStats(xStats.snapshot(), yStats.snapshot(), sumOfProductsOfDeltas); 095 } 096 097 /** Returns the number of pairs in the dataset. */ 098 public long count() { 099 return xStats.count(); 100 } 101 102 /** Returns an immutable snapshot of the statistics on the {@code x} values alone. */ 103 public Stats xStats() { 104 return xStats.snapshot(); 105 } 106 107 /** Returns an immutable snapshot of the statistics on the {@code y} values alone. */ 108 public Stats yStats() { 109 return yStats.snapshot(); 110 } 111 112 /** 113 * Returns the population covariance of the values. The count must be non-zero. 114 * 115 * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It 116 * is not guaranteed to return zero when the dataset consists of the same pair of values multiple 117 * times, due to numerical errors. 118 * 119 * <h3>Non-finite values</h3> 120 * 121 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 122 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 123 * 124 * @throws IllegalStateException if the dataset is empty 125 */ 126 public double populationCovariance() { 127 checkState(count() != 0); 128 return sumOfProductsOfDeltas / count(); 129 } 130 131 /** 132 * Returns the sample covariance of the values. The count must be greater than one. 133 * 134 * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values 135 * multiple times, due to numerical errors. 136 * 137 * <h3>Non-finite values</h3> 138 * 139 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 140 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 141 * 142 * @throws IllegalStateException if the dataset is empty or contains a single pair of values 143 */ 144 public final double sampleCovariance() { 145 checkState(count() > 1); 146 return sumOfProductsOfDeltas / (count() - 1); 147 } 148 149 /** 150 * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or 151 * product-moment correlation coefficient</a> of the values. The count must greater than one, and 152 * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code 153 * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not 154 * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to 155 * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. 156 * 157 * <h3>Non-finite values</h3> 158 * 159 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 160 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 161 * 162 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 163 * either the {@code x} and {@code y} dataset has zero population variance 164 */ 165 public final double pearsonsCorrelationCoefficient() { 166 checkState(count() > 1); 167 if (isNaN(sumOfProductsOfDeltas)) { 168 return NaN; 169 } 170 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 171 double ySumOfSquaresOfDeltas = yStats.sumOfSquaresOfDeltas(); 172 checkState(xSumOfSquaresOfDeltas > 0.0); 173 checkState(ySumOfSquaresOfDeltas > 0.0); 174 // The product of two positive numbers can be zero if the multiplication underflowed. We 175 // force a positive value by effectively rounding up to MIN_VALUE. 176 double productOfSumsOfSquaresOfDeltas = 177 ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); 178 return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); 179 } 180 181 /** 182 * Returns a linear transformation giving the best fit to the data according to <a 183 * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear 184 * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and 185 * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code 186 * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is 187 * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} 188 * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. 189 * 190 * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This 191 * error is defined as the square root of the mean of the squares of the differences between the 192 * actual {@code y} values of the data and the values predicted by the fit for the {@code x} 193 * values (i.e. it is the square root of the mean of the squares of the vertical distances between 194 * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 - 195 * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's 196 * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). 197 * 198 * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a 199 * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit 200 * does not normally minimize that error: to do that, you should swap the roles of {@code x} and 201 * {@code y}. 202 * 203 * <h3>Non-finite values</h3> 204 * 205 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 206 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link 207 * LinearTransformation#forNaN()}. 208 * 209 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 210 * both the {@code x} and {@code y} dataset have zero population variance 211 */ 212 public final LinearTransformation leastSquaresFit() { 213 checkState(count() > 1); 214 if (isNaN(sumOfProductsOfDeltas)) { 215 return LinearTransformation.forNaN(); 216 } 217 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 218 if (xSumOfSquaresOfDeltas > 0.0) { 219 if (yStats.sumOfSquaresOfDeltas() > 0.0) { 220 return LinearTransformation.mapping(xStats.mean(), yStats.mean()) 221 .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); 222 } else { 223 return LinearTransformation.horizontal(yStats.mean()); 224 } 225 } else { 226 checkState(yStats.sumOfSquaresOfDeltas() > 0.0); 227 return LinearTransformation.vertical(xStats.mean()); 228 } 229 } 230 231 private double ensurePositive(double value) { 232 if (value > 0.0) { 233 return value; 234 } else { 235 return Double.MIN_VALUE; 236 } 237 } 238 239 private static double ensureInUnitRange(double value) { 240 return Doubles.constrainToRange(value, -1.0, 1.0); 241 } 242}