001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkState; 020import static java.lang.Double.NaN; 021import static java.lang.Double.doubleToLongBits; 022import static java.lang.Double.isNaN; 023 024import com.google.common.annotations.Beta; 025import com.google.common.annotations.GwtIncompatible; 026import com.google.common.base.MoreObjects; 027import com.google.common.base.Objects; 028import java.io.Serializable; 029import java.nio.ByteBuffer; 030import java.nio.ByteOrder; 031import javax.annotation.Nullable; 032 033/** 034 * An immutable value object capturing some basic statistics about a collection of paired double 035 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}. 036 * 037 * @author Pete Gillin 038 * @since 20.0 039 */ 040@Beta 041@GwtIncompatible 042public final class PairedStats implements Serializable { 043 044 private final Stats xStats; 045 private final Stats yStats; 046 private final double sumOfProductsOfDeltas; 047 048 /** 049 * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}. 050 * 051 * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the 052 * following constraints. This is the callers responsibility and is not enforced here. 053 * <ul> 054 * <li>Both {@code xStats} and {@code yStats} must have the same {@count}. 055 * <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0. 056 * <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite. 057 * </ul> 058 */ 059 PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) { 060 this.xStats = xStats; 061 this.yStats = yStats; 062 this.sumOfProductsOfDeltas = sumOfProductsOfDeltas; 063 } 064 065 /** 066 * Returns the number of pairs in the dataset. 067 */ 068 public long count() { 069 return xStats.count(); 070 } 071 072 /** 073 * Returns the statistics on the {@code x} values alone. 074 */ 075 public Stats xStats() { 076 return xStats; 077 } 078 079 /** 080 * Returns the statistics on the {@code y} values alone. 081 */ 082 public Stats yStats() { 083 return yStats; 084 } 085 086 /** 087 * Returns the population covariance of the values. The count must be non-zero. 088 * 089 * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It 090 * is not guaranteed to return zero when the dataset consists of the same pair of values multiple 091 * times, due to numerical errors. 092 * 093 * <h3>Non-finite values</h3> 094 * 095 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 096 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 097 * 098 * @throws IllegalStateException if the dataset is empty 099 */ 100 public double populationCovariance() { 101 checkState(count() != 0); 102 return sumOfProductsOfDeltas / count(); 103 } 104 105 /** 106 * Returns the sample covariance of the values. The count must be greater than one. 107 * 108 * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values 109 * multiple times, due to numerical errors. 110 * 111 * <h3>Non-finite values</h3> 112 * 113 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 114 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 115 * 116 * @throws IllegalStateException if the dataset is empty or contains a single pair of values 117 */ 118 public double sampleCovariance() { 119 checkState(count() > 1); 120 return sumOfProductsOfDeltas / (count() - 1); 121 } 122 123 /** 124 * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or 125 * product-moment correlation coefficient</a> of the values. The count must greater than one, and 126 * the {@code x} and {@code y} values must both have non-zero population variance (i.e. 127 * {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result 128 * is not guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to 129 * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. 130 * 131 * <h3>Non-finite values</h3> 132 * 133 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 134 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 135 * 136 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 137 * either the {@code x} and {@code y} dataset has zero population variance 138 */ 139 public double pearsonsCorrelationCoefficient() { 140 checkState(count() > 1); 141 if (isNaN(sumOfProductsOfDeltas)) { 142 return NaN; 143 } 144 double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas(); 145 double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas(); 146 checkState(xSumOfSquaresOfDeltas > 0.0); 147 checkState(ySumOfSquaresOfDeltas > 0.0); 148 // The product of two positive numbers can be zero if the multiplication underflowed. We 149 // force a positive value by effectively rounding up to MIN_VALUE. 150 double productOfSumsOfSquaresOfDeltas = 151 ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); 152 return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); 153 } 154 155 /** 156 * Returns a linear transformation giving the best fit to the data according to 157 * <a href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear 158 * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and 159 * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. 160 * {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result 161 * is guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} 162 * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. 163 * 164 * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This 165 * error is defined as the square root of the mean of the squares of the differences between the 166 * actual {@code y} values of the data and the values predicted by the fit for the {@code x} 167 * values (i.e. it is the square root of the mean of the squares of the vertical distances between 168 * the data points and the best fit line). For this fit, this error is a fraction 169 * {@code sqrt(1 - R*R)} of the population standard deviation of {@code y}, where {@code R} is the 170 * Pearson's correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). 171 * 172 * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a 173 * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit 174 * does not normally minimize that error: to do that, you should swap the roles of {@code x} and 175 * {@code y}. 176 * 177 * <h3>Non-finite values</h3> 178 * 179 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 180 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is 181 * {@link LinearTransformation#forNaN()}. 182 * 183 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 184 * both the {@code x} and {@code y} dataset must have zero population variance 185 */ 186 public LinearTransformation leastSquaresFit() { 187 checkState(count() > 1); 188 if (isNaN(sumOfProductsOfDeltas)) { 189 return LinearTransformation.forNaN(); 190 } 191 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 192 if (xSumOfSquaresOfDeltas > 0.0) { 193 if (yStats.sumOfSquaresOfDeltas() > 0.0) { 194 return LinearTransformation.mapping(xStats.mean(), yStats.mean()) 195 .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); 196 } else { 197 return LinearTransformation.horizontal(yStats.mean()); 198 } 199 } else { 200 checkState(yStats.sumOfSquaresOfDeltas() > 0.0); 201 return LinearTransformation.vertical(xStats.mean()); 202 } 203 } 204 205 /** 206 * {@inheritDoc} 207 * 208 * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating 209 * point values. It is definitely true for instances constructed from exactly the same values in 210 * the same order. It is also true for an instance round-tripped through java serialization. 211 * However, floating point rounding errors mean that it may be false for some instances where the 212 * statistics are mathematically equal, including the same values in a different order. 213 */ 214 @Override 215 public boolean equals(@Nullable Object obj) { 216 if (obj == null) { 217 return false; 218 } 219 if (getClass() != obj.getClass()) { 220 return false; 221 } 222 PairedStats other = (PairedStats) obj; 223 return (xStats.equals(other.xStats)) 224 && (yStats.equals(other.yStats)) 225 && (doubleToLongBits(sumOfProductsOfDeltas) 226 == doubleToLongBits(other.sumOfProductsOfDeltas)); 227 } 228 229 /** 230 * {@inheritDoc} 231 * 232 * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics, 233 * including the floating point values. See the note on {@link #equals} for details. 234 */ 235 @Override 236 public int hashCode() { 237 return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas); 238 } 239 240 @Override 241 public String toString() { 242 if (count() > 0) { 243 return MoreObjects.toStringHelper(this) 244 .add("xStats", xStats) 245 .add("yStats", yStats) 246 .add("populationCovariance", populationCovariance()) 247 .toString(); 248 } else { 249 return MoreObjects.toStringHelper(this) 250 .add("xStats", xStats) 251 .add("yStats", yStats) 252 .toString(); 253 } 254 } 255 256 double sumOfProductsOfDeltas() { 257 return sumOfProductsOfDeltas; 258 } 259 260 private static double ensurePositive(double value) { 261 if (value > 0.0) { 262 return value; 263 } else { 264 return Double.MIN_VALUE; 265 } 266 } 267 268 private static double ensureInUnitRange(double value) { 269 if (value >= 1.0) { 270 return 1.0; 271 } 272 if (value <= -1.0) { 273 return -1.0; 274 } 275 return value; 276 } 277 278 // Serialization helpers 279 280 /** 281 * The size of byte array representaion in bytes. 282 */ 283 private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE; 284 285 /** 286 * Gets a byte array representation of this instance. 287 * 288 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 289 * versions. 290 */ 291 public byte[] toByteArray() { 292 ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN); 293 xStats.writeTo(buffer); 294 yStats.writeTo(buffer); 295 buffer.putDouble(sumOfProductsOfDeltas); 296 return buffer.array(); 297 } 298 299 /** 300 * Creates a {@link PairedStats} instance from the given byte representation which was obtained by 301 * {@link #toByteArray}. 302 * 303 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 304 * versions. 305 */ 306 public static PairedStats fromByteArray(byte[] byteArray) { 307 checkNotNull(byteArray); 308 checkArgument( 309 byteArray.length == BYTES, 310 "Expected PairedStats.BYTES = %s, got %s", 311 BYTES, 312 byteArray.length); 313 ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN); 314 Stats xStats = Stats.readFrom(buffer); 315 Stats yStats = Stats.readFrom(buffer); 316 double sumOfProductsOfDeltas = buffer.getDouble(); 317 return new PairedStats(xStats, yStats, sumOfProductsOfDeltas); 318 } 319 320 private static final long serialVersionUID = 0; 321}