001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkState; 020import static java.lang.Double.NaN; 021import static java.lang.Double.doubleToLongBits; 022import static java.lang.Double.isNaN; 023 024import com.google.common.annotations.Beta; 025import com.google.common.annotations.GwtIncompatible; 026import com.google.common.base.MoreObjects; 027import com.google.common.base.Objects; 028import java.io.Serializable; 029import java.nio.ByteBuffer; 030import java.nio.ByteOrder; 031import javax.annotation.Nullable; 032 033/** 034 * An immutable value object capturing some basic statistics about a collection of paired double 035 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}. 036 * 037 * @author Pete Gillin 038 * @since 20.0 039 */ 040@Beta 041@GwtIncompatible 042public final class PairedStats implements Serializable { 043 044 private final Stats xStats; 045 private final Stats yStats; 046 private final double sumOfProductsOfDeltas; 047 048 /** 049 * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}. 050 * 051 * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the 052 * following constraints. This is the callers responsibility and is not enforced here. 053 * 054 * <ul> 055 * <li>Both {@code xStats} and {@code yStats} must have the same {@code count}. 056 * <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0. 057 * <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite. 058 * </ul> 059 */ 060 PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) { 061 this.xStats = xStats; 062 this.yStats = yStats; 063 this.sumOfProductsOfDeltas = sumOfProductsOfDeltas; 064 } 065 066 /** 067 * Returns the number of pairs in the dataset. 068 */ 069 public long count() { 070 return xStats.count(); 071 } 072 073 /** 074 * Returns the statistics on the {@code x} values alone. 075 */ 076 public Stats xStats() { 077 return xStats; 078 } 079 080 /** 081 * Returns the statistics on the {@code y} values alone. 082 */ 083 public Stats yStats() { 084 return yStats; 085 } 086 087 /** 088 * Returns the population covariance of the values. The count must be non-zero. 089 * 090 * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It 091 * is not guaranteed to return zero when the dataset consists of the same pair of values multiple 092 * times, due to numerical errors. 093 * 094 * <h3>Non-finite values</h3> 095 * 096 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 097 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 098 * 099 * @throws IllegalStateException if the dataset is empty 100 */ 101 public double populationCovariance() { 102 checkState(count() != 0); 103 return sumOfProductsOfDeltas / count(); 104 } 105 106 /** 107 * Returns the sample covariance of the values. The count must be greater than one. 108 * 109 * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values 110 * multiple times, due to numerical errors. 111 * 112 * <h3>Non-finite values</h3> 113 * 114 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 115 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 116 * 117 * @throws IllegalStateException if the dataset is empty or contains a single pair of values 118 */ 119 public double sampleCovariance() { 120 checkState(count() > 1); 121 return sumOfProductsOfDeltas / (count() - 1); 122 } 123 124 /** 125 * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or 126 * product-moment correlation coefficient</a> of the values. The count must greater than one, and 127 * the {@code x} and {@code y} values must both have non-zero population variance (i.e. 128 * {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result 129 * is not guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to 130 * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. 131 * 132 * <h3>Non-finite values</h3> 133 * 134 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 135 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 136 * 137 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 138 * either the {@code x} and {@code y} dataset has zero population variance 139 */ 140 public double pearsonsCorrelationCoefficient() { 141 checkState(count() > 1); 142 if (isNaN(sumOfProductsOfDeltas)) { 143 return NaN; 144 } 145 double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas(); 146 double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas(); 147 checkState(xSumOfSquaresOfDeltas > 0.0); 148 checkState(ySumOfSquaresOfDeltas > 0.0); 149 // The product of two positive numbers can be zero if the multiplication underflowed. We 150 // force a positive value by effectively rounding up to MIN_VALUE. 151 double productOfSumsOfSquaresOfDeltas = 152 ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); 153 return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); 154 } 155 156 /** 157 * Returns a linear transformation giving the best fit to the data according to 158 * <a href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear 159 * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and 160 * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. 161 * {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result 162 * is guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} 163 * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. 164 * 165 * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This 166 * error is defined as the square root of the mean of the squares of the differences between the 167 * actual {@code y} values of the data and the values predicted by the fit for the {@code x} 168 * values (i.e. it is the square root of the mean of the squares of the vertical distances between 169 * the data points and the best fit line). For this fit, this error is a fraction 170 * {@code sqrt(1 - R*R)} of the population standard deviation of {@code y}, where {@code R} is the 171 * Pearson's correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). 172 * 173 * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a 174 * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit 175 * does not normally minimize that error: to do that, you should swap the roles of {@code x} and 176 * {@code y}. 177 * 178 * <h3>Non-finite values</h3> 179 * 180 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, 181 * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is 182 * {@link LinearTransformation#forNaN()}. 183 * 184 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 185 * both the {@code x} and {@code y} dataset must have zero population variance 186 */ 187 public LinearTransformation leastSquaresFit() { 188 checkState(count() > 1); 189 if (isNaN(sumOfProductsOfDeltas)) { 190 return LinearTransformation.forNaN(); 191 } 192 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 193 if (xSumOfSquaresOfDeltas > 0.0) { 194 if (yStats.sumOfSquaresOfDeltas() > 0.0) { 195 return LinearTransformation.mapping(xStats.mean(), yStats.mean()) 196 .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); 197 } else { 198 return LinearTransformation.horizontal(yStats.mean()); 199 } 200 } else { 201 checkState(yStats.sumOfSquaresOfDeltas() > 0.0); 202 return LinearTransformation.vertical(xStats.mean()); 203 } 204 } 205 206 /** 207 * {@inheritDoc} 208 * 209 * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating 210 * point values. Two instances are guaranteed to be considered equal if one is copied from the 211 * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both 212 * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without 213 * adding any values in between the two calls, or if one is obtained from the other after 214 * round-tripping through java serialization. However, floating point rounding errors mean that it 215 * may be false for some instances where the statistics are mathematically equal, including 216 * instances constructed from the same values in a different order... or (in the general case) 217 * even in the same order. (It is guaranteed to return true for instances constructed from the 218 * same values in the same order if {@code strictfp} is in effect, or if the system architecture 219 * guarantees {@code strictfp}-like semantics.) 220 */ 221 @Override 222 public boolean equals(@Nullable Object obj) { 223 if (obj == null) { 224 return false; 225 } 226 if (getClass() != obj.getClass()) { 227 return false; 228 } 229 PairedStats other = (PairedStats) obj; 230 return (xStats.equals(other.xStats)) 231 && (yStats.equals(other.yStats)) 232 && (doubleToLongBits(sumOfProductsOfDeltas) 233 == doubleToLongBits(other.sumOfProductsOfDeltas)); 234 } 235 236 /** 237 * {@inheritDoc} 238 * 239 * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics, 240 * including the floating point values. See the note on {@link #equals} for details. 241 */ 242 @Override 243 public int hashCode() { 244 return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas); 245 } 246 247 @Override 248 public String toString() { 249 if (count() > 0) { 250 return MoreObjects.toStringHelper(this) 251 .add("xStats", xStats) 252 .add("yStats", yStats) 253 .add("populationCovariance", populationCovariance()) 254 .toString(); 255 } else { 256 return MoreObjects.toStringHelper(this) 257 .add("xStats", xStats) 258 .add("yStats", yStats) 259 .toString(); 260 } 261 } 262 263 double sumOfProductsOfDeltas() { 264 return sumOfProductsOfDeltas; 265 } 266 267 private static double ensurePositive(double value) { 268 if (value > 0.0) { 269 return value; 270 } else { 271 return Double.MIN_VALUE; 272 } 273 } 274 275 private static double ensureInUnitRange(double value) { 276 if (value >= 1.0) { 277 return 1.0; 278 } 279 if (value <= -1.0) { 280 return -1.0; 281 } 282 return value; 283 } 284 285 // Serialization helpers 286 287 /** 288 * The size of byte array representation in bytes. 289 */ 290 private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE; 291 292 /** 293 * Gets a byte array representation of this instance. 294 * 295 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 296 * versions. 297 */ 298 public byte[] toByteArray() { 299 ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN); 300 xStats.writeTo(buffer); 301 yStats.writeTo(buffer); 302 buffer.putDouble(sumOfProductsOfDeltas); 303 return buffer.array(); 304 } 305 306 /** 307 * Creates a {@link PairedStats} instance from the given byte representation which was obtained by 308 * {@link #toByteArray}. 309 * 310 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 311 * versions. 312 */ 313 public static PairedStats fromByteArray(byte[] byteArray) { 314 checkNotNull(byteArray); 315 checkArgument( 316 byteArray.length == BYTES, 317 "Expected PairedStats.BYTES = %s, got %s", 318 BYTES, 319 byteArray.length); 320 ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN); 321 Stats xStats = Stats.readFrom(buffer); 322 Stats yStats = Stats.readFrom(buffer); 323 double sumOfProductsOfDeltas = buffer.getDouble(); 324 return new PairedStats(xStats, yStats, sumOfProductsOfDeltas); 325 } 326 327 private static final long serialVersionUID = 0; 328}