001/* 002 * Copyright (C) 2012 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.math; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkState; 020import static java.lang.Double.NaN; 021import static java.lang.Double.doubleToLongBits; 022import static java.lang.Double.isNaN; 023 024import com.google.common.annotations.Beta; 025import com.google.common.annotations.GwtIncompatible; 026import com.google.common.annotations.J2ktIncompatible; 027import com.google.common.base.MoreObjects; 028import com.google.common.base.Objects; 029import java.io.Serializable; 030import java.nio.ByteBuffer; 031import java.nio.ByteOrder; 032import javax.annotation.CheckForNull; 033 034/** 035 * An immutable value object capturing some basic statistics about a collection of paired double 036 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}. 037 * 038 * @author Pete Gillin 039 * @since 20.0 040 */ 041@Beta 042@J2ktIncompatible 043@GwtIncompatible 044@ElementTypesAreNonnullByDefault 045public final class PairedStats implements Serializable { 046 047 private final Stats xStats; 048 private final Stats yStats; 049 private final double sumOfProductsOfDeltas; 050 051 /** 052 * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}. 053 * 054 * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the 055 * following constraints. This is the callers responsibility and is not enforced here. 056 * 057 * <ul> 058 * <li>Both {@code xStats} and {@code yStats} must have the same {@code count}. 059 * <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0. 060 * <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite. 061 * </ul> 062 */ 063 PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) { 064 this.xStats = xStats; 065 this.yStats = yStats; 066 this.sumOfProductsOfDeltas = sumOfProductsOfDeltas; 067 } 068 069 /** Returns the number of pairs in the dataset. */ 070 public long count() { 071 return xStats.count(); 072 } 073 074 /** Returns the statistics on the {@code x} values alone. */ 075 public Stats xStats() { 076 return xStats; 077 } 078 079 /** Returns the statistics on the {@code y} values alone. */ 080 public Stats yStats() { 081 return yStats; 082 } 083 084 /** 085 * Returns the population covariance of the values. The count must be non-zero. 086 * 087 * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It 088 * is not guaranteed to return zero when the dataset consists of the same pair of values multiple 089 * times, due to numerical errors. 090 * 091 * <h3>Non-finite values</h3> 092 * 093 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 094 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 095 * 096 * @throws IllegalStateException if the dataset is empty 097 */ 098 public double populationCovariance() { 099 checkState(count() != 0); 100 return sumOfProductsOfDeltas / count(); 101 } 102 103 /** 104 * Returns the sample covariance of the values. The count must be greater than one. 105 * 106 * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values 107 * multiple times, due to numerical errors. 108 * 109 * <h3>Non-finite values</h3> 110 * 111 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 112 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 113 * 114 * @throws IllegalStateException if the dataset is empty or contains a single pair of values 115 */ 116 public double sampleCovariance() { 117 checkState(count() > 1); 118 return sumOfProductsOfDeltas / (count() - 1); 119 } 120 121 /** 122 * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or 123 * product-moment correlation coefficient</a> of the values. The count must greater than one, and 124 * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code 125 * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not 126 * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to 127 * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. 128 * 129 * <h3>Non-finite values</h3> 130 * 131 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 132 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. 133 * 134 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 135 * either the {@code x} and {@code y} dataset has zero population variance 136 */ 137 public double pearsonsCorrelationCoefficient() { 138 checkState(count() > 1); 139 if (isNaN(sumOfProductsOfDeltas)) { 140 return NaN; 141 } 142 double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas(); 143 double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas(); 144 checkState(xSumOfSquaresOfDeltas > 0.0); 145 checkState(ySumOfSquaresOfDeltas > 0.0); 146 // The product of two positive numbers can be zero if the multiplication underflowed. We 147 // force a positive value by effectively rounding up to MIN_VALUE. 148 double productOfSumsOfSquaresOfDeltas = 149 ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); 150 return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); 151 } 152 153 /** 154 * Returns a linear transformation giving the best fit to the data according to <a 155 * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear 156 * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and 157 * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code 158 * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is 159 * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} 160 * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. 161 * 162 * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This 163 * error is defined as the square root of the mean of the squares of the differences between the 164 * actual {@code y} values of the data and the values predicted by the fit for the {@code x} 165 * values (i.e. it is the square root of the mean of the squares of the vertical distances between 166 * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 - 167 * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's 168 * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). 169 * 170 * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a 171 * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit 172 * does not normally minimize that error: to do that, you should swap the roles of {@code x} and 173 * {@code y}. 174 * 175 * <h3>Non-finite values</h3> 176 * 177 * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link 178 * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link 179 * LinearTransformation#forNaN()}. 180 * 181 * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or 182 * both the {@code x} and {@code y} dataset must have zero population variance 183 */ 184 public LinearTransformation leastSquaresFit() { 185 checkState(count() > 1); 186 if (isNaN(sumOfProductsOfDeltas)) { 187 return LinearTransformation.forNaN(); 188 } 189 double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); 190 if (xSumOfSquaresOfDeltas > 0.0) { 191 if (yStats.sumOfSquaresOfDeltas() > 0.0) { 192 return LinearTransformation.mapping(xStats.mean(), yStats.mean()) 193 .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); 194 } else { 195 return LinearTransformation.horizontal(yStats.mean()); 196 } 197 } else { 198 checkState(yStats.sumOfSquaresOfDeltas() > 0.0); 199 return LinearTransformation.vertical(xStats.mean()); 200 } 201 } 202 203 /** 204 * {@inheritDoc} 205 * 206 * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating 207 * point values. Two instances are guaranteed to be considered equal if one is copied from the 208 * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both 209 * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without 210 * adding any values in between the two calls, or if one is obtained from the other after 211 * round-tripping through java serialization. However, floating point rounding errors mean that it 212 * may be false for some instances where the statistics are mathematically equal, including 213 * instances constructed from the same values in a different order... or (in the general case) 214 * even in the same order. (It is guaranteed to return true for instances constructed from the 215 * same values in the same order if {@code strictfp} is in effect, or if the system architecture 216 * guarantees {@code strictfp}-like semantics.) 217 */ 218 @Override 219 public boolean equals(@CheckForNull Object obj) { 220 if (obj == null) { 221 return false; 222 } 223 if (getClass() != obj.getClass()) { 224 return false; 225 } 226 PairedStats other = (PairedStats) obj; 227 return xStats.equals(other.xStats) 228 && yStats.equals(other.yStats) 229 && doubleToLongBits(sumOfProductsOfDeltas) == doubleToLongBits(other.sumOfProductsOfDeltas); 230 } 231 232 /** 233 * {@inheritDoc} 234 * 235 * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics, 236 * including the floating point values. See the note on {@link #equals} for details. 237 */ 238 @Override 239 public int hashCode() { 240 return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas); 241 } 242 243 @Override 244 public String toString() { 245 if (count() > 0) { 246 return MoreObjects.toStringHelper(this) 247 .add("xStats", xStats) 248 .add("yStats", yStats) 249 .add("populationCovariance", populationCovariance()) 250 .toString(); 251 } else { 252 return MoreObjects.toStringHelper(this) 253 .add("xStats", xStats) 254 .add("yStats", yStats) 255 .toString(); 256 } 257 } 258 259 double sumOfProductsOfDeltas() { 260 return sumOfProductsOfDeltas; 261 } 262 263 private static double ensurePositive(double value) { 264 if (value > 0.0) { 265 return value; 266 } else { 267 return Double.MIN_VALUE; 268 } 269 } 270 271 private static double ensureInUnitRange(double value) { 272 if (value >= 1.0) { 273 return 1.0; 274 } 275 if (value <= -1.0) { 276 return -1.0; 277 } 278 return value; 279 } 280 281 // Serialization helpers 282 283 /** The size of byte array representation in bytes. */ 284 private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE; 285 286 /** 287 * Gets a byte array representation of this instance. 288 * 289 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 290 * versions. 291 */ 292 public byte[] toByteArray() { 293 ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN); 294 xStats.writeTo(buffer); 295 yStats.writeTo(buffer); 296 buffer.putDouble(sumOfProductsOfDeltas); 297 return buffer.array(); 298 } 299 300 /** 301 * Creates a {@link PairedStats} instance from the given byte representation which was obtained by 302 * {@link #toByteArray}. 303 * 304 * <p><b>Note:</b> No guarantees are made regarding stability of the representation between 305 * versions. 306 */ 307 public static PairedStats fromByteArray(byte[] byteArray) { 308 checkNotNull(byteArray); 309 checkArgument( 310 byteArray.length == BYTES, 311 "Expected PairedStats.BYTES = %s, got %s", 312 BYTES, 313 byteArray.length); 314 ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN); 315 Stats xStats = Stats.readFrom(buffer); 316 Stats yStats = Stats.readFrom(buffer); 317 double sumOfProductsOfDeltas = buffer.getDouble(); 318 return new PairedStats(xStats, yStats, sumOfProductsOfDeltas); 319 } 320 321 private static final long serialVersionUID = 0; 322}