Source code

001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkState;
020import static java.lang.Double.NaN;
021import static java.lang.Double.doubleToLongBits;
022import static java.lang.Double.isNaN;
023
024import com.google.common.annotations.Beta;
025import com.google.common.annotations.GwtIncompatible;
026import com.google.common.base.MoreObjects;
027import com.google.common.base.Objects;
028import java.io.Serializable;
029import java.nio.ByteBuffer;
030import java.nio.ByteOrder;
031import javax.annotation.Nullable;
032
033/**
034 * An immutable value object capturing some basic statistics about a collection of paired double
035 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}.
036 *
037 * @author Pete Gillin
038 * @since 20.0
039 */
040@Beta
041@GwtIncompatible
042public final class PairedStats implements Serializable {
043
044  private final Stats xStats;
045  private final Stats yStats;
046  private final double sumOfProductsOfDeltas;
047
048  /**
049   * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}.
050   *
051   * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the
052   * following constraints. This is the callers responsibility and is not enforced here.
053   *
054   * <ul>
055   *   <li>Both {@code xStats} and {@code yStats} must have the same {@code count}.
056   *   <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0.
057   *   <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite.
058   * </ul>
059   */
060  PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) {
061    this.xStats = xStats;
062    this.yStats = yStats;
063    this.sumOfProductsOfDeltas = sumOfProductsOfDeltas;
064  }
065
066  /**
067   * Returns the number of pairs in the dataset.
068   */
069  public long count() {
070    return xStats.count();
071  }
072
073  /**
074   * Returns the statistics on the {@code x} values alone.
075   */
076  public Stats xStats() {
077    return xStats;
078  }
079
080  /**
081   * Returns the statistics on the {@code y} values alone.
082   */
083  public Stats yStats() {
084    return yStats;
085  }
086
087  /**
088   * Returns the population covariance of the values. The count must be non-zero.
089   *
090   * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
091   * is not guaranteed to return zero when the dataset consists of the same pair of values multiple
092   * times, due to numerical errors.
093   *
094   * <h3>Non-finite values</h3>
095   *
096   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
097   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
098   *
099   * @throws IllegalStateException if the dataset is empty
100   */
101  public double populationCovariance() {
102    checkState(count() != 0);
103    return sumOfProductsOfDeltas / count();
104  }
105
106  /**
107   * Returns the sample covariance of the values. The count must be greater than one.
108   *
109   * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
110   * multiple times, due to numerical errors.
111   *
112   * <h3>Non-finite values</h3>
113   *
114   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
115   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
116   *
117   * @throws IllegalStateException if the dataset is empty or contains a single pair of values
118   */
119  public double sampleCovariance() {
120    checkState(count() > 1);
121    return sumOfProductsOfDeltas / (count() - 1);
122  }
123
124  /**
125   * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
126   * product-moment correlation coefficient</a> of the values. The count must greater than one, and
127   * the {@code x} and {@code y} values must both have non-zero population variance (i.e.
128   * {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result
129   * is not guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
130   * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
131   *
132   * <h3>Non-finite values</h3>
133   *
134   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
135   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
136   *
137   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
138   *     either the {@code x} and {@code y} dataset has zero population variance
139   */
140  public double pearsonsCorrelationCoefficient() {
141    checkState(count() > 1);
142    if (isNaN(sumOfProductsOfDeltas)) {
143      return NaN;
144    }
145    double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas();
146    double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas();
147    checkState(xSumOfSquaresOfDeltas > 0.0);
148    checkState(ySumOfSquaresOfDeltas > 0.0);
149    // The product of two positive numbers can be zero if the multiplication underflowed. We
150    // force a positive value by effectively rounding up to MIN_VALUE.
151    double productOfSumsOfSquaresOfDeltas =
152        ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
153    return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
154  }
155
156  /**
157   * Returns a linear transformation giving the best fit to the data according to
158   * <a href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
159   * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
160   * either the {@code x} or {@code y} data must have a non-zero population variance (i.e.
161   * {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result
162   * is guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
163   * data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
164   *
165   * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
166   * error is defined as the square root of the mean of the squares of the differences between the
167   * actual {@code y} values of the data and the values predicted by the fit for the {@code x}
168   * values (i.e. it is the square root of the mean of the squares of the vertical distances between
169   * the data points and the best fit line). For this fit, this error is a fraction
170   * {@code sqrt(1 - R*R)} of the population standard deviation of {@code y}, where {@code R} is the
171   * Pearson's correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
172   *
173   * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
174   * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
175   * does not normally minimize that error: to do that, you should swap the roles of {@code x} and
176   * {@code y}.
177   *
178   * <h3>Non-finite values</h3>
179   *
180   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
181   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is
182   * {@link LinearTransformation#forNaN()}.
183   *
184   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
185   *     both the {@code x} and {@code y} dataset must have zero population variance
186   */
187  public LinearTransformation leastSquaresFit() {
188    checkState(count() > 1);
189    if (isNaN(sumOfProductsOfDeltas)) {
190      return LinearTransformation.forNaN();
191    }
192    double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
193    if (xSumOfSquaresOfDeltas > 0.0) {
194      if (yStats.sumOfSquaresOfDeltas() > 0.0) {
195        return LinearTransformation.mapping(xStats.mean(), yStats.mean())
196            .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
197      } else {
198        return LinearTransformation.horizontal(yStats.mean());
199      }
200    } else {
201      checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
202      return LinearTransformation.vertical(xStats.mean());
203    }
204  }
205
206  /**
207   * {@inheritDoc}
208   *
209   * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating
210   * point values. Two instances are guaranteed to be considered equal if one is copied from the
211   * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both
212   * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without
213   * adding any values in between the two calls, or if one is obtained from the other after
214   * round-tripping through java serialization. However, floating point rounding errors mean that it
215   * may be false for some instances where the statistics are mathematically equal, including
216   * instances constructed from the same values in a different order... or (in the general case)
217   * even in the same order. (It is guaranteed to return true for instances constructed from the
218   * same values in the same order if {@code strictfp} is in effect, or if the system architecture
219   * guarantees {@code strictfp}-like semantics.)
220   */
221  @Override
222  public boolean equals(@Nullable Object obj) {
223    if (obj == null) {
224      return false;
225    }
226    if (getClass() != obj.getClass()) {
227      return false;
228    }
229    PairedStats other = (PairedStats) obj;
230    return (xStats.equals(other.xStats))
231        && (yStats.equals(other.yStats))
232        && (doubleToLongBits(sumOfProductsOfDeltas)
233            == doubleToLongBits(other.sumOfProductsOfDeltas));
234  }
235
236  /**
237   * {@inheritDoc}
238   *
239   * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics,
240   * including the floating point values. See the note on {@link #equals} for details.
241   */
242  @Override
243  public int hashCode() {
244    return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas);
245  }
246
247  @Override
248  public String toString() {
249    if (count() > 0) {
250      return MoreObjects.toStringHelper(this)
251          .add("xStats", xStats)
252          .add("yStats", yStats)
253          .add("populationCovariance", populationCovariance())
254          .toString();
255    } else {
256      return MoreObjects.toStringHelper(this)
257          .add("xStats", xStats)
258          .add("yStats", yStats)
259          .toString();
260    }
261  }
262
263  double sumOfProductsOfDeltas() {
264    return sumOfProductsOfDeltas;
265  }
266
267  private static double ensurePositive(double value) {
268    if (value > 0.0) {
269      return value;
270    } else {
271      return Double.MIN_VALUE;
272    }
273  }
274
275  private static double ensureInUnitRange(double value) {
276    if (value >= 1.0) {
277      return 1.0;
278    }
279    if (value <= -1.0) {
280      return -1.0;
281    }
282    return value;
283  }
284
285  // Serialization helpers
286
287  /**
288   * The size of byte array representation in bytes.
289   */
290  private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE;
291
292  /**
293   * Gets a byte array representation of this instance.
294   *
295   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
296   * versions.
297   */
298  public byte[] toByteArray() {
299    ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN);
300    xStats.writeTo(buffer);
301    yStats.writeTo(buffer);
302    buffer.putDouble(sumOfProductsOfDeltas);
303    return buffer.array();
304  }
305
306  /**
307   * Creates a {@link PairedStats} instance from the given byte representation which was obtained by
308   * {@link #toByteArray}.
309   *
310   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
311   * versions.
312   */
313  public static PairedStats fromByteArray(byte[] byteArray) {
314    checkNotNull(byteArray);
315    checkArgument(
316        byteArray.length == BYTES,
317        "Expected PairedStats.BYTES = %s, got %s",
318        BYTES,
319        byteArray.length);
320    ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
321    Stats xStats = Stats.readFrom(buffer);
322    Stats yStats = Stats.readFrom(buffer);
323    double sumOfProductsOfDeltas = buffer.getDouble();
324    return new PairedStats(xStats, yStats, sumOfProductsOfDeltas);
325  }
326
327  private static final long serialVersionUID = 0;
328}