Source code

001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkState;
020import static java.lang.Double.NaN;
021import static java.lang.Double.doubleToLongBits;
022import static java.lang.Double.isNaN;
023
024import com.google.common.annotations.Beta;
025import com.google.common.annotations.GwtIncompatible;
026import com.google.common.base.MoreObjects;
027import com.google.common.base.Objects;
028import java.io.Serializable;
029import java.nio.ByteBuffer;
030import java.nio.ByteOrder;
031import javax.annotation.Nullable;
032
033/**
034 * An immutable value object capturing some basic statistics about a collection of paired double
035 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}.
036 *
037 * @author Pete Gillin
038 * @since 20.0
039 */
040@Beta
041@GwtIncompatible
042public final class PairedStats implements Serializable {
043
044  private final Stats xStats;
045  private final Stats yStats;
046  private final double sumOfProductsOfDeltas;
047
048  /**
049   * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}.
050   *
051   * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the
052   * following constraints. This is the callers responsibility and is not enforced here.
053   * <ul>
054   * <li>Both {@code xStats} and {@code yStats} must have the same {@count}.
055   * <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0.
056   * <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite.
057   * </ul>
058   */
059  PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) {
060    this.xStats = xStats;
061    this.yStats = yStats;
062    this.sumOfProductsOfDeltas = sumOfProductsOfDeltas;
063  }
064
065  /**
066   * Returns the number of pairs in the dataset.
067   */
068  public long count() {
069    return xStats.count();
070  }
071
072  /**
073   * Returns the statistics on the {@code x} values alone.
074   */
075  public Stats xStats() {
076    return xStats;
077  }
078
079  /**
080   * Returns the statistics on the {@code y} values alone.
081   */
082  public Stats yStats() {
083    return yStats;
084  }
085
086  /**
087   * Returns the population covariance of the values. The count must be non-zero.
088   *
089   * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
090   * is not guaranteed to return zero when the dataset consists of the same pair of values multiple
091   * times, due to numerical errors.
092   *
093   * <h3>Non-finite values</h3>
094   *
095   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
096   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
097   *
098   * @throws IllegalStateException if the dataset is empty
099   */
100  public double populationCovariance() {
101    checkState(count() != 0);
102    return sumOfProductsOfDeltas / count();
103  }
104
105  /**
106   * Returns the sample covariance of the values. The count must be greater than one.
107   *
108   * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
109   * multiple times, due to numerical errors.
110   *
111   * <h3>Non-finite values</h3>
112   *
113   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
114   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
115   *
116   * @throws IllegalStateException if the dataset is empty or contains a single pair of values
117   */
118  public double sampleCovariance() {
119    checkState(count() > 1);
120    return sumOfProductsOfDeltas / (count() - 1);
121  }
122
123  /**
124   * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
125   * product-moment correlation coefficient</a> of the values. The count must greater than one, and
126   * the {@code x} and {@code y} values must both have non-zero population variance (i.e.
127   * {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result
128   * is not guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
129   * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
130   *
131   * <h3>Non-finite values</h3>
132   *
133   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
134   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
135   *
136   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
137   *     either the {@code x} and {@code y} dataset has zero population variance
138   */
139  public double pearsonsCorrelationCoefficient() {
140    checkState(count() > 1);
141    if (isNaN(sumOfProductsOfDeltas)) {
142      return NaN;
143    }
144    double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas();
145    double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas();
146    checkState(xSumOfSquaresOfDeltas > 0.0);
147    checkState(ySumOfSquaresOfDeltas > 0.0);
148    // The product of two positive numbers can be zero if the multiplication underflowed. We
149    // force a positive value by effectively rounding up to MIN_VALUE.
150    double productOfSumsOfSquaresOfDeltas =
151        ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
152    return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
153  }
154
155  /**
156   * Returns a linear transformation giving the best fit to the data according to
157   * <a href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
158   * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
159   * either the {@code x} or {@code y} data must have a non-zero population variance (i.e.
160   * {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result
161   * is guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
162   * data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
163   *
164   * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
165   * error is defined as the square root of the mean of the squares of the differences between the
166   * actual {@code y} values of the data and the values predicted by the fit for the {@code x}
167   * values (i.e. it is the square root of the mean of the squares of the vertical distances between
168   * the data points and the best fit line). For this fit, this error is a fraction
169   * {@code sqrt(1 - R*R)} of the population standard deviation of {@code y}, where {@code R} is the
170   * Pearson's correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
171   *
172   * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
173   * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
174   * does not normally minimize that error: to do that, you should swap the roles of {@code x} and
175   * {@code y}.
176   *
177   * <h3>Non-finite values</h3>
178   *
179   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
180   * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is
181   * {@link LinearTransformation#forNaN()}.
182   *
183   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
184   *     both the {@code x} and {@code y} dataset must have zero population variance
185   */
186  public LinearTransformation leastSquaresFit() {
187    checkState(count() > 1);
188    if (isNaN(sumOfProductsOfDeltas)) {
189      return LinearTransformation.forNaN();
190    }
191    double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
192    if (xSumOfSquaresOfDeltas > 0.0) {
193      if (yStats.sumOfSquaresOfDeltas() > 0.0) {
194        return LinearTransformation.mapping(xStats.mean(), yStats.mean())
195            .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
196      } else {
197        return LinearTransformation.horizontal(yStats.mean());
198      }
199    } else {
200      checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
201      return LinearTransformation.vertical(xStats.mean());
202    }
203  }
204
205  /**
206   * {@inheritDoc}
207   *
208   * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating
209   * point values. It is definitely true for instances constructed from exactly the same values in
210   * the same order. It is also true for an instance round-tripped through java serialization.
211   * However, floating point rounding errors mean that it may be false for some instances where the
212   * statistics are mathematically equal, including the same values in a different order.
213   */
214  @Override
215  public boolean equals(@Nullable Object obj) {
216    if (obj == null) {
217      return false;
218    }
219    if (getClass() != obj.getClass()) {
220      return false;
221    }
222    PairedStats other = (PairedStats) obj;
223    return (xStats.equals(other.xStats))
224        && (yStats.equals(other.yStats))
225        && (doubleToLongBits(sumOfProductsOfDeltas)
226            == doubleToLongBits(other.sumOfProductsOfDeltas));
227  }
228
229  /**
230   * {@inheritDoc}
231   *
232   * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics,
233   * including the floating point values. See the note on {@link #equals} for details.
234   */
235  @Override
236  public int hashCode() {
237    return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas);
238  }
239
240  @Override
241  public String toString() {
242    if (count() > 0) {
243      return MoreObjects.toStringHelper(this)
244          .add("xStats", xStats)
245          .add("yStats", yStats)
246          .add("populationCovariance", populationCovariance())
247          .toString();
248    } else {
249      return MoreObjects.toStringHelper(this)
250          .add("xStats", xStats)
251          .add("yStats", yStats)
252          .toString();
253    }
254  }
255
256  double sumOfProductsOfDeltas() {
257    return sumOfProductsOfDeltas;
258  }
259
260  private static double ensurePositive(double value) {
261    if (value > 0.0) {
262      return value;
263    } else {
264      return Double.MIN_VALUE;
265    }
266  }
267
268  private static double ensureInUnitRange(double value) {
269    if (value >= 1.0) {
270      return 1.0;
271    }
272    if (value <= -1.0) {
273      return -1.0;
274    }
275    return value;
276  }
277
278  // Serialization helpers
279
280  /**
281   * The size of byte array representaion in bytes.
282   */
283  private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE;
284
285  /**
286   * Gets a byte array representation of this instance.
287   *
288   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
289   * versions.
290   */
291  public byte[] toByteArray() {
292    ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN);
293    xStats.writeTo(buffer);
294    yStats.writeTo(buffer);
295    buffer.putDouble(sumOfProductsOfDeltas);
296    return buffer.array();
297  }
298
299  /**
300   * Creates a {@link PairedStats} instance from the given byte representation which was obtained by
301   * {@link #toByteArray}.
302   *
303   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
304   * versions.
305   */
306  public static PairedStats fromByteArray(byte[] byteArray) {
307    checkNotNull(byteArray);
308    checkArgument(
309        byteArray.length == BYTES,
310        "Expected PairedStats.BYTES = %s, got %s",
311        BYTES,
312        byteArray.length);
313    ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
314    Stats xStats = Stats.readFrom(buffer);
315    Stats yStats = Stats.readFrom(buffer);
316    double sumOfProductsOfDeltas = buffer.getDouble();
317    return new PairedStats(xStats, yStats, sumOfProductsOfDeltas);
318  }
319
320  private static final long serialVersionUID = 0;
321}