001/*
002 * Copyright (C) 2012 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.math;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkState;
020import static java.lang.Double.NaN;
021import static java.lang.Double.doubleToLongBits;
022import static java.lang.Double.isNaN;
023
024import com.google.common.annotations.Beta;
025import com.google.common.annotations.GwtIncompatible;
026import com.google.common.annotations.J2ktIncompatible;
027import com.google.common.base.MoreObjects;
028import com.google.common.base.Objects;
029import java.io.Serializable;
030import java.nio.ByteBuffer;
031import java.nio.ByteOrder;
032import javax.annotation.CheckForNull;
033
034/**
035 * An immutable value object capturing some basic statistics about a collection of paired double
036 * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}.
037 *
038 * @author Pete Gillin
039 * @since 20.0
040 */
041@Beta
042@J2ktIncompatible
043@GwtIncompatible
044@ElementTypesAreNonnullByDefault
045public final class PairedStats implements Serializable {
046
047  private final Stats xStats;
048  private final Stats yStats;
049  private final double sumOfProductsOfDeltas;
050
051  /**
052   * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}.
053   *
054   * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the
055   * following constraints. This is the callers responsibility and is not enforced here.
056   *
057   * <ul>
058   *   <li>Both {@code xStats} and {@code yStats} must have the same {@code count}.
059   *   <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0.
060   *   <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite.
061   * </ul>
062   */
063  PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) {
064    this.xStats = xStats;
065    this.yStats = yStats;
066    this.sumOfProductsOfDeltas = sumOfProductsOfDeltas;
067  }
068
069  /** Returns the number of pairs in the dataset. */
070  public long count() {
071    return xStats.count();
072  }
073
074  /** Returns the statistics on the {@code x} values alone. */
075  public Stats xStats() {
076    return xStats;
077  }
078
079  /** Returns the statistics on the {@code y} values alone. */
080  public Stats yStats() {
081    return yStats;
082  }
083
084  /**
085   * Returns the population covariance of the values. The count must be non-zero.
086   *
087   * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
088   * is not guaranteed to return zero when the dataset consists of the same pair of values multiple
089   * times, due to numerical errors.
090   *
091   * <h3>Non-finite values</h3>
092   *
093   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
094   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
095   *
096   * @throws IllegalStateException if the dataset is empty
097   */
098  public double populationCovariance() {
099    checkState(count() != 0);
100    return sumOfProductsOfDeltas / count();
101  }
102
103  /**
104   * Returns the sample covariance of the values. The count must be greater than one.
105   *
106   * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
107   * multiple times, due to numerical errors.
108   *
109   * <h3>Non-finite values</h3>
110   *
111   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
112   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
113   *
114   * @throws IllegalStateException if the dataset is empty or contains a single pair of values
115   */
116  public double sampleCovariance() {
117    checkState(count() > 1);
118    return sumOfProductsOfDeltas / (count() - 1);
119  }
120
121  /**
122   * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
123   * product-moment correlation coefficient</a> of the values. The count must greater than one, and
124   * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code
125   * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not
126   * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
127   * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
128   *
129   * <h3>Non-finite values</h3>
130   *
131   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
132   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
133   *
134   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
135   *     either the {@code x} and {@code y} dataset has zero population variance
136   */
137  public double pearsonsCorrelationCoefficient() {
138    checkState(count() > 1);
139    if (isNaN(sumOfProductsOfDeltas)) {
140      return NaN;
141    }
142    double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas();
143    double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas();
144    checkState(xSumOfSquaresOfDeltas > 0.0);
145    checkState(ySumOfSquaresOfDeltas > 0.0);
146    // The product of two positive numbers can be zero if the multiplication underflowed. We
147    // force a positive value by effectively rounding up to MIN_VALUE.
148    double productOfSumsOfSquaresOfDeltas =
149        ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
150    return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
151  }
152
153  /**
154   * Returns a linear transformation giving the best fit to the data according to <a
155   * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
156   * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
157   * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code
158   * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is
159   * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
160   * data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
161   *
162   * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
163   * error is defined as the square root of the mean of the squares of the differences between the
164   * actual {@code y} values of the data and the values predicted by the fit for the {@code x}
165   * values (i.e. it is the square root of the mean of the squares of the vertical distances between
166   * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 -
167   * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's
168   * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
169   *
170   * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
171   * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
172   * does not normally minimize that error: to do that, you should swap the roles of {@code x} and
173   * {@code y}.
174   *
175   * <h3>Non-finite values</h3>
176   *
177   * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
178   * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link
179   * LinearTransformation#forNaN()}.
180   *
181   * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
182   *     both the {@code x} and {@code y} dataset must have zero population variance
183   */
184  public LinearTransformation leastSquaresFit() {
185    checkState(count() > 1);
186    if (isNaN(sumOfProductsOfDeltas)) {
187      return LinearTransformation.forNaN();
188    }
189    double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
190    if (xSumOfSquaresOfDeltas > 0.0) {
191      if (yStats.sumOfSquaresOfDeltas() > 0.0) {
192        return LinearTransformation.mapping(xStats.mean(), yStats.mean())
193            .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
194      } else {
195        return LinearTransformation.horizontal(yStats.mean());
196      }
197    } else {
198      checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
199      return LinearTransformation.vertical(xStats.mean());
200    }
201  }
202
203  /**
204   * {@inheritDoc}
205   *
206   * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating
207   * point values. Two instances are guaranteed to be considered equal if one is copied from the
208   * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both
209   * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without
210   * adding any values in between the two calls, or if one is obtained from the other after
211   * round-tripping through java serialization. However, floating point rounding errors mean that it
212   * may be false for some instances where the statistics are mathematically equal, including
213   * instances constructed from the same values in a different order... or (in the general case)
214   * even in the same order. (It is guaranteed to return true for instances constructed from the
215   * same values in the same order if {@code strictfp} is in effect, or if the system architecture
216   * guarantees {@code strictfp}-like semantics.)
217   */
218  @Override
219  public boolean equals(@CheckForNull Object obj) {
220    if (obj == null) {
221      return false;
222    }
223    if (getClass() != obj.getClass()) {
224      return false;
225    }
226    PairedStats other = (PairedStats) obj;
227    return xStats.equals(other.xStats)
228        && yStats.equals(other.yStats)
229        && doubleToLongBits(sumOfProductsOfDeltas) == doubleToLongBits(other.sumOfProductsOfDeltas);
230  }
231
232  /**
233   * {@inheritDoc}
234   *
235   * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics,
236   * including the floating point values. See the note on {@link #equals} for details.
237   */
238  @Override
239  public int hashCode() {
240    return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas);
241  }
242
243  @Override
244  public String toString() {
245    if (count() > 0) {
246      return MoreObjects.toStringHelper(this)
247          .add("xStats", xStats)
248          .add("yStats", yStats)
249          .add("populationCovariance", populationCovariance())
250          .toString();
251    } else {
252      return MoreObjects.toStringHelper(this)
253          .add("xStats", xStats)
254          .add("yStats", yStats)
255          .toString();
256    }
257  }
258
259  double sumOfProductsOfDeltas() {
260    return sumOfProductsOfDeltas;
261  }
262
263  private static double ensurePositive(double value) {
264    if (value > 0.0) {
265      return value;
266    } else {
267      return Double.MIN_VALUE;
268    }
269  }
270
271  private static double ensureInUnitRange(double value) {
272    if (value >= 1.0) {
273      return 1.0;
274    }
275    if (value <= -1.0) {
276      return -1.0;
277    }
278    return value;
279  }
280
281  // Serialization helpers
282
283  /** The size of byte array representation in bytes. */
284  private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE;
285
286  /**
287   * Gets a byte array representation of this instance.
288   *
289   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
290   * versions.
291   */
292  public byte[] toByteArray() {
293    ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN);
294    xStats.writeTo(buffer);
295    yStats.writeTo(buffer);
296    buffer.putDouble(sumOfProductsOfDeltas);
297    return buffer.array();
298  }
299
300  /**
301   * Creates a {@link PairedStats} instance from the given byte representation which was obtained by
302   * {@link #toByteArray}.
303   *
304   * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
305   * versions.
306   */
307  public static PairedStats fromByteArray(byte[] byteArray) {
308    checkNotNull(byteArray);
309    checkArgument(
310        byteArray.length == BYTES,
311        "Expected PairedStats.BYTES = %s, got %s",
312        BYTES,
313        byteArray.length);
314    ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
315    Stats xStats = Stats.readFrom(buffer);
316    Stats yStats = Stats.readFrom(buffer);
317    double sumOfProductsOfDeltas = buffer.getDouble();
318    return new PairedStats(xStats, yStats, sumOfProductsOfDeltas);
319  }
320
321  private static final long serialVersionUID = 0;
322}