001/*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.hash;
016
017import static com.google.common.base.Preconditions.checkArgument;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.VisibleForTesting;
021import com.google.common.base.Supplier;
022
023import java.nio.ByteBuffer;
024import java.security.MessageDigest;
025import java.util.Iterator;
026import java.util.zip.Adler32;
027import java.util.zip.CRC32;
028import java.util.zip.Checksum;
029
030import javax.annotation.Nullable;
031
032/**
033 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related
034 * utilities.
035 *
036 * <p>A comparison of the various hash functions can be found
037 * <a href="http://goo.gl/jS7HH">here</a>.
038 *
039 * @author Kevin Bourrillion
040 * @author Dimitris Andreou
041 * @author Kurt Alfred Kluever
042 * @since 11.0
043 */
044@Beta
045public final class Hashing {
046  /**
047   * Returns a general-purpose, <b>temporary-use</b>, non-cryptographic hash function. The algorithm
048   * the returned function implements is unspecified and subject to change without notice.
049   *
050   * <p><b>Warning:</b> a new random seed for these functions is chosen each time the {@code
051   * Hashing} class is loaded. <b>Do not use this method</b> if hash codes may escape the current
052   * process in any way, for example being sent over RPC, or saved to disk.
053   *
054   * <p>Repeated calls to this method on the same loaded {@code Hashing} class, using the same value
055   * for {@code minimumBits}, will return identically-behaving {@link HashFunction} instances.
056   *
057   * @param minimumBits a positive integer (can be arbitrarily large)
058   * @return a hash function, described above, that produces hash codes of length {@code
059   *     minimumBits} or greater
060   */
061  public static HashFunction goodFastHash(int minimumBits) {
062    int bits = checkPositiveAndMakeMultipleOf32(minimumBits);
063
064    if (bits == 32) {
065      return GOOD_FAST_HASH_FUNCTION_32;
066    }
067    if (bits <= 128) {
068      return GOOD_FAST_HASH_FUNCTION_128;
069    }
070
071    // Otherwise, join together some 128-bit murmur3s
072    int hashFunctionsNeeded = (bits + 127) / 128;
073    HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded];
074    hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128;
075    int seed = GOOD_FAST_HASH_SEED;
076    for (int i = 1; i < hashFunctionsNeeded; i++) {
077      seed += 1500450271; // a prime; shouldn't matter
078      hashFunctions[i] = murmur3_128(seed);
079    }
080    return new ConcatenatedHashFunction(hashFunctions);
081  }
082
083  /**
084   * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything
085   * dependent on the hash codes they produce will fail sooner.
086   */
087  private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
088
089  /** Returned by {@link #goodFastHash} when {@code minimumBits <= 32}. */
090  private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED);
091
092  /** Returned by {@link #goodFastHash} when {@code 32 < minimumBits <= 128}. */
093  private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED);
094
095  /**
096   * Returns a hash function implementing the
097   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
098   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
099   * using the given seed value.
100   *
101   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
102   */
103  public static HashFunction murmur3_32(int seed) {
104    return new Murmur3_32HashFunction(seed);
105  }
106
107  /**
108   * Returns a hash function implementing the
109   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
110   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
111   * using a seed value of zero.
112   *
113   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
114   */
115  public static HashFunction murmur3_32() {
116    return MURMUR3_32;
117  }
118
119  private static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
120
121  /**
122   * Returns a hash function implementing the
123   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
124   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
125   * using the given seed value.
126   *
127   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
128   */
129  public static HashFunction murmur3_128(int seed) {
130    return new Murmur3_128HashFunction(seed);
131  }
132
133  /**
134   * Returns a hash function implementing the
135   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
136   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
137   * using a seed value of zero.
138   *
139   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
140   */
141  public static HashFunction murmur3_128() {
142    return MURMUR3_128;
143  }
144
145  private static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0);
146
147  /**
148   * Returns a hash function implementing the
149   * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a>
150   * using a seed value of {@code k = 00 01 02 ...}.
151   *
152   * @since 15.0
153   */
154  public static HashFunction sipHash24() {
155    return SIP_HASH_24;
156  }
157
158  private static final HashFunction SIP_HASH_24 =
159      new SipHashFunction(2, 4, 0x0706050403020100L, 0x0f0e0d0c0b0a0908L);
160
161  /**
162   * Returns a hash function implementing the
163   * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a>
164   * using the given seed.
165   *
166   * @since 15.0
167   */
168  public static HashFunction sipHash24(long k0, long k1) {
169    return new SipHashFunction(2, 4, k0, k1);
170  }
171
172  /**
173   * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to
174   * the MD5 {@link MessageDigest}.
175   */
176  public static HashFunction md5() {
177    return MD5;
178  }
179
180  private static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()");
181
182  /**
183   * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the
184   * SHA-1 {@link MessageDigest}.
185   */
186  public static HashFunction sha1() {
187    return SHA_1;
188  }
189
190  private static final HashFunction SHA_1 =
191      new MessageDigestHashFunction("SHA-1", "Hashing.sha1()");
192
193  /**
194   * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to
195   * the SHA-256 {@link MessageDigest}.
196   */
197  public static HashFunction sha256() {
198    return SHA_256;
199  }
200
201  private static final HashFunction SHA_256 =
202      new MessageDigestHashFunction("SHA-256", "Hashing.sha256()");
203
204  /**
205   * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the
206   * SHA-512 {@link MessageDigest}.
207   */
208  public static HashFunction sha512() {
209    return SHA_512;
210  }
211
212  private static final HashFunction SHA_512 =
213      new MessageDigestHashFunction("SHA-512", "Hashing.sha512()");
214
215  /**
216   * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating
217   * to the {@link CRC32} {@link Checksum}.
218   *
219   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
220   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
221   *
222   * @since 14.0
223   */
224  public static HashFunction crc32() {
225    return CRC_32;
226  }
227
228  private static final HashFunction CRC_32 =
229      checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()");
230
231  /**
232   * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by
233   * delegating to the {@link Adler32} {@link Checksum}.
234   *
235   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
236   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
237   *
238   * @since 14.0
239   */
240  public static HashFunction adler32() {
241    return ADLER_32;
242  }
243
244  private static final HashFunction ADLER_32 =
245      checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()");
246
247  private static HashFunction checksumHashFunction(ChecksumType type, String toString) {
248    return new ChecksumHashFunction(type, type.bits, toString);
249  }
250
251  enum ChecksumType implements Supplier<Checksum> {
252    CRC_32(32) {
253      @Override
254      public Checksum get() {
255        return new CRC32();
256      }
257    },
258    ADLER_32(32) {
259      @Override
260      public Checksum get() {
261        return new Adler32();
262      }
263    };
264
265    private final int bits;
266
267    ChecksumType(int bits) {
268      this.bits = bits;
269    }
270
271    @Override
272    public abstract Checksum get();
273  }
274
275  // Lazy initialization holder class idiom.
276  // TODO(user): Investigate whether we need to still use this idiom now that we have a fallback
277  // option for our use of Unsafe.
278
279  /**
280   * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform
281   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
282   * {@code consistentHash(h, n)} equals:
283   *
284   * <ul>
285   * <li>{@code n - 1}, with approximate probability {@code 1/n}
286   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
287   * </ul>
288   *
289   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
290   * article on consistent hashing</a> for more information.
291   */
292  public static int consistentHash(HashCode hashCode, int buckets) {
293    return consistentHash(hashCode.padToLong(), buckets);
294  }
295
296  /**
297   * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform
298   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
299   * {@code consistentHash(h, n)} equals:
300   *
301   * <ul>
302   * <li>{@code n - 1}, with approximate probability {@code 1/n}
303   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
304   * </ul>
305   *
306   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
307   * article on consistent hashing</a> for more information.
308   */
309  public static int consistentHash(long input, int buckets) {
310    checkArgument(buckets > 0, "buckets must be positive: %s", buckets);
311    LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input);
312    int candidate = 0;
313    int next;
314
315    // Jump from bucket to bucket until we go out of range
316    while (true) {
317      next = (int) ((candidate + 1) / generator.nextDouble());
318      if (next >= 0 && next < buckets) {
319        candidate = next;
320      } else {
321        return candidate;
322      }
323    }
324  }
325
326  /**
327   * Returns a hash code, having the same bit length as each of the input hash codes,
328   * that combines the information of these hash codes in an ordered fashion. That
329   * is, whenever two equal hash codes are produced by two calls to this method, it
330   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
331   * input hash codes in the <i>same</i> order.
332   *
333   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
334   *     do not all have the same bit length
335   */
336  public static HashCode combineOrdered(Iterable<HashCode> hashCodes) {
337    Iterator<HashCode> iterator = hashCodes.iterator();
338    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
339    int bits = iterator.next().bits();
340    byte[] resultBytes = new byte[bits / 8];
341    for (HashCode hashCode : hashCodes) {
342      byte[] nextBytes = hashCode.asBytes();
343      checkArgument(nextBytes.length == resultBytes.length,
344          "All hashcodes must have the same bit length.");
345      for (int i = 0; i < nextBytes.length; i++) {
346        resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]);
347      }
348    }
349    return HashCode.fromBytesNoCopy(resultBytes);
350  }
351
352  /**
353   * Returns a hash code, having the same bit length as each of the input hash codes,
354   * that combines the information of these hash codes in an unordered fashion. That
355   * is, whenever two equal hash codes are produced by two calls to this method, it
356   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
357   * input hash codes in <i>some</i> order.
358   *
359   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
360   *     do not all have the same bit length
361   */
362  public static HashCode combineUnordered(Iterable<HashCode> hashCodes) {
363    Iterator<HashCode> iterator = hashCodes.iterator();
364    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
365    byte[] resultBytes = new byte[iterator.next().bits() / 8];
366    for (HashCode hashCode : hashCodes) {
367      byte[] nextBytes = hashCode.asBytes();
368      checkArgument(nextBytes.length == resultBytes.length,
369          "All hashcodes must have the same bit length.");
370      for (int i = 0; i < nextBytes.length; i++) {
371        resultBytes[i] += nextBytes[i];
372      }
373    }
374    return HashCode.fromBytesNoCopy(resultBytes);
375  }
376
377  /**
378   * Checks that the passed argument is positive, and ceils it to a multiple of 32.
379   */
380  static int checkPositiveAndMakeMultipleOf32(int bits) {
381    checkArgument(bits > 0, "Number of bits must be positive");
382    return (bits + 31) & ~31;
383  }
384
385  // TODO(kevinb): Maybe expose this class via a static Hashing method?
386  @VisibleForTesting
387  static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction {
388    private final int bits;
389
390    ConcatenatedHashFunction(HashFunction... functions) {
391      super(functions);
392      int bitSum = 0;
393      for (HashFunction function : functions) {
394        bitSum += function.bits();
395      }
396      this.bits = bitSum;
397    }
398
399    @Override
400    HashCode makeHash(Hasher[] hashers) {
401      // TODO(user): Get rid of the ByteBuffer here?
402      byte[] bytes = new byte[bits / 8];
403      ByteBuffer buffer = ByteBuffer.wrap(bytes);
404      for (Hasher hasher : hashers) {
405        buffer.put(hasher.hash().asBytes());
406      }
407      return HashCode.fromBytesNoCopy(bytes);
408    }
409
410    @Override
411    public int bits() {
412      return bits;
413    }
414
415    @Override
416    public boolean equals(@Nullable Object object) {
417      if (object instanceof ConcatenatedHashFunction) {
418        ConcatenatedHashFunction other = (ConcatenatedHashFunction) object;
419        if (bits != other.bits || functions.length != other.functions.length) {
420          return false;
421        }
422        for (int i = 0; i < functions.length; i++) {
423          if (!functions[i].equals(other.functions[i])) {
424            return false;
425          }
426        }
427        return true;
428      }
429      return false;
430    }
431
432    @Override
433    public int hashCode() {
434      int hash = bits;
435      for (HashFunction function : functions) {
436        hash ^= function.hashCode();
437      }
438      return hash;
439    }
440  }
441
442  /**
443   * Linear CongruentialGenerator to use for consistent hashing.
444   * See http://en.wikipedia.org/wiki/Linear_congruential_generator
445   */
446  private static final class LinearCongruentialGenerator {
447    private long state;
448
449    public LinearCongruentialGenerator(long seed) {
450      this.state = seed;
451    }
452
453    public double nextDouble() {
454      state = 2862933555777941757L * state + 1;
455      return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31);
456    }
457  }
458
459  private Hashing() {}
460}