001/*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.hash;
016
017import static com.google.common.base.Preconditions.checkArgument;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.VisibleForTesting;
021import com.google.common.base.Supplier;
022
023import java.nio.ByteBuffer;
024import java.security.MessageDigest;
025import java.util.Iterator;
026import java.util.zip.Adler32;
027import java.util.zip.CRC32;
028import java.util.zip.Checksum;
029
030/**
031 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related
032 * utilities.
033 *
034 * @author Kevin Bourrillion
035 * @author Dimitris Andreou
036 * @author Kurt Alfred Kluever
037 * @since 11.0
038 */
039@Beta
040public final class Hashing {
041  private Hashing() {}
042
043  /**
044   * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything
045   * dependent on hashcodes of those, will fail sooner than later.
046   */
047  private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
048
049  // Used by goodFastHash when minimumBits == 32.
050  private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED);
051
052  // Used by goodFastHash when 32 < minimumBits <= 128.
053  private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED);
054
055  /**
056   * Returns a general-purpose, <b>non-cryptographic-strength</b>, streaming hash function that
057   * produces hash codes of length at least {@code minimumBits}. Users without specific
058   * compatibility requirements and who do not persist the hash codes are encouraged to
059   * choose this hash function.
060   *
061   * <p>Repeated calls to {@link #goodFastHash} with the same {@code minimumBits} value will
062   * return {@link HashFunction} instances with identical behavior (but not necessarily the
063   * same instance) for the duration of the current virtual machine.
064   *
065   * <p><b>Warning: the implementation is unspecified and is subject to change.</b>
066   *
067   * @throws IllegalArgumentException if {@code minimumBits} is not positive
068   */
069  public static HashFunction goodFastHash(int minimumBits) {
070    int bits = checkPositiveAndMakeMultipleOf32(minimumBits);
071
072    if (bits == 32) {
073      return GOOD_FAST_HASH_FUNCTION_32;
074    }
075    if (bits <= 128) {
076      return GOOD_FAST_HASH_FUNCTION_128;
077    }
078
079    // Otherwise, join together some 128-bit murmur3s
080    int hashFunctionsNeeded = (bits + 127) / 128;
081    HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded];
082    hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128;
083    int seed = GOOD_FAST_HASH_SEED;
084    for (int i = 1; i < hashFunctionsNeeded; i++) {
085      seed += 1500450271; // a prime; shouldn't matter
086      hashFunctions[i] = murmur3_128(seed);
087    }
088    return new ConcatenatedHashFunction(hashFunctions);
089  }
090
091  /**
092   * Returns a hash function implementing the
093   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
094   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
095   * using the given seed value.
096   *
097   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
098   */
099  public static HashFunction murmur3_32(int seed) {
100    return new Murmur3_32HashFunction(seed);
101  }
102
103  /**
104   * Returns a hash function implementing the
105   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
106   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
107   * using a seed value of zero.
108   *
109   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
110   */
111  public static HashFunction murmur3_32() {
112    return MURMUR3_32;
113  }
114
115  private static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
116
117  /**
118   * Returns a hash function implementing the
119   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
120   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
121   * using the given seed value.
122   *
123   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
124   */
125  public static HashFunction murmur3_128(int seed) {
126    return new Murmur3_128HashFunction(seed);
127  }
128
129  /**
130   * Returns a hash function implementing the
131   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
132   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
133   * using a seed value of zero.
134   *
135   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
136   */
137  public static HashFunction murmur3_128() {
138    return MURMUR3_128;
139  }
140
141  private static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0);
142
143  /**
144   * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to
145   * the MD5 {@link MessageDigest}.
146   */
147  public static HashFunction md5() {
148    return MD5;
149  }
150
151  private static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()");
152
153  /**
154   * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the
155   * SHA-1 {@link MessageDigest}.
156   */
157  public static HashFunction sha1() {
158    return SHA_1;
159  }
160
161  private static final HashFunction SHA_1 =
162      new MessageDigestHashFunction("SHA-1", "Hashing.sha1()");
163
164  /**
165   * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to
166   * the SHA-256 {@link MessageDigest}.
167   */
168  public static HashFunction sha256() {
169    return SHA_256;
170  }
171
172  private static final HashFunction SHA_256 =
173      new MessageDigestHashFunction("SHA-256", "Hashing.sha256()");
174
175  /**
176   * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the
177   * SHA-512 {@link MessageDigest}.
178   */
179  public static HashFunction sha512() {
180    return SHA_512;
181  }
182
183  private static final HashFunction SHA_512 =
184      new MessageDigestHashFunction("SHA-512", "Hashing.sha512()");
185
186  /**
187   * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating
188   * to the {@link CRC32} {@link Checksum}.
189   *
190   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
191   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
192   *
193   * @since 14.0
194   */
195  public static HashFunction crc32() {
196    return CRC_32;
197  }
198
199  private static final HashFunction CRC_32 =
200      checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()");
201
202  /**
203   * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by
204   * delegating to the {@link Adler32} {@link Checksum}.
205   *
206   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
207   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
208   *
209   * @since 14.0
210   */
211  public static HashFunction adler32() {
212    return ADLER_32;
213  }
214
215  private static final HashFunction ADLER_32 =
216      checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()");
217
218  private static HashFunction checksumHashFunction(ChecksumType type, String toString) {
219    return new ChecksumHashFunction(type, type.bits, toString);
220  }
221
222  enum ChecksumType implements Supplier<Checksum> {
223    CRC_32(32) {
224      @Override
225      public Checksum get() {
226        return new CRC32();
227      }
228    },
229    ADLER_32(32) {
230      @Override
231      public Checksum get() {
232        return new Adler32();
233      }
234    };
235
236    private final int bits;
237
238    ChecksumType(int bits) {
239      this.bits = bits;
240    }
241
242    @Override
243    public abstract Checksum get();
244  }
245
246  // Lazy initialization holder class idiom.
247
248  /**
249   * If {@code hashCode} has enough bits, returns {@code hashCode.asLong()}, otherwise
250   * returns a {@code long} value with {@code hashCode.asInt()} as the least-significant
251   * four bytes and {@code 0x00} as each of the most-significant four bytes.
252   *
253   * @deprecated Use {@code HashCode.padToLong()} instead. This method is scheduled to be
254   *     removed in Guava 15.0.
255   */
256  @Deprecated
257  public static long padToLong(HashCode hashCode) {
258    return hashCode.padToLong();
259  }
260
261  /**
262   * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform
263   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
264   * {@code consistentHash(h, n)} equals:
265   *
266   * <ul>
267   * <li>{@code n - 1}, with approximate probability {@code 1/n}
268   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
269   * </ul>
270   *
271   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
272   * article on consistent hashing</a> for more information.
273   * <p>
274   * If you might want to have weights for the buckets in the future, take a look at
275   * {@code weightedConsistentHash}.
276   */
277  public static int consistentHash(HashCode hashCode, int buckets) {
278    return consistentHash(hashCode.padToLong(), buckets);
279  }
280
281  /**
282   * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform
283   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
284   * {@code consistentHash(h, n)} equals:
285   *
286   * <ul>
287   * <li>{@code n - 1}, with approximate probability {@code 1/n}
288   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
289   * </ul>
290   *
291   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
292   * article on consistent hashing</a> for more information.
293   * <p>
294   * If you might want to have weights for the buckets in the future, take a look at
295   * {@code weightedConsistentHash}.
296   */
297  public static int consistentHash(long input, int buckets) {
298    checkArgument(buckets > 0, "buckets must be positive: %s", buckets);
299    LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input);
300    int candidate = 0;
301    int next;
302
303    // Jump from bucket to bucket until we go out of range
304    while (true) {
305      next = (int) ((candidate + 1) / generator.nextDouble());
306      if (next >= 0 && next < buckets) {
307        candidate = next;
308      } else {
309        return candidate;
310      }
311    }
312  }
313
314  /**
315   * Returns a hash code, having the same bit length as each of the input hash codes,
316   * that combines the information of these hash codes in an ordered fashion. That
317   * is, whenever two equal hash codes are produced by two calls to this method, it
318   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
319   * input hash codes in the <i>same</i> order.
320   *
321   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
322   *     do not all have the same bit length
323   */
324  public static HashCode combineOrdered(Iterable<HashCode> hashCodes) {
325    Iterator<HashCode> iterator = hashCodes.iterator();
326    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
327    int bits = iterator.next().bits();
328    byte[] resultBytes = new byte[bits / 8];
329    for (HashCode hashCode : hashCodes) {
330      byte[] nextBytes = hashCode.asBytes();
331      checkArgument(nextBytes.length == resultBytes.length,
332          "All hashcodes must have the same bit length.");
333      for (int i = 0; i < nextBytes.length; i++) {
334        resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]);
335      }
336    }
337    return HashCodes.fromBytesNoCopy(resultBytes);
338  }
339
340  /**
341   * Returns a hash code, having the same bit length as each of the input hash codes,
342   * that combines the information of these hash codes in an unordered fashion. That
343   * is, whenever two equal hash codes are produced by two calls to this method, it
344   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
345   * input hash codes in <i>some</i> order.
346   *
347   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
348   *     do not all have the same bit length
349   */
350  public static HashCode combineUnordered(Iterable<HashCode> hashCodes) {
351    Iterator<HashCode> iterator = hashCodes.iterator();
352    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
353    byte[] resultBytes = new byte[iterator.next().bits() / 8];
354    for (HashCode hashCode : hashCodes) {
355      byte[] nextBytes = hashCode.asBytes();
356      checkArgument(nextBytes.length == resultBytes.length,
357          "All hashcodes must have the same bit length.");
358      for (int i = 0; i < nextBytes.length; i++) {
359        resultBytes[i] += nextBytes[i];
360      }
361    }
362    return HashCodes.fromBytesNoCopy(resultBytes);
363  }
364
365  /**
366   * Checks that the passed argument is positive, and ceils it to a multiple of 32.
367   */
368  static int checkPositiveAndMakeMultipleOf32(int bits) {
369    checkArgument(bits > 0, "Number of bits must be positive");
370    return (bits + 31) & ~31;
371  }
372
373  // TODO(kevinb): Maybe expose this class via a static Hashing method?
374  @VisibleForTesting
375  static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction {
376    private final int bits;
377
378    ConcatenatedHashFunction(HashFunction... functions) {
379      super(functions);
380      int bitSum = 0;
381      for (HashFunction function : functions) {
382        bitSum += function.bits();
383      }
384      this.bits = bitSum;
385    }
386
387    @Override
388    HashCode makeHash(Hasher[] hashers) {
389      // TODO(user): Get rid of the ByteBuffer here?
390      byte[] bytes = new byte[bits / 8];
391      ByteBuffer buffer = ByteBuffer.wrap(bytes);
392      for (Hasher hasher : hashers) {
393        buffer.put(hasher.hash().asBytes());
394      }
395      return HashCodes.fromBytesNoCopy(bytes);
396    }
397
398    @Override
399    public int bits() {
400      return bits;
401    }
402  }
403
404  /**
405   * Linear CongruentialGenerator to use for consistent hashing.
406   * See http://en.wikipedia.org/wiki/Linear_congruential_generator
407   */
408  private static final class LinearCongruentialGenerator {
409    private long state;
410
411    public LinearCongruentialGenerator(long seed) {
412      this.state = seed;
413    }
414
415    public double nextDouble() {
416      state = 2862933555777941757L * state + 1;
417      return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31);
418    }
419  }
420}