001    /*
002     * Copyright (C) 2011 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005     * in compliance with the License. You may obtain a copy of the License at
006     *
007     * http://www.apache.org/licenses/LICENSE-2.0
008     *
009     * Unless required by applicable law or agreed to in writing, software distributed under the License
010     * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011     * or implied. See the License for the specific language governing permissions and limitations under
012     * the License.
013     */
014    
015    package com.google.common.hash;
016    
017    import static com.google.common.base.Preconditions.checkArgument;
018    
019    import com.google.common.annotations.Beta;
020    import com.google.common.annotations.VisibleForTesting;
021    import com.google.common.primitives.UnsignedInts;
022    
023    import java.nio.ByteBuffer;
024    import java.security.MessageDigest;
025    import java.util.Iterator;
026    
027    /**
028     * Static methods to obtain {@link HashFunction} instances, and other static
029     * hashing-related utilities.
030     *
031     * @author Kevin Bourrillion
032     * @author Dimitris Andreou
033     * @author Kurt Alfred Kluever
034     * @since 11.0
035     */
036    @Beta
037    public final class Hashing {
038      private Hashing() {}
039    
040      /**
041       * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything
042       * dependent on hashcodes of those, will fail sooner than later.
043       */
044      private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
045    
046      // Used by goodFastHash when minimumBits == 32.
047      private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED);
048    
049      // Used by goodFastHash when 32 < minimumBits <= 128.
050      private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED);
051    
052      /**
053       * Returns a general-purpose, <b>non-cryptographic-strength</b>, streaming hash function that
054       * produces hash codes of length at least {@code minimumBits}. Users without specific
055       * compatibility requirements and who do not persist the hash codes are encouraged to
056       * choose this hash function.
057       *
058       * <p>Repeated calls to {@link #goodFastHash} with the same {@code minimumBits} value will
059       * return {@link HashFunction} instances with identical behavior (but not necessarily the
060       * same instance) for the duration of the current virtual machine.
061       *
062       * <p><b>Warning: the implementation is unspecified and is subject to change.</b>
063       *
064       * @throws IllegalArgumentException if {@code minimumBits} is not positive
065       */
066      public static HashFunction goodFastHash(int minimumBits) {
067        int bits = checkPositiveAndMakeMultipleOf32(minimumBits);
068    
069        if (bits == 32) {
070          return GOOD_FAST_HASH_FUNCTION_32;
071        }
072        if (bits <= 128) {
073          return GOOD_FAST_HASH_FUNCTION_128;
074        }
075    
076        // Otherwise, join together some 128-bit murmur3s
077        int hashFunctionsNeeded = (bits + 127) / 128;
078        HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded];
079        hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128;
080        int seed = GOOD_FAST_HASH_SEED;
081        for (int i = 1; i < hashFunctionsNeeded; i++) {
082          seed += 1500450271; // a prime; shouldn't matter
083          hashFunctions[i] = murmur3_128(seed);
084        }
085        return new ConcatenatedHashFunction(hashFunctions);
086      }
087    
088      /**
089       * Returns a hash function implementing the
090       * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">32-bit murmur3
091       * algorithm</a> (little-endian variant), using the given seed value.
092       */
093      public static HashFunction murmur3_32(int seed) {
094        return new Murmur3_32HashFunction(seed);
095      }
096    
097      /**
098       * Returns a hash function implementing the
099       * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">32-bit murmur3
100       * algorithm</a> (little-endian variant), using a seed value of zero.
101       */
102      public static HashFunction murmur3_32() {
103        return MURMUR3_32;
104      }
105    
106      private static final Murmur3_32HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
107    
108      /**
109       * Returns a hash function implementing the
110       * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
111       * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), using the given seed
112       * value.
113       */
114      public static HashFunction murmur3_128(int seed) {
115        return new Murmur3_128HashFunction(seed);
116      }
117    
118      /**
119       * Returns a hash function implementing the
120       * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
121       * 128-bit murmur3 algorithm, x64 variant</a>  (little-endian variant), using a seed value
122       * of zero.
123       */
124      public static HashFunction murmur3_128() {
125        return MURMUR3_128;
126      }
127    
128      private static final Murmur3_128HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0);
129    
130      /**
131       * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to
132       * the MD5 {@link MessageDigest}.
133       */
134      public static HashFunction md5() {
135        return MD5;
136      }
137    
138      private static final HashFunction MD5 = new MessageDigestHashFunction("MD5");
139    
140      /**
141       * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the
142       * SHA-1 {@link MessageDigest}.
143       */
144      public static HashFunction sha1() {
145        return SHA_1;
146      }
147    
148      private static final HashFunction SHA_1 = new MessageDigestHashFunction("SHA-1");
149    
150      /**
151       * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to
152       * the SHA-256 {@link MessageDigest}.
153       */
154      public static HashFunction sha256() {
155        return SHA_256;
156      }
157    
158      private static final HashFunction SHA_256 = new MessageDigestHashFunction("SHA-256");
159    
160      /**
161       * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the
162       * SHA-512 {@link MessageDigest}.
163       */
164      public static HashFunction sha512() {
165        return SHA_512;
166      }
167    
168      private static final HashFunction SHA_512 = new MessageDigestHashFunction("SHA-512");
169    
170      // Lazy initiliazation holder class idiom.
171    
172      /**
173       * If {@code hashCode} has enough bits, returns {@code hashCode.asLong()}, otherwise
174       * returns a {@code long} value with {@code hashCode.asInt()} as the least-significant
175       * four bytes and {@code 0x00} as each of the most-significant four bytes.
176       */
177      public static long padToLong(HashCode hashCode) {
178        return (hashCode.bits() < 64) ? UnsignedInts.toLong(hashCode.asInt()) : hashCode.asLong();
179      }
180    
181      /**
182       * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform
183       * manner that minimizes the need for remapping as {@code buckets} grows. That is,
184       * {@code consistentHash(h, n)} equals:
185       *
186       * <ul>
187       * <li>{@code n - 1}, with approximate probability {@code 1/n}
188       * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
189       * </ul>
190       *
191       * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
192       * article on consistent hashing</a> for more information.
193       * <p>
194       * If you might want to have weights for the buckets in the future, take a look at
195       * {@code weightedConsistentHash}.
196       */
197      public static int consistentHash(HashCode hashCode, int buckets) {
198        return consistentHash(padToLong(hashCode), buckets);
199      }
200    
201      /**
202       * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform
203       * manner that minimizes the need for remapping as {@code buckets} grows. That is,
204       * {@code consistentHash(h, n)} equals:
205       *
206       * <ul>
207       * <li>{@code n - 1}, with approximate probability {@code 1/n}
208       * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
209       * </ul>
210       *
211       * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
212       * article on consistent hashing</a> for more information.
213       * <p>
214       * If you might want to have weights for the buckets in the future, take a look at
215       * {@code weightedConsistentHash}.
216       */
217      public static int consistentHash(long input, int buckets) {
218        checkArgument(buckets > 0, "buckets must be positive: %s", buckets);
219        LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input);
220        int candidate = 0;
221        int next;
222    
223        // Jump from bucket to bucket until we go out of range
224        while (true) {
225          next = (int) ((candidate + 1) / generator.nextDouble());
226          if (next >= 0 && next < buckets) {
227            candidate = next;
228          } else {
229            return candidate;
230          }
231        }
232      }
233    
234      /**
235       * Returns a hash code, having the same bit length as each of the input hash codes,
236       * that combines the information of these hash codes in an ordered fashion. That
237       * is, whenever two equal hash codes are produced by two calls to this method, it
238       * is <i>as likely as possible</i> that each was computed from the <i>same</i>
239       * input hash codes in the <i>same</i> order.
240       *
241       * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
242       *     do not all have the same bit length
243       */
244      public static HashCode combineOrdered(Iterable<HashCode> hashCodes) {
245        Iterator<HashCode> iterator = hashCodes.iterator();
246        checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
247        int bits = iterator.next().bits();
248        byte[] resultBytes = new byte[bits / 8];
249        for (HashCode hashCode : hashCodes) {
250          byte[] nextBytes = hashCode.asBytes();
251          checkArgument(nextBytes.length == resultBytes.length,
252              "All hashcodes must have the same bit length.");
253          for (int i = 0; i < nextBytes.length; i++) {
254            resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]);
255          }
256        }
257        return HashCodes.fromBytesNoCopy(resultBytes);
258      }
259    
260      /**
261       * Returns a hash code, having the same bit length as each of the input hash codes,
262       * that combines the information of these hash codes in an unordered fashion. That
263       * is, whenever two equal hash codes are produced by two calls to this method, it
264       * is <i>as likely as possible</i> that each was computed from the <i>same</i>
265       * input hash codes in <i>some</i> order.
266       *
267       * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
268       *     do not all have the same bit length
269       */
270      public static HashCode combineUnordered(Iterable<HashCode> hashCodes) {
271        Iterator<HashCode> iterator = hashCodes.iterator();
272        checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
273        byte[] resultBytes = new byte[iterator.next().bits() / 8];
274        for (HashCode hashCode : hashCodes) {
275          byte[] nextBytes = hashCode.asBytes();
276          checkArgument(nextBytes.length == resultBytes.length,
277              "All hashcodes must have the same bit length.");
278          for (int i = 0; i < nextBytes.length; i++) {
279            resultBytes[i] += nextBytes[i];
280          }
281        }
282        return HashCodes.fromBytesNoCopy(resultBytes);
283      }
284    
285      /**
286       * Checks that the passed argument is positive, and ceils it to a multiple of 32.
287       */
288      static int checkPositiveAndMakeMultipleOf32(int bits) {
289        checkArgument(bits > 0, "Number of bits must be positive");
290        return (bits + 31) & ~31;
291      }
292    
293      // TODO(kevinb): Maybe expose this class via a static Hashing method?
294      @VisibleForTesting
295      static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction {
296        private final int bits;
297    
298        ConcatenatedHashFunction(HashFunction... functions) {
299          super(functions);
300          int bitSum = 0;
301          for (HashFunction function : functions) {
302            bitSum += function.bits();
303          }
304          this.bits = bitSum;
305        }
306    
307        @Override
308        HashCode makeHash(Hasher[] hashers) {
309          // TODO(user): Get rid of the ByteBuffer here?
310          byte[] bytes = new byte[bits / 8];
311          ByteBuffer buffer = ByteBuffer.wrap(bytes);
312          for (Hasher hasher : hashers) {
313            buffer.put(hasher.hash().asBytes());
314          }
315          return HashCodes.fromBytesNoCopy(bytes);
316        }
317    
318        @Override
319        public int bits() {
320          return bits;
321        }
322      }
323    
324      private static final class LinearCongruentialGenerator {
325        private long state;
326    
327        public LinearCongruentialGenerator(long seed) {
328          this.state = seed;
329        }
330    
331        public double nextDouble() {
332          state = 2862933555777941757L * state + 1;
333          return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31);
334        }
335      }
336    }