001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.VisibleForTesting; 021import com.google.common.base.Supplier; 022 023import java.nio.ByteBuffer; 024import java.security.MessageDigest; 025import java.util.Iterator; 026import java.util.zip.Adler32; 027import java.util.zip.CRC32; 028import java.util.zip.Checksum; 029 030import javax.annotation.Nullable; 031 032/** 033 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related 034 * utilities. 035 * 036 * <p>A comparison of the various hash functions can be found 037 * <a href="http://goo.gl/jS7HH">here</a>. 038 * 039 * @author Kevin Bourrillion 040 * @author Dimitris Andreou 041 * @author Kurt Alfred Kluever 042 * @since 11.0 043 */ 044@Beta 045public final class Hashing { 046 /** 047 * Returns a general-purpose, <b>temporary-use</b>, non-cryptographic hash function. The algorithm 048 * the returned function implements is unspecified and subject to change without notice. 049 * 050 * <p><b>Warning:</b> a new random seed for these functions is chosen each time the {@code 051 * Hashing} class is loaded. <b>Do not use this method</b> if hash codes may escape the current 052 * process in any way, for example being sent over RPC, or saved to disk. 053 * 054 * <p>Repeated calls to this method on the same loaded {@code Hashing} class, using the same value 055 * for {@code minimumBits}, will return identically-behaving {@link HashFunction} instances. 056 * 057 * @param minimumBits a positive integer (can be arbitrarily large) 058 * @return a hash function, described above, that produces hash codes of length {@code 059 * minimumBits} or greater 060 */ 061 public static HashFunction goodFastHash(int minimumBits) { 062 int bits = checkPositiveAndMakeMultipleOf32(minimumBits); 063 064 if (bits == 32) { 065 return GOOD_FAST_HASH_FUNCTION_32; 066 } 067 if (bits <= 128) { 068 return GOOD_FAST_HASH_FUNCTION_128; 069 } 070 071 // Otherwise, join together some 128-bit murmur3s 072 int hashFunctionsNeeded = (bits + 127) / 128; 073 HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded]; 074 hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128; 075 int seed = GOOD_FAST_HASH_SEED; 076 for (int i = 1; i < hashFunctionsNeeded; i++) { 077 seed += 1500450271; // a prime; shouldn't matter 078 hashFunctions[i] = murmur3_128(seed); 079 } 080 return new ConcatenatedHashFunction(hashFunctions); 081 } 082 083 /** 084 * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything 085 * dependent on the hash codes they produce will fail sooner. 086 */ 087 private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis(); 088 089 /** Returned by {@link #goodFastHash} when {@code minimumBits <= 32}. */ 090 private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED); 091 092 /** Returned by {@link #goodFastHash} when {@code 32 < minimumBits <= 128}. */ 093 private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED); 094 095 /** 096 * Returns a hash function implementing the 097 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 098 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 099 * using the given seed value. 100 * 101 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 102 */ 103 public static HashFunction murmur3_32(int seed) { 104 return new Murmur3_32HashFunction(seed); 105 } 106 107 /** 108 * Returns a hash function implementing the 109 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 110 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 111 * using a seed value of zero. 112 * 113 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 114 */ 115 public static HashFunction murmur3_32() { 116 return MURMUR3_32; 117 } 118 119 private static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0); 120 121 /** 122 * Returns a hash function implementing the 123 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 124 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 125 * using the given seed value. 126 * 127 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 128 */ 129 public static HashFunction murmur3_128(int seed) { 130 return new Murmur3_128HashFunction(seed); 131 } 132 133 /** 134 * Returns a hash function implementing the 135 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 136 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 137 * using a seed value of zero. 138 * 139 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 140 */ 141 public static HashFunction murmur3_128() { 142 return MURMUR3_128; 143 } 144 145 private static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0); 146 147 /** 148 * Returns a hash function implementing the 149 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 150 * using a seed value of {@code k = 00 01 02 ...}. 151 * 152 * @since 15.0 153 */ 154 public static HashFunction sipHash24() { 155 return SIP_HASH_24; 156 } 157 158 private static final HashFunction SIP_HASH_24 = 159 new SipHashFunction(2, 4, 0x0706050403020100L, 0x0f0e0d0c0b0a0908L); 160 161 /** 162 * Returns a hash function implementing the 163 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 164 * using the given seed. 165 * 166 * @since 15.0 167 */ 168 public static HashFunction sipHash24(long k0, long k1) { 169 return new SipHashFunction(2, 4, k0, k1); 170 } 171 172 /** 173 * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to 174 * the MD5 {@link MessageDigest}. 175 */ 176 public static HashFunction md5() { 177 return MD5; 178 } 179 180 private static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()"); 181 182 /** 183 * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the 184 * SHA-1 {@link MessageDigest}. 185 */ 186 public static HashFunction sha1() { 187 return SHA_1; 188 } 189 190 private static final HashFunction SHA_1 = 191 new MessageDigestHashFunction("SHA-1", "Hashing.sha1()"); 192 193 /** 194 * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to 195 * the SHA-256 {@link MessageDigest}. 196 */ 197 public static HashFunction sha256() { 198 return SHA_256; 199 } 200 201 private static final HashFunction SHA_256 = 202 new MessageDigestHashFunction("SHA-256", "Hashing.sha256()"); 203 204 /** 205 * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the 206 * SHA-512 {@link MessageDigest}. 207 */ 208 public static HashFunction sha512() { 209 return SHA_512; 210 } 211 212 private static final HashFunction SHA_512 = 213 new MessageDigestHashFunction("SHA-512", "Hashing.sha512()"); 214 215 /** 216 * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating 217 * to the {@link CRC32} {@link Checksum}. 218 * 219 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 220 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 221 * 222 * @since 14.0 223 */ 224 public static HashFunction crc32() { 225 return CRC_32; 226 } 227 228 private static final HashFunction CRC_32 = 229 checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()"); 230 231 /** 232 * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by 233 * delegating to the {@link Adler32} {@link Checksum}. 234 * 235 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 236 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 237 * 238 * @since 14.0 239 */ 240 public static HashFunction adler32() { 241 return ADLER_32; 242 } 243 244 private static final HashFunction ADLER_32 = 245 checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()"); 246 247 private static HashFunction checksumHashFunction(ChecksumType type, String toString) { 248 return new ChecksumHashFunction(type, type.bits, toString); 249 } 250 251 enum ChecksumType implements Supplier<Checksum> { 252 CRC_32(32) { 253 @Override 254 public Checksum get() { 255 return new CRC32(); 256 } 257 }, 258 ADLER_32(32) { 259 @Override 260 public Checksum get() { 261 return new Adler32(); 262 } 263 }; 264 265 private final int bits; 266 267 ChecksumType(int bits) { 268 this.bits = bits; 269 } 270 271 @Override 272 public abstract Checksum get(); 273 } 274 275 // Lazy initialization holder class idiom. 276 // TODO(user): Investigate whether we need to still use this idiom now that we have a fallback 277 // option for our use of Unsafe. 278 279 /** 280 * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform 281 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 282 * {@code consistentHash(h, n)} equals: 283 * 284 * <ul> 285 * <li>{@code n - 1}, with approximate probability {@code 1/n} 286 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 287 * </ul> 288 * 289 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 290 * article on consistent hashing</a> for more information. 291 */ 292 public static int consistentHash(HashCode hashCode, int buckets) { 293 return consistentHash(hashCode.padToLong(), buckets); 294 } 295 296 /** 297 * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform 298 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 299 * {@code consistentHash(h, n)} equals: 300 * 301 * <ul> 302 * <li>{@code n - 1}, with approximate probability {@code 1/n} 303 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 304 * </ul> 305 * 306 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 307 * article on consistent hashing</a> for more information. 308 */ 309 public static int consistentHash(long input, int buckets) { 310 checkArgument(buckets > 0, "buckets must be positive: %s", buckets); 311 LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input); 312 int candidate = 0; 313 int next; 314 315 // Jump from bucket to bucket until we go out of range 316 while (true) { 317 next = (int) ((candidate + 1) / generator.nextDouble()); 318 if (next >= 0 && next < buckets) { 319 candidate = next; 320 } else { 321 return candidate; 322 } 323 } 324 } 325 326 /** 327 * Returns a hash code, having the same bit length as each of the input hash codes, 328 * that combines the information of these hash codes in an ordered fashion. That 329 * is, whenever two equal hash codes are produced by two calls to this method, it 330 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 331 * input hash codes in the <i>same</i> order. 332 * 333 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 334 * do not all have the same bit length 335 */ 336 public static HashCode combineOrdered(Iterable<HashCode> hashCodes) { 337 Iterator<HashCode> iterator = hashCodes.iterator(); 338 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 339 int bits = iterator.next().bits(); 340 byte[] resultBytes = new byte[bits / 8]; 341 for (HashCode hashCode : hashCodes) { 342 byte[] nextBytes = hashCode.asBytes(); 343 checkArgument(nextBytes.length == resultBytes.length, 344 "All hashcodes must have the same bit length."); 345 for (int i = 0; i < nextBytes.length; i++) { 346 resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]); 347 } 348 } 349 return HashCode.fromBytesNoCopy(resultBytes); 350 } 351 352 /** 353 * Returns a hash code, having the same bit length as each of the input hash codes, 354 * that combines the information of these hash codes in an unordered fashion. That 355 * is, whenever two equal hash codes are produced by two calls to this method, it 356 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 357 * input hash codes in <i>some</i> order. 358 * 359 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 360 * do not all have the same bit length 361 */ 362 public static HashCode combineUnordered(Iterable<HashCode> hashCodes) { 363 Iterator<HashCode> iterator = hashCodes.iterator(); 364 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 365 byte[] resultBytes = new byte[iterator.next().bits() / 8]; 366 for (HashCode hashCode : hashCodes) { 367 byte[] nextBytes = hashCode.asBytes(); 368 checkArgument(nextBytes.length == resultBytes.length, 369 "All hashcodes must have the same bit length."); 370 for (int i = 0; i < nextBytes.length; i++) { 371 resultBytes[i] += nextBytes[i]; 372 } 373 } 374 return HashCode.fromBytesNoCopy(resultBytes); 375 } 376 377 /** 378 * Checks that the passed argument is positive, and ceils it to a multiple of 32. 379 */ 380 static int checkPositiveAndMakeMultipleOf32(int bits) { 381 checkArgument(bits > 0, "Number of bits must be positive"); 382 return (bits + 31) & ~31; 383 } 384 385 // TODO(kevinb): Maybe expose this class via a static Hashing method? 386 @VisibleForTesting 387 static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction { 388 private final int bits; 389 390 ConcatenatedHashFunction(HashFunction... functions) { 391 super(functions); 392 int bitSum = 0; 393 for (HashFunction function : functions) { 394 bitSum += function.bits(); 395 } 396 this.bits = bitSum; 397 } 398 399 @Override 400 HashCode makeHash(Hasher[] hashers) { 401 // TODO(user): Get rid of the ByteBuffer here? 402 byte[] bytes = new byte[bits / 8]; 403 ByteBuffer buffer = ByteBuffer.wrap(bytes); 404 for (Hasher hasher : hashers) { 405 buffer.put(hasher.hash().asBytes()); 406 } 407 return HashCode.fromBytesNoCopy(bytes); 408 } 409 410 @Override 411 public int bits() { 412 return bits; 413 } 414 415 @Override 416 public boolean equals(@Nullable Object object) { 417 if (object instanceof ConcatenatedHashFunction) { 418 ConcatenatedHashFunction other = (ConcatenatedHashFunction) object; 419 if (bits != other.bits || functions.length != other.functions.length) { 420 return false; 421 } 422 for (int i = 0; i < functions.length; i++) { 423 if (!functions[i].equals(other.functions[i])) { 424 return false; 425 } 426 } 427 return true; 428 } 429 return false; 430 } 431 432 @Override 433 public int hashCode() { 434 int hash = bits; 435 for (HashFunction function : functions) { 436 hash ^= function.hashCode(); 437 } 438 return hash; 439 } 440 } 441 442 /** 443 * Linear CongruentialGenerator to use for consistent hashing. 444 * See http://en.wikipedia.org/wiki/Linear_congruential_generator 445 */ 446 private static final class LinearCongruentialGenerator { 447 private long state; 448 449 public LinearCongruentialGenerator(long seed) { 450 this.state = seed; 451 } 452 453 public double nextDouble() { 454 state = 2862933555777941757L * state + 1; 455 return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31); 456 } 457 } 458 459 private Hashing() {} 460}