001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019 020import com.google.common.annotations.Beta; 021import com.google.common.base.Supplier; 022 023import java.security.MessageDigest; 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.Iterator; 027import java.util.List; 028import java.util.zip.Adler32; 029import java.util.zip.CRC32; 030import java.util.zip.Checksum; 031 032import javax.annotation.CheckReturnValue; 033import javax.annotation.Nullable; 034 035/** 036 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related 037 * utilities. 038 * 039 * <p>A comparison of the various hash functions can be found 040 * <a href="http://goo.gl/jS7HH">here</a>. 041 * 042 * @author Kevin Bourrillion 043 * @author Dimitris Andreou 044 * @author Kurt Alfred Kluever 045 * @since 11.0 046 */ 047@Beta 048@CheckReturnValue 049public final class Hashing { 050 /** 051 * Returns a general-purpose, <b>temporary-use</b>, non-cryptographic hash function. The algorithm 052 * the returned function implements is unspecified and subject to change without notice. 053 * 054 * <p><b>Warning:</b> a new random seed for these functions is chosen each time the {@code 055 * Hashing} class is loaded. <b>Do not use this method</b> if hash codes may escape the current 056 * process in any way, for example being sent over RPC, or saved to disk. 057 * 058 * <p>Repeated calls to this method on the same loaded {@code Hashing} class, using the same value 059 * for {@code minimumBits}, will return identically-behaving {@link HashFunction} instances. 060 * 061 * @param minimumBits a positive integer (can be arbitrarily large) 062 * @return a hash function, described above, that produces hash codes of length {@code 063 * minimumBits} or greater 064 */ 065 public static HashFunction goodFastHash(int minimumBits) { 066 int bits = checkPositiveAndMakeMultipleOf32(minimumBits); 067 068 if (bits == 32) { 069 return Murmur3_32Holder.GOOD_FAST_HASH_FUNCTION_32; 070 } 071 if (bits <= 128) { 072 return Murmur3_128Holder.GOOD_FAST_HASH_FUNCTION_128; 073 } 074 075 // Otherwise, join together some 128-bit murmur3s 076 int hashFunctionsNeeded = (bits + 127) / 128; 077 HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded]; 078 hashFunctions[0] = Murmur3_128Holder.GOOD_FAST_HASH_FUNCTION_128; 079 int seed = GOOD_FAST_HASH_SEED; 080 for (int i = 1; i < hashFunctionsNeeded; i++) { 081 seed += 1500450271; // a prime; shouldn't matter 082 hashFunctions[i] = murmur3_128(seed); 083 } 084 return new ConcatenatedHashFunction(hashFunctions); 085 } 086 087 /** 088 * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything 089 * dependent on the hash codes they produce will fail sooner. 090 */ 091 private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis(); 092 093 /** 094 * Returns a hash function implementing the 095 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 096 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 097 * using the given seed value. 098 * 099 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 100 */ 101 public static HashFunction murmur3_32(int seed) { 102 return new Murmur3_32HashFunction(seed); 103 } 104 105 /** 106 * Returns a hash function implementing the 107 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 108 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 109 * using a seed value of zero. 110 * 111 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 112 */ 113 public static HashFunction murmur3_32() { 114 return Murmur3_32Holder.MURMUR3_32; 115 } 116 117 private static class Murmur3_32Holder { 118 static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0); 119 120 /** Returned by {@link #goodFastHash} when {@code minimumBits <= 32}. */ 121 static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED); 122 } 123 124 /** 125 * Returns a hash function implementing the 126 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 127 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 128 * using the given seed value. 129 * 130 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 131 */ 132 public static HashFunction murmur3_128(int seed) { 133 return new Murmur3_128HashFunction(seed); 134 } 135 136 /** 137 * Returns a hash function implementing the 138 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 139 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 140 * using a seed value of zero. 141 * 142 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 143 */ 144 public static HashFunction murmur3_128() { 145 return Murmur3_128Holder.MURMUR3_128; 146 } 147 148 private static class Murmur3_128Holder { 149 static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0); 150 151 /** Returned by {@link #goodFastHash} when {@code 32 < minimumBits <= 128}. */ 152 static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED); 153 } 154 155 /** 156 * Returns a hash function implementing the 157 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 158 * using a seed value of {@code k = 00 01 02 ...}. 159 * 160 * @since 15.0 161 */ 162 public static HashFunction sipHash24() { 163 return SipHash24Holder.SIP_HASH_24; 164 } 165 166 private static class SipHash24Holder { 167 static final HashFunction SIP_HASH_24 = 168 new SipHashFunction(2, 4, 0x0706050403020100L, 0x0f0e0d0c0b0a0908L); 169 } 170 171 /** 172 * Returns a hash function implementing the 173 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 174 * using the given seed. 175 * 176 * @since 15.0 177 */ 178 public static HashFunction sipHash24(long k0, long k1) { 179 return new SipHashFunction(2, 4, k0, k1); 180 } 181 182 /** 183 * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to 184 * the MD5 {@link MessageDigest}. 185 */ 186 public static HashFunction md5() { 187 return Md5Holder.MD5; 188 } 189 190 private static class Md5Holder { 191 static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()"); 192 } 193 194 /** 195 * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the 196 * SHA-1 {@link MessageDigest}. 197 */ 198 public static HashFunction sha1() { 199 return Sha1Holder.SHA_1; 200 } 201 202 private static class Sha1Holder { 203 static final HashFunction SHA_1 = new MessageDigestHashFunction("SHA-1", "Hashing.sha1()"); 204 } 205 206 /** 207 * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to 208 * the SHA-256 {@link MessageDigest}. 209 */ 210 public static HashFunction sha256() { 211 return Sha256Holder.SHA_256; 212 } 213 214 private static class Sha256Holder { 215 static final HashFunction SHA_256 = 216 new MessageDigestHashFunction("SHA-256", "Hashing.sha256()"); 217 } 218 219 /** 220 * Returns a hash function implementing the SHA-384 algorithm (384 hash bits) by delegating to 221 * the SHA-384 {@link MessageDigest}. 222 * 223 * @since 19.0 224 */ 225 public static HashFunction sha384() { 226 return Sha384Holder.SHA_384; 227 } 228 229 private static class Sha384Holder { 230 static final HashFunction SHA_384 = 231 new MessageDigestHashFunction("SHA-384", "Hashing.sha384()"); 232 } 233 234 /** 235 * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the 236 * SHA-512 {@link MessageDigest}. 237 */ 238 public static HashFunction sha512() { 239 return Sha512Holder.SHA_512; 240 } 241 242 private static class Sha512Holder { 243 static final HashFunction SHA_512 = 244 new MessageDigestHashFunction("SHA-512", "Hashing.sha512()"); 245 } 246 247 /** 248 * Returns a hash function implementing the CRC32C checksum algorithm (32 hash bits) as described 249 * by RFC 3720, Section 12.1. 250 * 251 * @since 18.0 252 */ 253 public static HashFunction crc32c() { 254 return Crc32cHolder.CRC_32_C; 255 } 256 257 private static final class Crc32cHolder { 258 static final HashFunction CRC_32_C = new Crc32cHashFunction(); 259 } 260 261 /** 262 * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating 263 * to the {@link CRC32} {@link Checksum}. 264 * 265 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 266 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 267 * 268 * @since 14.0 269 */ 270 public static HashFunction crc32() { 271 return Crc32Holder.CRC_32; 272 } 273 274 private static class Crc32Holder { 275 static final HashFunction CRC_32 = checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()"); 276 } 277 278 /** 279 * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by 280 * delegating to the {@link Adler32} {@link Checksum}. 281 * 282 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 283 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 284 * 285 * @since 14.0 286 */ 287 public static HashFunction adler32() { 288 return Adler32Holder.ADLER_32; 289 } 290 291 private static class Adler32Holder { 292 static final HashFunction ADLER_32 = 293 checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()"); 294 } 295 296 private static HashFunction checksumHashFunction(ChecksumType type, String toString) { 297 return new ChecksumHashFunction(type, type.bits, toString); 298 } 299 300 enum ChecksumType implements Supplier<Checksum> { 301 CRC_32(32) { 302 @Override 303 public Checksum get() { 304 return new CRC32(); 305 } 306 }, 307 ADLER_32(32) { 308 @Override 309 public Checksum get() { 310 return new Adler32(); 311 } 312 }; 313 314 private final int bits; 315 316 ChecksumType(int bits) { 317 this.bits = bits; 318 } 319 320 @Override 321 public abstract Checksum get(); 322 } 323 324 /** 325 * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform manner 326 * that minimizes the need for remapping as {@code buckets} grows. That is, {@code 327 * consistentHash(h, n)} equals: 328 * 329 * <ul> 330 * <li>{@code n - 1}, with approximate probability {@code 1/n} 331 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 332 * </ul> 333 * 334 * <p>This method is suitable for the common use case of dividing work among buckets that meet the 335 * following conditions: 336 * 337 * <ul> 338 * <li>You want to assign the same fraction of inputs to each bucket. 339 * <li>When you reduce the number of buckets, you can accept that the most recently added buckets 340 * will be removed first. More concretely, if you are dividing traffic among tasks, you can 341 * decrease the number of tasks from 15 and 10, killing off the final 5 tasks, and {@code 342 * consistentHash} will handle it. If, however, you are dividing traffic among servers {@code 343 * alpha}, {@code bravo}, and {@code charlie} and you occasionally need to take each of the 344 * servers offline, {@code consistentHash} will be a poor fit: It provides no way for you to 345 * specify which of the three buckets is disappearing. Thus, if your buckets change from {@code 346 * [alpha, bravo, charlie]} to {@code [bravo, charlie]}, it will assign all the old {@code alpha} 347 * traffic to {@code bravo} and all the old {@code bravo} traffic to {@code charlie}, rather than 348 * letting {@code bravo} keep its traffic. 349 * </ul> 350 * 351 * 352 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">Wikipedia article on 353 * consistent hashing</a> for more information. 354 */ 355 public static int consistentHash(HashCode hashCode, int buckets) { 356 return consistentHash(hashCode.padToLong(), buckets); 357 } 358 359 /** 360 * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform manner that 361 * minimizes the need for remapping as {@code buckets} grows. That is, {@code consistentHash(h, 362 * n)} equals: 363 * 364 * <ul> 365 * <li>{@code n - 1}, with approximate probability {@code 1/n} 366 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 367 * </ul> 368 * 369 * <p>This method is suitable for the common use case of dividing work among buckets that meet the 370 * following conditions: 371 * 372 * <ul> 373 * <li>You want to assign the same fraction of inputs to each bucket. 374 * <li>When you reduce the number of buckets, you can accept that the most recently added buckets 375 * will be removed first. More concretely, if you are dividing traffic among tasks, you can 376 * decrease the number of tasks from 15 and 10, killing off the final 5 tasks, and {@code 377 * consistentHash} will handle it. If, however, you are dividing traffic among servers {@code 378 * alpha}, {@code bravo}, and {@code charlie} and you occasionally need to take each of the 379 * servers offline, {@code consistentHash} will be a poor fit: It provides no way for you to 380 * specify which of the three buckets is disappearing. Thus, if your buckets change from {@code 381 * [alpha, bravo, charlie]} to {@code [bravo, charlie]}, it will assign all the old {@code alpha} 382 * traffic to {@code bravo} and all the old {@code bravo} traffic to {@code charlie}, rather than 383 * letting {@code bravo} keep its traffic. 384 * </ul> 385 * 386 * 387 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">Wikipedia article on 388 * consistent hashing</a> for more information. 389 */ 390 public static int consistentHash(long input, int buckets) { 391 checkArgument(buckets > 0, "buckets must be positive: %s", buckets); 392 LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input); 393 int candidate = 0; 394 int next; 395 396 // Jump from bucket to bucket until we go out of range 397 while (true) { 398 next = (int) ((candidate + 1) / generator.nextDouble()); 399 if (next >= 0 && next < buckets) { 400 candidate = next; 401 } else { 402 return candidate; 403 } 404 } 405 } 406 407 /** 408 * Returns a hash code, having the same bit length as each of the input hash codes, 409 * that combines the information of these hash codes in an ordered fashion. That 410 * is, whenever two equal hash codes are produced by two calls to this method, it 411 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 412 * input hash codes in the <i>same</i> order. 413 * 414 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 415 * do not all have the same bit length 416 */ 417 public static HashCode combineOrdered(Iterable<HashCode> hashCodes) { 418 Iterator<HashCode> iterator = hashCodes.iterator(); 419 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 420 int bits = iterator.next().bits(); 421 byte[] resultBytes = new byte[bits / 8]; 422 for (HashCode hashCode : hashCodes) { 423 byte[] nextBytes = hashCode.asBytes(); 424 checkArgument( 425 nextBytes.length == resultBytes.length, "All hashcodes must have the same bit length."); 426 for (int i = 0; i < nextBytes.length; i++) { 427 resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]); 428 } 429 } 430 return HashCode.fromBytesNoCopy(resultBytes); 431 } 432 433 /** 434 * Returns a hash code, having the same bit length as each of the input hash codes, 435 * that combines the information of these hash codes in an unordered fashion. That 436 * is, whenever two equal hash codes are produced by two calls to this method, it 437 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 438 * input hash codes in <i>some</i> order. 439 * 440 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 441 * do not all have the same bit length 442 */ 443 public static HashCode combineUnordered(Iterable<HashCode> hashCodes) { 444 Iterator<HashCode> iterator = hashCodes.iterator(); 445 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 446 byte[] resultBytes = new byte[iterator.next().bits() / 8]; 447 for (HashCode hashCode : hashCodes) { 448 byte[] nextBytes = hashCode.asBytes(); 449 checkArgument( 450 nextBytes.length == resultBytes.length, "All hashcodes must have the same bit length."); 451 for (int i = 0; i < nextBytes.length; i++) { 452 resultBytes[i] += nextBytes[i]; 453 } 454 } 455 return HashCode.fromBytesNoCopy(resultBytes); 456 } 457 458 /** 459 * Checks that the passed argument is positive, and ceils it to a multiple of 32. 460 */ 461 static int checkPositiveAndMakeMultipleOf32(int bits) { 462 checkArgument(bits > 0, "Number of bits must be positive"); 463 return (bits + 31) & ~31; 464 } 465 466 /** 467 * Returns a hash function which computes its hash code by concatenating the hash codes of the 468 * underlying hash functions together. This can be useful if you need to generate hash codes 469 * of a specific length. 470 * 471 * <p>For example, if you need 1024-bit hash codes, you could join two {@link Hashing#sha512} 472 * hash functions together: {@code Hashing.concatenating(Hashing.sha512(), Hashing.sha512())}. 473 * 474 * @since 19.0 475 */ 476 public static HashFunction concatenating( 477 HashFunction first, HashFunction second, HashFunction... rest) { 478 // We can't use Lists.asList() here because there's no hash->collect dependency 479 List<HashFunction> list = new ArrayList<HashFunction>(); 480 list.add(first); 481 list.add(second); 482 for (HashFunction hashFunc : rest) { 483 list.add(hashFunc); 484 } 485 return new ConcatenatedHashFunction(list.toArray(new HashFunction[0])); 486 } 487 488 /** 489 * Returns a hash function which computes its hash code by concatenating the hash codes of the 490 * underlying hash functions together. This can be useful if you need to generate hash codes 491 * of a specific length. 492 * 493 * <p>For example, if you need 1024-bit hash codes, you could join two {@link Hashing#sha512} 494 * hash functions together: {@code Hashing.concatenating(Hashing.sha512(), Hashing.sha512())}. 495 * 496 * @since 19.0 497 */ 498 public static HashFunction concatenating(Iterable<HashFunction> hashFunctions) { 499 checkNotNull(hashFunctions); 500 // We can't use Iterables.toArray() here because there's no hash->collect dependency 501 List<HashFunction> list = new ArrayList<HashFunction>(); 502 for (HashFunction hashFunction : hashFunctions) { 503 list.add(hashFunction); 504 } 505 checkArgument(list.size() > 0, "number of hash functions (%s) must be > 0", list.size()); 506 return new ConcatenatedHashFunction(list.toArray(new HashFunction[0])); 507 } 508 509 private static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction { 510 private final int bits; 511 512 private ConcatenatedHashFunction(HashFunction... functions) { 513 super(functions); 514 int bitSum = 0; 515 for (HashFunction function : functions) { 516 bitSum += function.bits(); 517 checkArgument( 518 function.bits() % 8 == 0, 519 "the number of bits (%s) in hashFunction (%s) must be divisible by 8", 520 function.bits(), 521 function); 522 } 523 this.bits = bitSum; 524 } 525 526 @Override 527 HashCode makeHash(Hasher[] hashers) { 528 byte[] bytes = new byte[bits / 8]; 529 int i = 0; 530 for (Hasher hasher : hashers) { 531 HashCode newHash = hasher.hash(); 532 i += newHash.writeBytesTo(bytes, i, newHash.bits() / 8); 533 } 534 return HashCode.fromBytesNoCopy(bytes); 535 } 536 537 @Override 538 public int bits() { 539 return bits; 540 } 541 542 @Override 543 public boolean equals(@Nullable Object object) { 544 if (object instanceof ConcatenatedHashFunction) { 545 ConcatenatedHashFunction other = (ConcatenatedHashFunction) object; 546 return Arrays.equals(functions, other.functions); 547 } 548 return false; 549 } 550 551 @Override 552 public int hashCode() { 553 return Arrays.hashCode(functions) * 31 + bits; 554 } 555 } 556 557 /** 558 * Linear CongruentialGenerator to use for consistent hashing. 559 * See http://en.wikipedia.org/wiki/Linear_congruential_generator 560 */ 561 private static final class LinearCongruentialGenerator { 562 private long state; 563 564 public LinearCongruentialGenerator(long seed) { 565 this.state = seed; 566 } 567 568 public double nextDouble() { 569 state = 2862933555777941757L * state + 1; 570 return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31); 571 } 572 } 573 574 private Hashing() {} 575}