001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.VisibleForTesting; 021import com.google.common.base.Supplier; 022 023import java.security.MessageDigest; 024import java.util.Iterator; 025import java.util.zip.Adler32; 026import java.util.zip.CRC32; 027import java.util.zip.Checksum; 028 029import javax.annotation.Nullable; 030 031/** 032 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related 033 * utilities. 034 * 035 * <p>A comparison of the various hash functions can be found 036 * <a href="http://goo.gl/jS7HH">here</a>. 037 * 038 * @author Kevin Bourrillion 039 * @author Dimitris Andreou 040 * @author Kurt Alfred Kluever 041 * @since 11.0 042 */ 043@Beta 044public final class Hashing { 045 /** 046 * Returns a general-purpose, <b>temporary-use</b>, non-cryptographic hash function. The algorithm 047 * the returned function implements is unspecified and subject to change without notice. 048 * 049 * <p><b>Warning:</b> a new random seed for these functions is chosen each time the {@code 050 * Hashing} class is loaded. <b>Do not use this method</b> if hash codes may escape the current 051 * process in any way, for example being sent over RPC, or saved to disk. 052 * 053 * <p>Repeated calls to this method on the same loaded {@code Hashing} class, using the same value 054 * for {@code minimumBits}, will return identically-behaving {@link HashFunction} instances. 055 * 056 * @param minimumBits a positive integer (can be arbitrarily large) 057 * @return a hash function, described above, that produces hash codes of length {@code 058 * minimumBits} or greater 059 */ 060 public static HashFunction goodFastHash(int minimumBits) { 061 int bits = checkPositiveAndMakeMultipleOf32(minimumBits); 062 063 if (bits == 32) { 064 return Murmur3_32Holder.GOOD_FAST_HASH_FUNCTION_32; 065 } 066 if (bits <= 128) { 067 return Murmur3_128Holder.GOOD_FAST_HASH_FUNCTION_128; 068 } 069 070 // Otherwise, join together some 128-bit murmur3s 071 int hashFunctionsNeeded = (bits + 127) / 128; 072 HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded]; 073 hashFunctions[0] = Murmur3_128Holder.GOOD_FAST_HASH_FUNCTION_128; 074 int seed = GOOD_FAST_HASH_SEED; 075 for (int i = 1; i < hashFunctionsNeeded; i++) { 076 seed += 1500450271; // a prime; shouldn't matter 077 hashFunctions[i] = murmur3_128(seed); 078 } 079 return new ConcatenatedHashFunction(hashFunctions); 080 } 081 082 /** 083 * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything 084 * dependent on the hash codes they produce will fail sooner. 085 */ 086 private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis(); 087 088 /** 089 * Returns a hash function implementing the 090 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 091 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 092 * using the given seed value. 093 * 094 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 095 */ 096 public static HashFunction murmur3_32(int seed) { 097 return new Murmur3_32HashFunction(seed); 098 } 099 100 /** 101 * Returns a hash function implementing the 102 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 103 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 104 * using a seed value of zero. 105 * 106 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 107 */ 108 public static HashFunction murmur3_32() { 109 return Murmur3_32Holder.MURMUR3_32; 110 } 111 112 private static class Murmur3_32Holder { 113 static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0); 114 115 /** Returned by {@link #goodFastHash} when {@code minimumBits <= 32}. */ 116 static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED); 117 } 118 119 /** 120 * Returns a hash function implementing the 121 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 122 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 123 * using the given seed value. 124 * 125 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 126 */ 127 public static HashFunction murmur3_128(int seed) { 128 return new Murmur3_128HashFunction(seed); 129 } 130 131 /** 132 * Returns a hash function implementing the 133 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 134 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 135 * using a seed value of zero. 136 * 137 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 138 */ 139 public static HashFunction murmur3_128() { 140 return Murmur3_128Holder.MURMUR3_128; 141 } 142 143 private static class Murmur3_128Holder { 144 static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0); 145 146 /** Returned by {@link #goodFastHash} when {@code 32 < minimumBits <= 128}. */ 147 static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED); 148 } 149 150 /** 151 * Returns a hash function implementing the 152 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 153 * using a seed value of {@code k = 00 01 02 ...}. 154 * 155 * @since 15.0 156 */ 157 public static HashFunction sipHash24() { 158 return SipHash24Holder.SIP_HASH_24; 159 } 160 161 private static class SipHash24Holder { 162 static final HashFunction SIP_HASH_24 = 163 new SipHashFunction(2, 4, 0x0706050403020100L, 0x0f0e0d0c0b0a0908L); 164 } 165 166 /** 167 * Returns a hash function implementing the 168 * <a href="https://131002.net/siphash/">64-bit SipHash-2-4 algorithm</a> 169 * using the given seed. 170 * 171 * @since 15.0 172 */ 173 public static HashFunction sipHash24(long k0, long k1) { 174 return new SipHashFunction(2, 4, k0, k1); 175 } 176 177 /** 178 * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to 179 * the MD5 {@link MessageDigest}. 180 */ 181 public static HashFunction md5() { 182 return Md5Holder.MD5; 183 } 184 185 private static class Md5Holder { 186 static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()"); 187 } 188 189 /** 190 * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the 191 * SHA-1 {@link MessageDigest}. 192 */ 193 public static HashFunction sha1() { 194 return Sha1Holder.SHA_1; 195 } 196 197 private static class Sha1Holder { 198 static final HashFunction SHA_1 = 199 new MessageDigestHashFunction("SHA-1", "Hashing.sha1()"); 200 } 201 202 /** 203 * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to 204 * the SHA-256 {@link MessageDigest}. 205 */ 206 public static HashFunction sha256() { 207 return Sha256Holder.SHA_256; 208 } 209 210 private static class Sha256Holder { 211 static final HashFunction SHA_256 = 212 new MessageDigestHashFunction("SHA-256", "Hashing.sha256()"); 213 } 214 215 /** 216 * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the 217 * SHA-512 {@link MessageDigest}. 218 */ 219 public static HashFunction sha512() { 220 return Sha512Holder.SHA_512; 221 } 222 223 private static class Sha512Holder { 224 static final HashFunction SHA_512 = 225 new MessageDigestHashFunction("SHA-512", "Hashing.sha512()"); 226 } 227 228 /** 229 * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating 230 * to the {@link CRC32} {@link Checksum}. 231 * 232 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 233 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 234 * 235 * @since 14.0 236 */ 237 public static HashFunction crc32() { 238 return Crc32Holder.CRC_32; 239 } 240 241 private static class Crc32Holder { 242 static final HashFunction CRC_32 = 243 checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()"); 244 } 245 246 /** 247 * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by 248 * delegating to the {@link Adler32} {@link Checksum}. 249 * 250 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 251 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 252 * 253 * @since 14.0 254 */ 255 public static HashFunction adler32() { 256 return Adler32Holder.ADLER_32; 257 } 258 259 private static class Adler32Holder { 260 static final HashFunction ADLER_32 = 261 checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()"); 262 } 263 264 private static HashFunction checksumHashFunction(ChecksumType type, String toString) { 265 return new ChecksumHashFunction(type, type.bits, toString); 266 } 267 268 enum ChecksumType implements Supplier<Checksum> { 269 CRC_32(32) { 270 @Override 271 public Checksum get() { 272 return new CRC32(); 273 } 274 }, 275 ADLER_32(32) { 276 @Override 277 public Checksum get() { 278 return new Adler32(); 279 } 280 }; 281 282 private final int bits; 283 284 ChecksumType(int bits) { 285 this.bits = bits; 286 } 287 288 @Override 289 public abstract Checksum get(); 290 } 291 292 /** 293 * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform 294 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 295 * {@code consistentHash(h, n)} equals: 296 * 297 * <ul> 298 * <li>{@code n - 1}, with approximate probability {@code 1/n} 299 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 300 * </ul> 301 * 302 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 303 * article on consistent hashing</a> for more information. 304 */ 305 public static int consistentHash(HashCode hashCode, int buckets) { 306 return consistentHash(hashCode.padToLong(), buckets); 307 } 308 309 /** 310 * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform 311 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 312 * {@code consistentHash(h, n)} equals: 313 * 314 * <ul> 315 * <li>{@code n - 1}, with approximate probability {@code 1/n} 316 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 317 * </ul> 318 * 319 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 320 * article on consistent hashing</a> for more information. 321 */ 322 public static int consistentHash(long input, int buckets) { 323 checkArgument(buckets > 0, "buckets must be positive: %s", buckets); 324 LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input); 325 int candidate = 0; 326 int next; 327 328 // Jump from bucket to bucket until we go out of range 329 while (true) { 330 next = (int) ((candidate + 1) / generator.nextDouble()); 331 if (next >= 0 && next < buckets) { 332 candidate = next; 333 } else { 334 return candidate; 335 } 336 } 337 } 338 339 /** 340 * Returns a hash code, having the same bit length as each of the input hash codes, 341 * that combines the information of these hash codes in an ordered fashion. That 342 * is, whenever two equal hash codes are produced by two calls to this method, it 343 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 344 * input hash codes in the <i>same</i> order. 345 * 346 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 347 * do not all have the same bit length 348 */ 349 public static HashCode combineOrdered(Iterable<HashCode> hashCodes) { 350 Iterator<HashCode> iterator = hashCodes.iterator(); 351 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 352 int bits = iterator.next().bits(); 353 byte[] resultBytes = new byte[bits / 8]; 354 for (HashCode hashCode : hashCodes) { 355 byte[] nextBytes = hashCode.asBytes(); 356 checkArgument(nextBytes.length == resultBytes.length, 357 "All hashcodes must have the same bit length."); 358 for (int i = 0; i < nextBytes.length; i++) { 359 resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]); 360 } 361 } 362 return HashCode.fromBytesNoCopy(resultBytes); 363 } 364 365 /** 366 * Returns a hash code, having the same bit length as each of the input hash codes, 367 * that combines the information of these hash codes in an unordered fashion. That 368 * is, whenever two equal hash codes are produced by two calls to this method, it 369 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 370 * input hash codes in <i>some</i> order. 371 * 372 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 373 * do not all have the same bit length 374 */ 375 public static HashCode combineUnordered(Iterable<HashCode> hashCodes) { 376 Iterator<HashCode> iterator = hashCodes.iterator(); 377 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 378 byte[] resultBytes = new byte[iterator.next().bits() / 8]; 379 for (HashCode hashCode : hashCodes) { 380 byte[] nextBytes = hashCode.asBytes(); 381 checkArgument(nextBytes.length == resultBytes.length, 382 "All hashcodes must have the same bit length."); 383 for (int i = 0; i < nextBytes.length; i++) { 384 resultBytes[i] += nextBytes[i]; 385 } 386 } 387 return HashCode.fromBytesNoCopy(resultBytes); 388 } 389 390 /** 391 * Checks that the passed argument is positive, and ceils it to a multiple of 32. 392 */ 393 static int checkPositiveAndMakeMultipleOf32(int bits) { 394 checkArgument(bits > 0, "Number of bits must be positive"); 395 return (bits + 31) & ~31; 396 } 397 398 // TODO(kevinb): Maybe expose this class via a static Hashing method? 399 @VisibleForTesting 400 static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction { 401 private final int bits; 402 403 ConcatenatedHashFunction(HashFunction... functions) { 404 super(functions); 405 int bitSum = 0; 406 for (HashFunction function : functions) { 407 bitSum += function.bits(); 408 } 409 this.bits = bitSum; 410 } 411 412 @Override 413 HashCode makeHash(Hasher[] hashers) { 414 byte[] bytes = new byte[bits / 8]; 415 int i = 0; 416 for (Hasher hasher : hashers) { 417 HashCode newHash = hasher.hash(); 418 i += newHash.writeBytesTo(bytes, i, newHash.bits() / 8); 419 } 420 return HashCode.fromBytesNoCopy(bytes); 421 } 422 423 @Override 424 public int bits() { 425 return bits; 426 } 427 428 @Override 429 public boolean equals(@Nullable Object object) { 430 if (object instanceof ConcatenatedHashFunction) { 431 ConcatenatedHashFunction other = (ConcatenatedHashFunction) object; 432 if (bits != other.bits || functions.length != other.functions.length) { 433 return false; 434 } 435 for (int i = 0; i < functions.length; i++) { 436 if (!functions[i].equals(other.functions[i])) { 437 return false; 438 } 439 } 440 return true; 441 } 442 return false; 443 } 444 445 @Override 446 public int hashCode() { 447 int hash = bits; 448 for (HashFunction function : functions) { 449 hash ^= function.hashCode(); 450 } 451 return hash; 452 } 453 } 454 455 /** 456 * Linear CongruentialGenerator to use for consistent hashing. 457 * See http://en.wikipedia.org/wiki/Linear_congruential_generator 458 */ 459 private static final class LinearCongruentialGenerator { 460 private long state; 461 462 public LinearCongruentialGenerator(long seed) { 463 this.state = seed; 464 } 465 466 public double nextDouble() { 467 state = 2862933555777941757L * state + 1; 468 return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31); 469 } 470 } 471 472 private Hashing() {} 473}