001 /*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015 package com.google.common.hash;
016
017 import static com.google.common.base.Preconditions.checkArgument;
018
019 import com.google.common.annotations.Beta;
020 import com.google.common.annotations.VisibleForTesting;
021 import com.google.common.primitives.UnsignedInts;
022
023 import java.nio.ByteBuffer;
024 import java.security.MessageDigest;
025 import java.util.Iterator;
026
027 /**
028 * Static methods to obtain {@link HashFunction} instances, and other static
029 * hashing-related utilities.
030 *
031 * @author Kevin Bourrillion
032 * @author Dimitris Andreou
033 * @author Kurt Alfred Kluever
034 * @since 11.0
035 */
036 @Beta
037 public final class Hashing {
038 private Hashing() {}
039
040 /**
041 * Used to randomize goodFastHash() instances, so that programs which persist anything
042 * dependent on hashcodes of those, will fail sooner than later.
043 */
044 private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
045
046 /**
047 * Returns a general-purpose, <b>non-cryptographic-strength</b>, streaming hash function that
048 * produces hash codes of length at least {@code minimumBits}. Users without specific
049 * compatibility requirements and who do not persist the hash codes are encouraged to
050 * choose this hash function.
051 *
052 * <p><b>Warning: the implementation is unspecified and is subject to change.</b>
053 *
054 * @throws IllegalArgumentException if {@code minimumBits} is not positive
055 */
056 public static HashFunction goodFastHash(int minimumBits) {
057 int bits = checkPositiveAndMakeMultipleOf32(minimumBits);
058
059 if (bits == 32) {
060 return murmur3_32(GOOD_FAST_HASH_SEED);
061 } else if (bits <= 128) {
062 return murmur3_128(GOOD_FAST_HASH_SEED);
063 } else {
064 // Join some 128-bit murmur3s
065 int hashFunctionsNeeded = (bits + 127) / 128;
066 HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded];
067 int seed = GOOD_FAST_HASH_SEED;
068 for (int i = 0; i < hashFunctionsNeeded; i++) {
069 hashFunctions[i] = murmur3_128(seed);
070 seed += 1500450271; // a prime; shouldn't matter
071 }
072 return new ConcatenatedHashFunction(hashFunctions);
073 }
074 }
075
076 /**
077 * Returns a hash function implementing the
078 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">32-bit murmur3
079 * algorithm</a> (little-endian variant), using the given seed value.
080 */
081 public static HashFunction murmur3_32(int seed) {
082 return new Murmur3_32HashFunction(seed);
083 }
084
085 /**
086 * Returns a hash function implementing the
087 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">32-bit murmur3
088 * algorithm</a> (little-endian variant), using a seed value of zero.
089 */
090 public static HashFunction murmur3_32() {
091 return MURMUR3_32;
092 }
093
094 private static final Murmur3_32HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
095
096 /**
097 * Returns a hash function implementing the
098 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
099 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), using the given seed
100 * value.
101 */
102 public static HashFunction murmur3_128(int seed) {
103 return new Murmur3_128HashFunction(seed);
104 }
105
106 /**
107 * Returns a hash function implementing the
108 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
109 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), using a seed value
110 * of zero.
111 */
112 public static HashFunction murmur3_128() {
113 return MURMUR3_128;
114 }
115
116 private static final Murmur3_128HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0);
117
118 /**
119 * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to
120 * the MD5 {@link MessageDigest}.
121 */
122 public static HashFunction md5() {
123 return MD5;
124 }
125
126 private static final HashFunction MD5 = new MessageDigestHashFunction("MD5");
127
128 /**
129 * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the
130 * SHA-1 {@link MessageDigest}.
131 */
132 public static HashFunction sha1() {
133 return SHA_1;
134 }
135
136 private static final HashFunction SHA_1 = new MessageDigestHashFunction("SHA-1");
137
138 /**
139 * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to
140 * the SHA-256 {@link MessageDigest}.
141 */
142 public static HashFunction sha256() {
143 return SHA_256;
144 }
145
146 private static final HashFunction SHA_256 = new MessageDigestHashFunction("SHA-256");
147
148 /**
149 * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the
150 * SHA-512 {@link MessageDigest}.
151 */
152 public static HashFunction sha512() {
153 return SHA_512;
154 }
155
156 private static final HashFunction SHA_512 = new MessageDigestHashFunction("SHA-512");
157
158 /**
159 * If {@code hashCode} has enough bits, returns {@code hashCode.asLong()}, otherwise
160 * returns a {@code long} value with {@code hashCode.asInt()} as the least-significant
161 * four bytes and {@code 0x00} as each of the most-significant four bytes.
162 */
163 public static long padToLong(HashCode hashCode) {
164 return (hashCode.bits() < 64) ? UnsignedInts.toLong(hashCode.asInt()) : hashCode.asLong();
165 }
166
167 /**
168 * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform
169 * manner that minimizes the need for remapping as {@code buckets} grows. That is,
170 * {@code consistentHash(h, n)} equals:
171 *
172 * <ul>
173 * <li>{@code n - 1}, with approximate probability {@code 1/n}
174 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
175 * </ul>
176 *
177 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
178 * article on consistent hashing</a> for more information.
179 */
180 public static int consistentHash(HashCode hashCode, int buckets) {
181 return consistentHash(padToLong(hashCode), buckets);
182 }
183
184 /**
185 * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform
186 * manner that minimizes the need for remapping as {@code buckets} grows. That is,
187 * {@code consistentHash(h, n)} equals:
188 *
189 * <ul>
190 * <li>{@code n - 1}, with approximate probability {@code 1/n}
191 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
192 * </ul>
193 *
194 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
195 * article on consistent hashing</a> for more information.
196 */
197 public static int consistentHash(long input, int buckets) {
198 checkArgument(buckets > 0, "buckets must be positive: %s", buckets);
199 long h = input;
200 int candidate = 0;
201 int next;
202
203 // Jump from bucket to bucket until we go out of range
204 while (true) {
205 // See http://en.wikipedia.org/wiki/Linear_congruential_generator
206 // These values for a and m come from the C++ version of this function.
207 h = 2862933555777941757L * h + 1;
208 double inv = 0x1.0p31 / ((int) (h >>> 33) + 1);
209 next = (int) ((candidate + 1) * inv);
210
211 if (next >= 0 && next < buckets) {
212 candidate = next;
213 } else {
214 return candidate;
215 }
216 }
217 }
218
219 /**
220 * Returns a hash code, having the same bit length as each of the input hash codes,
221 * that combines the information of these hash codes in an ordered fashion. That
222 * is, whenever two equal hash codes are produced by two calls to this method, it
223 * is <i>as likely as possible</i> that each was computed from the <i>same</i>
224 * input hash codes in the <i>same</i> order.
225 *
226 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
227 * do not all have the same bit length
228 */
229 public static HashCode combineOrdered(Iterable<HashCode> hashCodes) {
230 Iterator<HashCode> iterator = hashCodes.iterator();
231 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
232 int bits = iterator.next().bits();
233 byte[] resultBytes = new byte[bits / 8];
234 for (HashCode hashCode : hashCodes) {
235 byte[] nextBytes = hashCode.asBytes();
236 checkArgument(nextBytes.length == resultBytes.length,
237 "All hashcodes must have the same bit length.");
238 for (int i = 0; i < nextBytes.length; i++) {
239 resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]);
240 }
241 }
242 return HashCodes.fromBytesNoCopy(resultBytes);
243 }
244
245 /**
246 * Returns a hash code, having the same bit length as each of the input hash codes,
247 * that combines the information of these hash codes in an unordered fashion. That
248 * is, whenever two equal hash codes are produced by two calls to this method, it
249 * is <i>as likely as possible</i> that each was computed from the <i>same</i>
250 * input hash codes in <i>some</i> order.
251 *
252 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
253 * do not all have the same bit length
254 */
255 public static HashCode combineUnordered(Iterable<HashCode> hashCodes) {
256 Iterator<HashCode> iterator = hashCodes.iterator();
257 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
258 byte[] resultBytes = new byte[iterator.next().bits() / 8];
259 for (HashCode hashCode : hashCodes) {
260 byte[] nextBytes = hashCode.asBytes();
261 checkArgument(nextBytes.length == resultBytes.length,
262 "All hashcodes must have the same bit length.");
263 for (int i = 0; i < nextBytes.length; i++) {
264 resultBytes[i] += nextBytes[i];
265 }
266 }
267 return HashCodes.fromBytesNoCopy(resultBytes);
268 }
269
270 /**
271 * Checks that the passed argument is positive, and ceils it to a multiple of 32.
272 */
273 static int checkPositiveAndMakeMultipleOf32(int bits) {
274 checkArgument(bits > 0, "Number of bits must be positive");
275 return (bits + 31) & ~31;
276 }
277
278 // TODO(kevinb): Maybe expose this class via a static Hashing method?
279 @VisibleForTesting
280 static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction {
281 private final int bits;
282
283 ConcatenatedHashFunction(HashFunction... functions) {
284 super(functions);
285 int bitSum = 0;
286 for (HashFunction function : functions) {
287 bitSum += function.bits();
288 }
289 this.bits = bitSum;
290 }
291
292 @Override
293 HashCode makeHash(Hasher[] hashers) {
294 // TODO(user): Get rid of the ByteBuffer here?
295 byte[] bytes = new byte[bits / 8];
296 ByteBuffer buffer = ByteBuffer.wrap(bytes);
297 for (Hasher hasher : hashers) {
298 buffer.put(hasher.hash().asBytes());
299 }
300 return HashCodes.fromBytesNoCopy(bytes);
301 }
302
303 @Override
304 public int bits() {
305 return bits;
306 }
307 }
308 }