001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019 020import com.google.common.annotations.Beta; 021import com.google.common.annotations.VisibleForTesting; 022import com.google.common.base.Objects; 023import com.google.common.base.Predicate; 024import com.google.common.hash.BloomFilterStrategies.BitArray; 025import com.google.common.primitives.SignedBytes; 026import com.google.common.primitives.UnsignedBytes; 027 028import java.io.DataInputStream; 029import java.io.DataOutputStream; 030import java.io.IOException; 031import java.io.InputStream; 032import java.io.OutputStream; 033import java.io.Serializable; 034 035import javax.annotation.Nullable; 036 037/** 038 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test 039 * with one-sided error: if it claims that an element is contained in it, this might be in error, 040 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true. 041 * 042 * <p>If you are unfamiliar with Bloom filters, this nice 043 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand 044 * how they work. 045 * 046 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability 047 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that 048 * has not actually been put in the {@code BloomFilter}. 049 * 050 * <p>Bloom filters are serializable. They also support a more compact serial representation via 051 * the {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be 052 * supported by future versions of this library. However, serial forms generated by newer versions 053 * of the code may not be readable by older versions of the code (e.g., a serialized bloom filter 054 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago). 055 * 056 * @param <T> the type of instances that the {@code BloomFilter} accepts 057 * @author Dimitris Andreou 058 * @author Kevin Bourrillion 059 * @since 11.0 060 */ 061@Beta 062public final class BloomFilter<T> implements Predicate<T>, Serializable { 063 /** 064 * A strategy to translate T instances, to {@code numHashFunctions} bit indexes. 065 * 066 * <p>Implementations should be collections of pure functions (i.e. stateless). 067 */ 068 interface Strategy extends java.io.Serializable { 069 070 /** 071 * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element. 072 * 073 * <p>Returns whether any bits changed as a result of this operation. 074 */ 075 <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits); 076 077 /** 078 * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element; 079 * returns {@code true} if and only if all selected bits are set. 080 */ 081 <T> boolean mightContain( 082 T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits); 083 084 /** 085 * Identifier used to encode this strategy, when marshalled as part of a BloomFilter. 086 * Only values in the [-128, 127] range are valid for the compact serial form. 087 * Non-negative values are reserved for enums defined in BloomFilterStrategies; 088 * negative values are reserved for any custom, stateful strategy we may define 089 * (e.g. any kind of strategy that would depend on user input). 090 */ 091 int ordinal(); 092 } 093 094 /** The bit set of the BloomFilter (not necessarily power of 2!)*/ 095 private final BitArray bits; 096 097 /** Number of hashes per element */ 098 private final int numHashFunctions; 099 100 /** The funnel to translate Ts to bytes */ 101 private final Funnel<? super T> funnel; 102 103 /** 104 * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes. 105 */ 106 private final Strategy strategy; 107 108 /** 109 * Creates a BloomFilter. 110 */ 111 private BloomFilter(BitArray bits, int numHashFunctions, Funnel<? super T> funnel, 112 Strategy strategy) { 113 checkArgument(numHashFunctions > 0, 114 "numHashFunctions (%s) must be > 0", numHashFunctions); 115 checkArgument(numHashFunctions <= 255, 116 "numHashFunctions (%s) must be <= 255", numHashFunctions); 117 this.bits = checkNotNull(bits); 118 this.numHashFunctions = numHashFunctions; 119 this.funnel = checkNotNull(funnel); 120 this.strategy = checkNotNull(strategy); 121 } 122 123 /** 124 * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to 125 * this instance but shares no mutable state. 126 * 127 * @since 12.0 128 */ 129 public BloomFilter<T> copy() { 130 return new BloomFilter<T>(bits.copy(), numHashFunctions, funnel, strategy); 131 } 132 133 /** 134 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, 135 * {@code false} if this is <i>definitely</i> not the case. 136 */ 137 public boolean mightContain(T object) { 138 return strategy.mightContain(object, funnel, numHashFunctions, bits); 139 } 140 141 /** 142 * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain} 143 * instead. 144 */ 145 @Deprecated 146 @Override 147 public boolean apply(T input) { 148 return mightContain(input); 149 } 150 151 /** 152 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of 153 * {@link #mightContain(Object)} with the same element will always return {@code true}. 154 * 155 * @return true if the bloom filter's bits changed as a result of this operation. If the bits 156 * changed, this is <i>definitely</i> the first time {@code object} has been added to the 157 * filter. If the bits haven't changed, this <i>might</i> be the first time {@code object} 158 * has been added to the filter. Note that {@code put(t)} always returns the 159 * <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time 160 * it is called." 161 * @since 12.0 (present in 11.0 with {@code void} return type}) 162 */ 163 public boolean put(T object) { 164 return strategy.put(object, funnel, numHashFunctions, bits); 165 } 166 167 /** 168 * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return 169 * {@code true} for an object that has not actually been put in the {@code BloomFilter}. 170 * 171 * <p>Ideally, this number should be close to the {@code fpp} parameter 172 * passed in {@linkplain #create(Funnel, int, double)}, or smaller. If it is 173 * significantly higher, it is usually the case that too many elements (more than 174 * expected) have been put in the {@code BloomFilter}, degenerating it. 175 * 176 * @since 14.0 (since 11.0 as expectedFalsePositiveProbability()) 177 */ 178 public double expectedFpp() { 179 // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!) 180 return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions); 181 } 182 183 /** 184 * Returns the number of bits in the underlying bit array. 185 */ 186 @VisibleForTesting long bitSize() { 187 return bits.bitSize(); 188 } 189 190 /** 191 * Determines whether a given bloom filter is compatible with this bloom filter. For two 192 * bloom filters to be compatible, they must: 193 * 194 * <ul> 195 * <li>not be the same instance 196 * <li>have the same number of hash functions 197 * <li>have the same bit size 198 * <li>have the same strategy 199 * <li>have equal funnels 200 * <ul> 201 * 202 * @param that The bloom filter to check for compatibility. 203 * @since 15.0 204 */ 205 public boolean isCompatible(BloomFilter<T> that) { 206 checkNotNull(that); 207 return (this != that) && 208 (this.numHashFunctions == that.numHashFunctions) && 209 (this.bitSize() == that.bitSize()) && 210 (this.strategy.equals(that.strategy)) && 211 (this.funnel.equals(that.funnel)); 212 } 213 214 /** 215 * Combines this bloom filter with another bloom filter by performing a bitwise OR of the 216 * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the 217 * bloom filters are appropriately sized to avoid saturating them. 218 * 219 * @param that The bloom filter to combine this bloom filter with. It is not mutated. 220 * @throws IllegalArgumentException if {@code isCompatible(that) == false} 221 * 222 * @since 15.0 223 */ 224 public void putAll(BloomFilter<T> that) { 225 checkNotNull(that); 226 checkArgument(this != that, "Cannot combine a BloomFilter with itself."); 227 checkArgument(this.numHashFunctions == that.numHashFunctions, 228 "BloomFilters must have the same number of hash functions (%s != %s)", 229 this.numHashFunctions, that.numHashFunctions); 230 checkArgument(this.bitSize() == that.bitSize(), 231 "BloomFilters must have the same size underlying bit arrays (%s != %s)", 232 this.bitSize(), that.bitSize()); 233 checkArgument(this.strategy.equals(that.strategy), 234 "BloomFilters must have equal strategies (%s != %s)", 235 this.strategy, that.strategy); 236 checkArgument(this.funnel.equals(that.funnel), 237 "BloomFilters must have equal funnels (%s != %s)", 238 this.funnel, that.funnel); 239 this.bits.putAll(that.bits); 240 } 241 242 @Override 243 public boolean equals(@Nullable Object object) { 244 if (object == this) { 245 return true; 246 } 247 if (object instanceof BloomFilter) { 248 BloomFilter<?> that = (BloomFilter<?>) object; 249 return this.numHashFunctions == that.numHashFunctions 250 && this.funnel.equals(that.funnel) 251 && this.bits.equals(that.bits) 252 && this.strategy.equals(that.strategy); 253 } 254 return false; 255 } 256 257 @Override 258 public int hashCode() { 259 return Objects.hashCode(numHashFunctions, funnel, strategy, bits); 260 } 261 262 private static final Strategy DEFAULT_STRATEGY = 263 BloomFilterStrategies.MURMUR128_MITZ_64; 264 265 /** 266 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 267 * insertions and expected false positive probability. 268 * 269 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 270 * than specified, will result in its saturation, and a sharp deterioration of its 271 * false positive probability. 272 * 273 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 274 * {@code Funnel<T>} is. 275 * 276 * <p>It is recommended that the funnel be implemented as a Java enum. This has the 277 * benefit of ensuring proper serialization and deserialization, which is important 278 * since {@link #equals} also relies on object identity of funnels. 279 * 280 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 281 * @param expectedInsertions the number of expected insertions to the constructed 282 * {@code BloomFilter<T>}; must be positive 283 * @param fpp the desired false positive probability (must be positive and less than 1.0) 284 * @return a {@code BloomFilter} 285 */ 286 public static <T> BloomFilter<T> create( 287 Funnel<? super T> funnel, int expectedInsertions /* n */, double fpp) { 288 return create(funnel, expectedInsertions, fpp, DEFAULT_STRATEGY); 289 } 290 291 @VisibleForTesting 292 static <T> BloomFilter<T> create( 293 Funnel<? super T> funnel, int expectedInsertions /* n */, double fpp, Strategy strategy) { 294 checkNotNull(funnel); 295 checkArgument(expectedInsertions >= 0, "Expected insertions (%s) must be >= 0", 296 expectedInsertions); 297 checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp); 298 checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp); 299 checkNotNull(strategy); 300 301 if (expectedInsertions == 0) { 302 expectedInsertions = 1; 303 } 304 /* 305 * TODO(user): Put a warning in the javadoc about tiny fpp values, 306 * since the resulting size is proportional to -log(p), but there is not 307 * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680 308 * which is less than 10kb. Who cares! 309 */ 310 long numBits = optimalNumOfBits(expectedInsertions, fpp); 311 int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); 312 try { 313 return new BloomFilter<T>(new BitArray(numBits), numHashFunctions, funnel, strategy); 314 } catch (IllegalArgumentException e) { 315 throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e); 316 } 317 } 318 319 /** 320 * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of 321 * insertions and a default expected false positive probability of 3%. 322 * 323 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 324 * than specified, will result in its saturation, and a sharp deterioration of its 325 * false positive probability. 326 * 327 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 328 * {@code Funnel<T>} is. 329 * 330 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 331 * @param expectedInsertions the number of expected insertions to the constructed 332 * {@code BloomFilter<T>}; must be positive 333 * @return a {@code BloomFilter} 334 */ 335 public static <T> BloomFilter<T> create( 336 Funnel<? super T> funnel, int expectedInsertions /* n */) { 337 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions 338 } 339 340 /* 341 * Cheat sheet: 342 * 343 * m: total bits 344 * n: expected insertions 345 * b: m/n, bits per insertion 346 * p: expected false positive probability 347 * 348 * 1) Optimal k = b * ln2 349 * 2) p = (1 - e ^ (-kn/m))^k 350 * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b 351 * 4) For optimal k: m = -nlnp / ((ln2) ^ 2) 352 */ 353 354 /** 355 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the 356 * expected insertions and total number of bits in the Bloom filter. 357 * 358 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. 359 * 360 * @param n expected insertions (must be positive) 361 * @param m total number of bits in Bloom filter (must be positive) 362 */ 363 @VisibleForTesting 364 static int optimalNumOfHashFunctions(long n, long m) { 365 // (m / n) * log(2), but avoid truncation due to division! 366 return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); 367 } 368 369 /** 370 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified 371 * expected insertions, the required false positive probability. 372 * 373 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. 374 * 375 * @param n expected insertions (must be positive) 376 * @param p false positive rate (must be 0 < p < 1) 377 */ 378 @VisibleForTesting 379 static long optimalNumOfBits(long n, double p) { 380 if (p == 0) { 381 p = Double.MIN_VALUE; 382 } 383 return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); 384 } 385 386 private Object writeReplace() { 387 return new SerialForm<T>(this); 388 } 389 390 private static class SerialForm<T> implements Serializable { 391 final long[] data; 392 final int numHashFunctions; 393 final Funnel<? super T> funnel; 394 final Strategy strategy; 395 396 SerialForm(BloomFilter<T> bf) { 397 this.data = bf.bits.data; 398 this.numHashFunctions = bf.numHashFunctions; 399 this.funnel = bf.funnel; 400 this.strategy = bf.strategy; 401 } 402 Object readResolve() { 403 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy); 404 } 405 private static final long serialVersionUID = 1; 406 } 407 408 /** 409 * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java 410 * serialization). This has been measured to save at least 400 bytes compared to regular 411 * serialization. 412 * 413 * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter. 414 */ 415 public void writeTo(OutputStream out) throws IOException { 416 /* 417 * Serial form: 418 * 1 signed byte for the strategy 419 * 1 unsigned byte for the number of hash functions 420 * 1 big endian int, the number of longs in our bitset 421 * N big endian longs of our bitset 422 */ 423 DataOutputStream dout = new DataOutputStream(out); 424 dout.writeByte(SignedBytes.checkedCast(strategy.ordinal())); 425 dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor 426 dout.writeInt(bits.data.length); 427 for (long value : bits.data) { 428 dout.writeLong(value); 429 } 430 } 431 432 /** 433 * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into 434 * a {@code BloomFilter<T>}. 435 * 436 * The {@code Funnel} to be used is not encoded in the stream, so it must be provided here. 437 * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to 438 * populate the original Bloom filter! 439 * 440 * @throws IOException if the InputStream throws an {@code IOException}, or if its data does 441 * not appear to be a BloomFilter serialized using the 442 * {@linkplain #writeTo(OutputStream)} method. 443 */ 444 public static <T> BloomFilter<T> readFrom(InputStream in, Funnel<T> funnel) throws IOException { 445 checkNotNull(in, "InputStream"); 446 checkNotNull(funnel, "Funnel"); 447 int strategyOrdinal = -1; 448 int numHashFunctions = -1; 449 int dataLength = -1; 450 try { 451 DataInputStream din = new DataInputStream(in); 452 // currently this assumes there is no negative ordinal; will have to be updated if we 453 // add non-stateless strategies (for which we've reserved negative ordinals; see 454 // Strategy.ordinal()). 455 strategyOrdinal = din.readByte(); 456 numHashFunctions = UnsignedBytes.toInt(din.readByte()); 457 dataLength = din.readInt(); 458 459 Strategy strategy = BloomFilterStrategies.values()[strategyOrdinal]; 460 long[] data = new long[dataLength]; 461 for (int i = 0; i < data.length; i++) { 462 data[i] = din.readLong(); 463 } 464 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy); 465 } catch (RuntimeException e) { 466 IOException ioException = new IOException( 467 "Unable to deserialize BloomFilter from InputStream." 468 + " strategyOrdinal: " + strategyOrdinal 469 + " numHashFunctions: " + numHashFunctions 470 + " dataLength: " + dataLength); 471 ioException.initCause(e); 472 throw ioException; 473 } 474 } 475}