001/*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.hash;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019
020import com.google.common.annotations.Beta;
021import com.google.common.annotations.VisibleForTesting;
022import com.google.common.base.Objects;
023import com.google.common.base.Predicate;
024import com.google.common.hash.BloomFilterStrategies.BitArray;
025
026import java.io.Serializable;
027
028import javax.annotation.Nullable;
029
030/**
031 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
032 * with one-sided error: if it claims that an element is contained in it, this might be in error,
033 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
034 *
035 * <p>If you are unfamiliar with Bloom filters, this nice
036 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
037 * how they work.
038 *
039 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability
040 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that
041 * has not actually been put in the {@code BloomFilter}.
042 *
043 * <p>Bloom filters are serializable.
044 * However, serial forms generated by newer versions of the code may not be readable by older
045 * versions of the code (e.g., a serialized bloom filter generated today may <i>not</i> be
046 * readable by a binary that was compiled 6 months ago).
047 *
048 * @param <T> the type of instances that the {@code BloomFilter} accepts
049 * @author Dimitris Andreou
050 * @author Kevin Bourrillion
051 * @since 11.0
052 */
053@Beta
054public final class BloomFilter<T> implements Predicate<T>, Serializable {
055  /**
056   * A strategy to translate T instances, to {@code numHashFunctions} bit indexes.
057   *
058   * <p>Implementations should be collections of pure functions (i.e. stateless).
059   */
060  interface Strategy extends java.io.Serializable {
061
062    /**
063     * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element.
064     *
065     * <p>Returns whether any bits changed as a result of this operation.
066     */
067    <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
068
069    /**
070     * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element;
071     * returns {@code true} if and only if all selected bits are set.
072     */
073    <T> boolean mightContain(
074        T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
075
076    /**
077     * Identifier used to encode this strategy, when marshalled as part of a BloomFilter.
078     * Only values in the [-128, 127] range are valid for the compact serial form.
079     * Non-negative values are reserved for enums defined in BloomFilterStrategies;
080     * negative values are reserved for any custom, stateful strategy we may define
081     * (e.g. any kind of strategy that would depend on user input).
082     */
083    int ordinal();
084  }
085
086  /** The bit set of the BloomFilter (not necessarily power of 2!)*/
087  private final BitArray bits;
088
089  /** Number of hashes per element */
090  private final int numHashFunctions;
091
092  /** The funnel to translate Ts to bytes */
093  private final Funnel<T> funnel;
094
095  /**
096   * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes.
097   */
098  private final Strategy strategy;
099
100  /**
101   * Creates a BloomFilter.
102   */
103  private BloomFilter(BitArray bits, int numHashFunctions, Funnel<T> funnel,
104      Strategy strategy) {
105    checkArgument(numHashFunctions > 0,
106        "numHashFunctions (%s) must be > 0", numHashFunctions);
107    checkArgument(numHashFunctions <= 255,
108        "numHashFunctions (%s) must be <= 255", numHashFunctions);
109    this.bits = checkNotNull(bits);
110    this.numHashFunctions = numHashFunctions;
111    this.funnel = checkNotNull(funnel);
112    this.strategy = checkNotNull(strategy);
113  }
114
115  /**
116   * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to
117   * this instance but shares no mutable state.
118   *
119   * @since 12.0
120   */
121  public BloomFilter<T> copy() {
122    return new BloomFilter<T>(bits.copy(), numHashFunctions, funnel, strategy);
123  }
124
125  /**
126   * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
127   * {@code false} if this is <i>definitely</i> not the case.
128   */
129  public boolean mightContain(T object) {
130    return strategy.mightContain(object, funnel, numHashFunctions, bits);
131  }
132
133  /**
134   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain}
135   *     instead.
136   */
137  @Deprecated
138  @Override
139  public boolean apply(T input) {
140    return mightContain(input);
141  }
142
143  /**
144   * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
145   * {@link #mightContain(Object)} with the same element will always return {@code true}.
146   *
147   * @return true if the bloom filter's bits changed as a result of this operation. If the bits
148   *     changed, this is <i>definitely</i> the first time {@code object} has been added to the
149   *     filter. If the bits haven't changed, this <i>might</i> be the first time {@code object}
150   *     has been added to the filter. Note that {@code put(t)} always returns the
151   *     <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time
152   *     it is called."
153   * @since 12.0 (present in 11.0 with {@code void} return type})
154   */
155  public boolean put(T object) {
156    return strategy.put(object, funnel, numHashFunctions, bits);
157  }
158
159  /**
160   * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return
161   * {@code true} for an object that has not actually been put in the {@code BloomFilter}.
162   *
163   * <p>Ideally, this number should be close to the {@code fpp} parameter
164   * passed in {@linkplain #create(Funnel, int, double)}, or smaller. If it is
165   * significantly higher, it is usually the case that too many elements (more than
166   * expected) have been put in the {@code BloomFilter}, degenerating it.
167   *
168   * @since 14.0 (since 11.0 as expectedFalsePositiveProbability())
169   */
170  public double expectedFpp() {
171    // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!)
172    return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions);
173  }
174
175  /**
176   * Returns the number of bits in the underlying bit array.
177   */
178  @VisibleForTesting long bitSize() {
179    return bits.bitSize();
180  }
181
182  /**
183   * Determines whether a given bloom filter is compatible with this bloom filter. For two
184   * bloom filters to be compatible, they must:
185   *
186   * <ul>
187   * <li>not be the same instance
188   * <li>have the same number of hash functions
189   * <li>have the same bit size
190   * <li>have the same strategy
191   * <li>have equal funnels
192   * <ul>
193   *
194   * @param that The bloom filter to check for compatibility.
195   * @since 15.0
196   */
197  public boolean isCompatible(BloomFilter<T> that) {
198    checkNotNull(that);
199    return (this != that) &&
200        (this.numHashFunctions == that.numHashFunctions) &&
201        (this.bitSize() == that.bitSize()) &&
202        (this.strategy.equals(that.strategy)) &&
203        (this.funnel.equals(that.funnel));
204  }
205
206  /**
207   * Combines this bloom filter with another bloom filter by performing a bitwise OR of the
208   * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the
209   * bloom filters are appropriately sized to avoid saturating them.
210   *
211   * @param that The bloom filter to combine this bloom filter with. It is not mutated.
212   * @throws IllegalArgumentException if {@code isCompatible(that) == false}
213   *
214   * @since 15.0
215   */
216  public void putAll(BloomFilter<T> that) {
217    checkNotNull(that);
218    checkArgument(this != that, "Cannot combine a BloomFilter with itself.");
219    checkArgument(this.numHashFunctions == that.numHashFunctions,
220        "BloomFilters must have the same number of hash functions (%s != %s)",
221        this.numHashFunctions, that.numHashFunctions);
222    checkArgument(this.bitSize() == that.bitSize(),
223        "BloomFilters must have the same size underlying bit arrays (%s != %s)",
224        this.bitSize(), that.bitSize());
225    checkArgument(this.strategy.equals(that.strategy),
226        "BloomFilters must have equal strategies (%s != %s)",
227        this.strategy, that.strategy);
228    checkArgument(this.funnel.equals(that.funnel),
229        "BloomFilters must have equal funnels (%s != %s)",
230        this.funnel, that.funnel);
231    this.bits.putAll(that.bits);
232  }
233
234  @Override
235  public boolean equals(@Nullable Object object) {
236    if (object == this) {
237      return true;
238    }
239    if (object instanceof BloomFilter) {
240      BloomFilter<?> that = (BloomFilter<?>) object;
241      return this.numHashFunctions == that.numHashFunctions
242          && this.funnel.equals(that.funnel)
243          && this.bits.equals(that.bits)
244          && this.strategy.equals(that.strategy);
245    }
246    return false;
247  }
248
249  @Override
250  public int hashCode() {
251    return Objects.hashCode(numHashFunctions, funnel, strategy, bits);
252  }
253
254  private static final Strategy DEFAULT_STRATEGY =
255      getDefaultStrategyFromSystemProperty();
256
257  @VisibleForTesting
258  static final String USE_MITZ32_PROPERTY = "com.google.common.hash.BloomFilter.useMitz32";
259
260  @VisibleForTesting
261  static Strategy getDefaultStrategyFromSystemProperty() {
262    return Boolean.parseBoolean(System.getProperty(USE_MITZ32_PROPERTY))
263        ? BloomFilterStrategies.MURMUR128_MITZ_32
264        : BloomFilterStrategies.MURMUR128_MITZ_64;
265  }
266
267  /**
268   * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of
269   * insertions and expected false positive probability.
270   *
271   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
272   * than specified, will result in its saturation, and a sharp deterioration of its
273   * false positive probability.
274   *
275   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
276   * {@code Funnel<T>} is.
277   *
278   * <p>It is recommended that the funnel be implemented as a Java enum. This has the
279   * benefit of ensuring proper serialization and deserialization, which is important
280   * since {@link #equals} also relies on object identity of funnels.
281   *
282   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
283   * @param expectedInsertions the number of expected insertions to the constructed
284   *     {@code BloomFilter<T>}; must be positive
285   * @param fpp the desired false positive probability (must be positive and less than 1.0)
286   * @return a {@code BloomFilter}
287   */
288  public static <T> BloomFilter<T> create(
289      Funnel<T> funnel, int expectedInsertions /* n */, double fpp) {
290    return create(funnel, expectedInsertions, fpp, DEFAULT_STRATEGY);
291  }
292
293  @VisibleForTesting
294  static <T> BloomFilter<T> create(
295      Funnel<T> funnel, int expectedInsertions /* n */, double fpp, Strategy strategy) {
296    checkNotNull(funnel);
297    checkArgument(expectedInsertions >= 0, "Expected insertions (%s) must be >= 0",
298        expectedInsertions);
299    checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp);
300    checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp);
301    checkNotNull(strategy);
302
303    if (expectedInsertions == 0) {
304      expectedInsertions = 1;
305    }
306    /*
307     * TODO(user): Put a warning in the javadoc about tiny fpp values,
308     * since the resulting size is proportional to -log(p), but there is not
309     * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
310     * which is less than 10kb. Who cares!
311     */
312    long numBits = optimalNumOfBits(expectedInsertions, fpp);
313    int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
314    try {
315      return new BloomFilter<T>(new BitArray(numBits), numHashFunctions, funnel, strategy);
316    } catch (IllegalArgumentException e) {
317      throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
318    }
319  }
320
321  /**
322   * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of
323   * insertions and a default expected false positive probability of 3%.
324   *
325   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
326   * than specified, will result in its saturation, and a sharp deterioration of its
327   * false positive probability.
328   *
329   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
330   * {@code Funnel<T>} is.
331   *
332   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
333   * @param expectedInsertions the number of expected insertions to the constructed
334   *     {@code BloomFilter<T>}; must be positive
335   * @return a {@code BloomFilter}
336   */
337  public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */) {
338    return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
339  }
340
341  /*
342   * Cheat sheet:
343   *
344   * m: total bits
345   * n: expected insertions
346   * b: m/n, bits per insertion
347   * p: expected false positive probability
348   *
349   * 1) Optimal k = b * ln2
350   * 2) p = (1 - e ^ (-kn/m))^k
351   * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
352   * 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
353   */
354
355  /**
356   * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
357   * expected insertions and total number of bits in the Bloom filter.
358   *
359   * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
360   *
361   * @param n expected insertions (must be positive)
362   * @param m total number of bits in Bloom filter (must be positive)
363   */
364  @VisibleForTesting
365  static int optimalNumOfHashFunctions(long n, long m) {
366    return Math.max(1, (int) Math.round(m / n * Math.log(2)));
367  }
368
369  /**
370   * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
371   * expected insertions, the required false positive probability.
372   *
373   * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
374   *
375   * @param n expected insertions (must be positive)
376   * @param p false positive rate (must be 0 < p < 1)
377   */
378  @VisibleForTesting
379  static long optimalNumOfBits(long n, double p) {
380    if (p == 0) {
381      p = Double.MIN_VALUE;
382    }
383    return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
384  }
385
386  private Object writeReplace() {
387    return new SerialForm<T>(this);
388  }
389
390  private static class SerialForm<T> implements Serializable {
391    final long[] data;
392    final int numHashFunctions;
393    final Funnel<T> funnel;
394    final Strategy strategy;
395
396    SerialForm(BloomFilter<T> bf) {
397      this.data = bf.bits.data;
398      this.numHashFunctions = bf.numHashFunctions;
399      this.funnel = bf.funnel;
400      this.strategy = bf.strategy;
401    }
402    Object readResolve() {
403      return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy);
404    }
405    private static final long serialVersionUID = 1;
406  }
407}