001 /* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015 package com.google.common.hash; 016 017 import static com.google.common.base.Preconditions.checkArgument; 018 import static com.google.common.base.Preconditions.checkNotNull; 019 import static com.google.common.math.IntMath.log2; 020 import static java.lang.Math.max; 021 import static java.math.RoundingMode.CEILING; 022 023 import com.google.common.annotations.Beta; 024 import com.google.common.annotations.VisibleForTesting; 025 import com.google.common.base.Preconditions; 026 import com.google.common.hash.HashCodes.HashCodeSlicer; 027 028 import java.io.Serializable; 029 030 /** 031 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test 032 * with one-sided error: if it claims that an element is contained in it, this might be in error, 033 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true. 034 * 035 * <p>If you are unfamiliar with Bloom filters, this nice 036 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand 037 * how they work. 038 * 039 * @param <T> the type of instances that the {@code BloomFilter} accepts 040 * @author Kevin Bourrillion 041 * @author Dimitris Andreou 042 * @since 11.0 043 */ 044 @Beta 045 public final class BloomFilter<T> implements Serializable { 046 /** A power of two sized bit set */ 047 private final BitArray bits; 048 049 /** Number of bits required to index a bit out of {@code bits} */ 050 private final int hashBitsPerSlice; 051 052 /** Number of hashes per element */ 053 private final int numHashFunctions; 054 055 /** The funnel to translate T's to bytes */ 056 private final Funnel<T> funnel; 057 058 /** The HashFunction that generates as many bits as this BloomFilter needs */ 059 private final HashFunction hashFunction; 060 061 /** 062 * Creates a BloomFilter. 063 */ 064 private BloomFilter(BitArray bits, int numHashFunctions, Funnel<T> funnel, 065 HashFunction hashFunction) { 066 Preconditions.checkArgument(numHashFunctions > 0, "numHashFunctions zero or negative"); 067 this.bits = checkNotNull(bits); 068 this.numHashFunctions = numHashFunctions; 069 this.funnel = checkNotNull(funnel); 070 this.hashFunction = checkNotNull(hashFunction); 071 this.hashBitsPerSlice = log2(max(bits.size(), Long.SIZE /* minimum capacity */ ), CEILING); 072 } 073 074 /** 075 * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter, 076 * {@code false} if this is <i>definitely</i> not the case. 077 */ 078 public boolean mightContain(T instance) { 079 HashCodeSlicer slicer = HashCodes.slice( 080 hashFunction.newHasher().putObject(instance, funnel).hash(), hashBitsPerSlice); 081 for (int i = 0; i < numHashFunctions; i++) { 082 if (!bits.get(slicer.nextSlice())) { 083 return false; 084 } 085 } 086 return true; 087 } 088 089 /** 090 * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of 091 * {@link #mightContain(Object)} with the same element will always return {@code true}. 092 */ 093 public void put(T instance) { 094 HashCodeSlicer slicer = HashCodes.slice( 095 hashFunction.newHasher().putObject(instance, funnel).hash(), hashBitsPerSlice); 096 for (int i = 0; i < numHashFunctions; i++) { 097 int nextSlice = slicer.nextSlice(); 098 bits.set(nextSlice); 099 } 100 } 101 102 @VisibleForTesting int getHashCount() { 103 return numHashFunctions; 104 } 105 106 @VisibleForTesting double computeExpectedFalsePositiveRate(int insertions) { 107 return Math.pow( 108 1 - Math.exp(-numHashFunctions * ((double) insertions / (bits.size()))), 109 numHashFunctions); 110 } 111 112 // This little gem is kindly offered by kevinb 113 private static class BitArray { 114 final long[] data; 115 116 BitArray(int bits) { 117 this(new long[bits >> 6]); 118 } 119 120 // Used by serialization 121 BitArray(long[] data) { 122 checkArgument(data.length > 0, "data length is zero!"); 123 this.data = data; 124 } 125 126 void set(int index) { 127 data[index >> 6] |= (1L << index); 128 } 129 130 boolean get(int index) { 131 return (data[index >> 6] & (1L << index)) != 0; 132 } 133 134 /** Number of bits */ 135 int size() { 136 return data.length * Long.SIZE; 137 } 138 } 139 140 /* 141 * Cheat sheet for the factories: 142 * 143 * m: total bits 144 * n: expected insertions 145 * b: m/n, bits per insertion 146 147 * p: expected false positive probability 148 * 149 * 1) Optimal k = b * ln2 150 * 2) p = (1 - e ^ (-kn/m))^k 151 * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b 152 * 4) For optimal k: m = -nlnp / ((ln2) ^ 2) 153 * 154 * I expect the user to provide "n", and then one of {m,b,p} is needed. 155 * Providing both (n, m) and (n, b) would be confusing, so I go for the 2nd. 156 */ 157 158 /** 159 * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number 160 * of insertions and expected false positive probability. 161 * 162 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 163 * than specified, will result in its saturation, and a sharp deterioration of its 164 * false positive probability. 165 * 166 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 167 * {@code Funnel<T>} is. 168 * 169 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 170 * @param expectedInsertions the number of expected insertions to the constructed 171 * {@code BloomFilter<T>}; must be positive 172 * @param falsePositiveProbability the desired false positive probability (must be positive and 173 * less than 1.0) 174 * @return a {@code Builder} 175 */ 176 public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */, 177 double falsePositiveProbability) { 178 checkNotNull(funnel); 179 checkArgument(expectedInsertions > 0, "Expected insertions must be positive"); 180 checkArgument(falsePositiveProbability > 0.0 & falsePositiveProbability < 1.0, 181 "False positive probability in (0.0, 1.0)"); 182 /* 183 * andreou: I wanted to put a warning in the javadoc about tiny fpp values, 184 * since the resulting size is proportional to -log(p), but there is not 185 * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680 186 * which is less that 10kb. Who cares! 187 */ 188 int m = optimalM(expectedInsertions, falsePositiveProbability); 189 int k = optimalK(expectedInsertions, m); 190 191 BitArray bits = new BitArray(1 << log2(max(m, Long.SIZE /* minimum capacity */ ), CEILING)); 192 HashFunction hashFunction = BloomFilterStrategies.From128ToN.withBits( 193 bits.size() * k, Hashing.murmur3_128()); 194 return new BloomFilter<T>(bits, k, funnel, hashFunction); 195 } 196 197 /** 198 * Creates a {@code Builder} of a {@link BloomFilter BloomFilter<T>}, with the expected number 199 * of insertions, and a default expected false positive probability of 3%. 200 * 201 * <p>Note that overflowing a {@code BloomFilter} with significantly more elements 202 * than specified, will result in its saturation, and a sharp deterioration of its 203 * false positive probability. 204 * 205 * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided 206 * {@code Funnel<T>} is. 207 * 208 * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use 209 * @param expectedInsertions the number of expected insertions to the constructed 210 * {@code BloomFilter<T>}; must be positive 211 * @return a {@code Builder} 212 */ 213 public static <T> BloomFilter<T> create(Funnel<T> funnel, int expectedInsertions /* n */) { 214 return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions 215 } 216 217 218 private static final double LN2 = Math.log(2); 219 private static final double LN2_SQUARED = LN2 * LN2; 220 221 /** 222 * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the 223 * expected insertions and total number of bits in the Bloom filter. 224 * 225 * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. 226 * 227 * @param n expected insertions (must be positive) 228 * @param m total number of bits in Bloom filter (must be positive) 229 */ 230 @VisibleForTesting static int optimalK(int n, int m) { 231 return Math.max(1, (int) Math.round(m / n * LN2)); 232 } 233 234 /** 235 * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified 236 * expected insertions, the required false positive probability. 237 * 238 * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. 239 * 240 * @param n expected insertions (must be positive) 241 * @param p false positive rate (must be 0 < p < 1) 242 */ 243 @VisibleForTesting static int optimalM(int n, double p) { 244 return (int) (-n * Math.log(p) / LN2_SQUARED); 245 } 246 247 private Object writeReplace() { 248 return new SerialForm<T>(this); 249 } 250 251 private static class SerialForm<T> implements Serializable { 252 final long[] data; 253 final int numHashFunctions; 254 final Funnel<T> funnel; 255 final HashFunction hashFunction; 256 257 SerialForm(BloomFilter<T> bf) { 258 this.data = bf.bits.data; 259 this.numHashFunctions = bf.numHashFunctions; 260 this.funnel = bf.funnel; 261 this.hashFunction = bf.hashFunction; 262 } 263 Object readResolve() { 264 return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, hashFunction); 265 } 266 private static final long serialVersionUID = 0; 267 } 268 }