Loading...
Searching...
No Matches
bloom.h
Go to the documentation of this file.
1/*
2 * Copyright (C) 2014 Freie Universität Berlin
3 *
4 * This file is subject to the terms and conditions of the GNU Lesser
5 * General Public License v2.1. See the file LICENSE in the top level
6 * directory for more details.
7 */
8
9#pragma once
10
11/*
12 * bloom.c
13 *
14 * Bloom filters
15 *
16 * HISTORY
17 * {x, y, z}
18 * A Bloom filter is a probibalistic : : :
19 * data structure with several interesting /|\ /|\ /|\
20 * properties, such as low memory usage, / | X | X | \
21 * asymmetric query confidence, and a very / |/ \|/ \| \
22 * speedy O(k) membership test. / | | \ \
23 * / /| /|\ |\ \
24 * Because a Bloom filter can . . . . . . . . .
25 * accept any input that can be 00000000001000101010101010100010000000000
26 * hashed effectively (such as " " "
27 * strings), that membership test \ | /
28 * tends to draw a crowd. TNSTAAFL, but \ | /
29 * as caveats go, the Bloom filters' are \ | /
30 * more interesting than incapacitating. \|/
31 * :
32 * Most notably, it can tell you with certainty {w}
33 * that an item 'i' is *not* a member of set 's',
34 * but it can only tell you with some finite
35 * probability whether an item 'i' *is* a member
36 * of set 's'.
37 *
38 * Still, along with the intriguing possibility of using bitwise AND and OR
39 * to compute the logical union and intersection of two filters, the cheap
40 * cost of adding elements to the filter set, and the low memory requirements,
41 * the Bloom filter is a good choice for many applications.
42 *
43 * NOTES
44 *
45 * Let's look more closely at the probability values.
46 *
47 * Assume that a hash function selects each array position with equal
48 * probability. If m is the number of bits in the array, and k is the number
49 * of hash functions, then the probability that a certain bit is not set
50 * to 1 by a certain hash function during the insertion of an element is
51 *
52 * 1-(1/m).
53 *
54 * The probability that it is not set to 1 by any of the hash functions is
55 *
56 * (1-(1/m))^k.
57 *
58 * If we have inserted n elements, the probability that a certain bit is
59 * set 0 is
60 *
61 * (1-(1/m))^kn,
62 *
63 * Meaning that the probability said bit is set to 1 is therefore
64 *
65 * 1-([1-(1/m)]^kn).
66 *
67 * Now test membership of an element that is not in the set. Each of the k
68 * array positions computed by the hash functions is 1 with a probability
69 * as above. The probability of all of them being 1, which would cause the
70 * algorithm to erroneously claim that the element is in the set, is often
71 * given as
72 *
73 * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
74 *
75 * This is not strictly correct as it assumes independence for the
76 * probabilities of each bit being set. However, assuming it is a close
77 * approximation we have that the probability of false positives decreases
78 * as m (the number of bits in the array) increases, and increases as n
79 * (the number of inserted elements) increases. For a given m and n, the
80 * value of k (the number of hash functions) that minimizes the probability
81 * is
82 *
83 * (m/n)ln(2) ~~ 0.7(m/n),
84 *
85 * which gives the false positive probability of
86 *
87 * 2^-k ~~ 0.6185^(m/n).
88 *
89 * The required number of bits m, given n and a desired false positive
90 * probability p (and assuming the optimal value of k is used) can be
91 * computed by substituting the optimal value of k in the probability
92 * expression above:
93 *
94 * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
95 *
96 * which simplifies to
97 *
98 * ln(p) = -(m/n) * (ln2)^2.
99 *
100 * This results in the equation
101 *
102 * m = -((n*ln(p)) / ((ln(2))^2))
103 *
104 * The classic filter uses
105 *
106 * 1.44*log2(1/eta)
107 *
108 * bits of space per inserted key, where eta is the false positive rate of
109 * the Bloom filter.
110 *
111 */
112
124
125#include <stdlib.h>
126#include <stdbool.h>
127#include <stdint.h>
128
129#ifdef __cplusplus
130extern "C" {
131#endif
132
136typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);
137
141typedef struct {
143 size_t m;
145 size_t k;
147 uint8_t *a;
150} bloom_t;
151
165void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
166
172void bloom_del(bloom_t *bloom);
173
184void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
185
224bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
225
226#ifdef __cplusplus
227}
228#endif
229
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in thee filter
Definition bloom.h:136
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.
bloom_t bloom filter object
Definition bloom.h:141
size_t m
number of bits in the bloom array
Definition bloom.h:143
size_t k
number of hash functions
Definition bloom.h:145
hashfp_t * hash
the hash functions
Definition bloom.h:149
uint8_t * a
the bloom array
Definition bloom.h:147