bloom_8h_source.html

/*

 * Copyright (C) 2014 Freie Universität Berlin

 *

 * This file is subject to the terms and conditions of the GNU Lesser

 * General Public License v2.1. See the file LICENSE in the top level

 * directory for more details.

 */


#pragma once


/*

 * bloom.c

 *

 * Bloom filters

 *

 * HISTORY

 *                                                   {x,  y,  z}

 * A Bloom filter is a probabilistic                  :   :   :

 * data structure with several interesting           /|\ /|\ /|\

 * properties, such as low memory usage,            / | X | X | \

 * asymmetric query confidence, and a very         /  |/ \|/ \|  \

 * speedy O(k) membership test.                   /   |   |   \   \

 *                                               /   /|  /|\  |\   \

 * Because a Bloom filter can                   .   . . . . . . .   .

 * accept any input that can be       00000000001000101010101010100010000000000

 * hashed effectively (such as                       "    "    "

 * strings), that membership test                     \   |   /

 * tends to draw a crowd. TNSTAAFL, but                \  |  /

 * as caveats go, the Bloom filter's are                \ | /

 * more interesting than incapacitating.                 \|/

 *                                                        :

 * Most notably, it can tell you with certainty          {w}

 * that an item 'i' is *not* a member of set 's',

 * but it can only tell you with some finite

 * probability whether an item 'i' *is* a member

 * of set 's'.

 *

 * Still, along with the intriguing possibility of using bitwise AND and OR

 * to compute the logical union and intersection of two filters, the cheap

 * cost of adding elements to the filter set, and the low memory requirements,

 * the Bloom filter is a good choice for many applications.

 *

 * NOTES

 *

 * Let's look more closely at the probability values.

 *

 * Assume that a hash function selects each array position with equal

 * probability. If m is the number of bits in the array, and k is the number

 * of hash functions, then the probability that a certain bit is not set

 * to 1 by a certain hash function during the insertion of an element is

 *

 *      1-(1/m).

 *

 * The probability that it is not set to 1 by any of the hash functions is

 *

 *      (1-(1/m))^k.

 *

 * If we have inserted n elements, the probability that a certain bit is

 * still 0 is

 *

 *      (1-(1/m))^kn,

 *

 * meaning that the probability that said bit is set to 1 is therefore

 *

 *      1-([1-(1/m)]^kn).

 *

 * Now test membership of an element that is not in the set. Each of the k

 * array positions computed by the hash functions is 1 with a probability

 * as above. The probability of all of them being 1, which would cause the

 * algorithm to erroneously claim that the element is in the set, is often

 * given as

 *

 *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.

 *

 * This is not strictly correct as it assumes independence for the

 * probabilities of each bit being set. However, assuming it is a close

 * approximation we have that the probability of false positives decreases

 * as m (the number of bits in the array) increases, and increases as n

 * (the number of inserted elements) increases. For a given m and n, the

 * value of k (the number of hash functions) that minimizes the probability

 * is

 *

 *      (m/n)ln(2) ~~ 0.7(m/n),

 *

 * which gives the false positive probability of

 *

 *      2^-k ~~ 0.6185^(m/n).

 *

 * The required number of bits m, given n and a desired false positive

 * probability p (and assuming the optimal value of k is used) can be

 * computed by substituting the optimal value of k in the probability

 * expression above:

 *

 *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),

 *

 * which simplifies to

 *

 *      ln(p) = -(m/n) * (ln2)^2.

 *

 * This results in the equation

 *

 *      m = -((n*ln(p)) / ((ln(2))^2))

 *

 * The classic filter uses

 *

 *       1.44*log2(1/eta)

 *

 * bits of space per inserted key, where eta is the false positive rate of

 * the Bloom filter.

 *

 */


#include <stdlib.h>

#include <stdbool.h>

#include <stdint.h>


#ifdef __cplusplus

extern "C" {

#endif


/**
 * @brief Hash function to use in the filter.
 *
 * A hash function receives a pointer to a constant byte buffer and a
 * length @p len, and returns a 32-bit unsigned hash value.
 */
typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);


/**
 * @brief Bloom filter object.
 */
typedef struct {

    size_t m;           /**< number of bits in the bloom array */

    size_t k;           /**< number of hash functions */

    uint8_t *a;         /**< the bloom array */

    hashfp_t *hash;     /**< the hash functions */

} bloom_t;


/**
 * @brief Initialize a Bloom filter.
 *
 * @param bloom         filter object to initialize
 * @param size          NOTE(review): presumably the size of the filter
 *                      (m, in bits) -- confirm against the implementation
 * @param bitfield      NOTE(review): presumably caller-provided storage that
 *                      becomes the bloom array -- confirm
 * @param hashes        pointer to the hash function(s) used by the filter
 * @param hashes_numof  NOTE(review): presumably the number of entries in
 *                      @p hashes (k) -- confirm
 */
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);


/**
 * @brief Delete a Bloom filter.
 *
 * NOTE(review): this declaration does not show whether any storage
 * referenced by @p bloom is released -- check the implementation.
 *
 * @param bloom  the filter to delete
 */
void bloom_del(bloom_t *bloom);


/**
 * @brief Add a string to a Bloom filter.
 *
 * @param bloom  the filter to add to
 * @param buf    the string (byte buffer) to add
 * @param len    length of @p buf
 */
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);


/**
 * @brief Determine if a string is in the Bloom filter.
 *
 * A Bloom filter can state with certainty that an item is not a member of
 * the set, but only with some probability that it is a member.
 *
 * @param bloom  the filter to query
 * @param buf    the string (byte buffer) to look up
 * @param len    length of @p buf
 *
 * @return NOTE(review): presumably true when the string may be in the
 *         filter and false when it is definitely not -- confirm against
 *         the implementation
 */
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);


#ifdef __cplusplus

}

#endif


hashfp_t
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in the filter
Definition bloom.h:136

bloom_init
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.

bloom_del
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.

bloom_check
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.

bloom_add
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.

bloom_t
bloom_t bloom filter object
Definition bloom.h:141

bloom_t::m
size_t m
number of bits in the bloom array
Definition bloom.h:143

bloom_t::k
size_t k
number of hash functions
Definition bloom.h:145

bloom_t::hash
hashfp_t * hash
the hash functions
Definition bloom.h:149

bloom_t::a
uint8_t * a
the bloom array
Definition bloom.h:147