bloomfilter_8c_source.html

/*-------------------------------------------------------------------------

 *

 * bloomfilter.c

 *      Space-efficient set membership testing

 *

 * A Bloom filter is a probabilistic data structure that is used to test an

 * element's membership of a set.  False positives are possible, but false

 * negatives are not; a test of membership of the set returns either "possibly

 * in set" or "definitely not in set".  This is typically very space efficient,

 * which can be a decisive advantage.

 *

 * Elements can be added to the set, but not removed.  The more elements that

 * are added, the larger the probability of false positives.  Caller must hint

 * an estimated total size of the set when the Bloom filter is initialized.

 * This is used to balance the use of memory against the final false positive

 * rate.

 *

 * The implementation is well suited to data synchronization problems between

 * unordered sets, especially where predictable performance is important and

 * some false positives are acceptable.  It's also well suited to cache

 * filtering problems where a relatively small and/or low cardinality set is

 * fingerprinted, especially when many subsequent membership tests end up

 * indicating that values of interest are not present.  That should save the

 * caller many authoritative lookups, such as expensive probes of a much larger

 * on-disk structure.

 *

 * Copyright (c) 2018-2025, PostgreSQL Global Development Group

 *

 * IDENTIFICATION

 *    src/backend/lib/bloomfilter.c

 *

 *-------------------------------------------------------------------------

 */

#include "postgres.h"


#include <math.h>


#include "common/hashfn.h"

#include "lib/bloomfilter.h"

#include "port/pg_bitutils.h"


#define MAX_HASH_FUNCS      10


struct bloom_filter

{

    /* K hash functions are used, seeded by caller's seed */

    int         k_hash_funcs;

    uint64      seed;

    /* m is bitset size, in bits.  Must be a power of two <= 2^32.  */

    uint64      m;

    unsigned char bitset[FLEXIBLE_ARRAY_MEMBER];

};


static int  my_bloom_power(uint64 target_bitset_bits);

static int  optimal_k(uint64 bitset_bits, int64 total_elems);

static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem,

                     size_t len);

static inline uint32 mod_m(uint32 val, uint64 m);


/*

 * Create Bloom filter in caller's memory context.  We aim for a false positive

 * rate of between 1% and 2% when bitset size is not constrained by memory

 * availability.

 *

 * total_elems is an estimate of the final size of the set.  It should be

 * approximately correct, but the implementation can cope well with it being

 * off by perhaps a factor of five or more.  See "Bloom Filters in

 * Probabilistic Verification" (Dillinger & Manolios, 2004) for details of why

 * this is the case.

 *

 * bloom_work_mem is sized in KB, in line with the general work_mem convention.

 * This determines the size of the underlying bitset (trivial bookkeeping space

 * isn't counted).  The bitset is always sized as a power of two number of

 * bits, and the largest possible bitset is 512MB (2^32 bits).  The

 * implementation allocates only enough memory to target its standard false

 * positive rate, using a simple formula with caller's total_elems estimate as

 * an input.  The bitset might be as small as 1MB, even when bloom_work_mem is

 * much higher.

 *

 * The Bloom filter is seeded using a value provided by the caller.  Using a

 * distinct seed value on every call makes it unlikely that the same false

 * positives will reoccur when the same set is fingerprinted a second time.

 * Callers that don't care about this pass a constant as their seed, typically

 * 0.  Callers can also use a pseudo-random seed, eg from pg_prng_uint64().

 */

bloom_filter *

bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)

{

    bloom_filter *filter;

    int         bloom_power;

    uint64      bitset_bytes;

    uint64      bitset_bits;


    /*

     * Aim for two bytes per element; this is sufficient to get a false

     * positive rate below 1%, independent of the size of the bitset or total

     * number of elements.  Also, if rounding down the size of the bitset to

     * the next lowest power of two turns out to be a significant drop, the

     * false positive rate still won't exceed 2% in almost all cases.

     */

    bitset_bytes = Min(bloom_work_mem * UINT64CONST(1024), total_elems * 2);

    bitset_bytes = Max(1024 * 1024, bitset_bytes);


    /*

     * Size in bits should be the highest power of two <= target.  bitset_bits

     * is uint64 because PG_UINT32_MAX is 2^32 - 1, not 2^32

     */

    bloom_power = my_bloom_power(bitset_bytes * BITS_PER_BYTE);

    bitset_bits = UINT64CONST(1) << bloom_power;

    bitset_bytes = bitset_bits / BITS_PER_BYTE;


    /* Allocate bloom filter with unset bitset */

    filter = palloc0(offsetof(bloom_filter, bitset) +

                     sizeof(unsigned char) * bitset_bytes);

    filter->k_hash_funcs = optimal_k(bitset_bits, total_elems);

    filter->seed = seed;

    filter->m = bitset_bits;


    return filter;

}


/*

 * Free Bloom filter

 */

void

bloom_free(bloom_filter *filter)

{

    pfree(filter);

}


/*

 * Add element to Bloom filter

 */

void

bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)

{

    uint32      hashes[MAX_HASH_FUNCS];

    int         i;


    k_hashes(filter, hashes, elem, len);


    /* Map a bit-wise address to a byte-wise address + bit offset */

    for (i = 0; i < filter->k_hash_funcs; i++)

    {

        filter->bitset[hashes[i] >> 3] |= 1 << (hashes[i] & 7);

    }

}


/*

 * Test if Bloom filter definitely lacks element.

 *

 * Returns true if the element is definitely not in the set of elements

 * observed by bloom_add_element().  Otherwise, returns false, indicating that

 * element is probably present in set.

 */

bool

bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)

{

    uint32      hashes[MAX_HASH_FUNCS];

    int         i;


    k_hashes(filter, hashes, elem, len);


    /* Map a bit-wise address to a byte-wise address + bit offset */

    for (i = 0; i < filter->k_hash_funcs; i++)

    {

        if (!(filter->bitset[hashes[i] >> 3] & (1 << (hashes[i] & 7))))

            return true;

    }


    return false;

}


/*

 * What proportion of bits are currently set?

 *

 * Returns proportion, expressed as a multiplier of filter size.  That should

 * generally be close to 0.5, even when we have more than enough memory to

 * ensure a false positive rate within target 1% to 2% band, since more hash

 * functions are used as more memory is available per element.

 *

 * This is the only instrumentation that is low overhead enough to appear in

 * debug traces.  When debugging Bloom filter code, it's likely to be far more

 * interesting to directly test the false positive rate.

 */

double

bloom_prop_bits_set(bloom_filter *filter)

{

    int         bitset_bytes = filter->m / BITS_PER_BYTE;

    uint64      bits_set = pg_popcount((char *) filter->bitset, bitset_bytes);


    return bits_set / (double) filter->m;

}


/*

 * Which element in the sequence of powers of two is less than or equal to

 * target_bitset_bits?

 *

 * Value returned here must be generally safe as the basis for actual bitset

 * size.

 *

 * Bitset is never allowed to exceed 2 ^ 32 bits (512MB).  This is sufficient

 * for the needs of all current callers, and allows us to use 32-bit hash

 * functions.  It also makes it easy to stay under the MaxAllocSize restriction

 * (caller needs to leave room for non-bitset fields that appear before

 * flexible array member, so a 1GB bitset would use an allocation that just

 * exceeds MaxAllocSize).

 */

static int

my_bloom_power(uint64 target_bitset_bits)

{

    int         bloom_power = -1;


    while (target_bitset_bits > 0 && bloom_power < 32)

    {

        bloom_power++;

        target_bitset_bits >>= 1;

    }


    return bloom_power;

}


/*

 * Determine optimal number of hash functions based on size of filter in bits,

 * and projected total number of elements.  The optimal number is the number

 * that minimizes the false positive rate.

 */

static int

optimal_k(uint64 bitset_bits, int64 total_elems)

{

    int         k = rint(log(2.0) * bitset_bits / total_elems);


    return Max(1, Min(k, MAX_HASH_FUNCS));

}


/*

 * Generate k hash values for element.

 *

 * Caller passes array, which is filled-in with k values determined by hashing

 * caller's element.

 *

 * Only 2 real independent hash functions are actually used to support an

 * interface of up to MAX_HASH_FUNCS hash functions; enhanced double hashing is

 * used to make this work.  The main reason we prefer enhanced double hashing

 * to classic double hashing is that the latter has an issue with collisions

 * when using power of two sized bitsets.  See Dillinger & Manolios for full

 * details.

 */

static void

k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len)

{

    uint64      hash;

    uint32      x,

                y;

    uint64      m;

    int         i;


    /* Use 64-bit hashing to get two independent 32-bit hashes */

    hash = DatumGetUInt64(hash_any_extended(elem, len, filter->seed));

    x = (uint32) hash;

    y = (uint32) (hash >> 32);

    m = filter->m;


    x = mod_m(x, m);

    y = mod_m(y, m);


    /* Accumulate hashes */

    hashes[0] = x;

    for (i = 1; i < filter->k_hash_funcs; i++)

    {

        x = mod_m(x + y, m);

        y = mod_m(y + i, m);


        hashes[i] = x;

    }

}


/*

 * Calculate "val MOD m" inexpensively.

 *

 * Assumes that m (which is bitset size) is a power of two.

 *

 * Using a power of two number of bits for bitset size allows us to use bitwise

 * AND operations to calculate the modulo of a hash value.  It's also a simple

 * way of avoiding the modulo bias effect.

 */

static inline uint32

mod_m(uint32 val, uint64 m)

{

    Assert(m <= PG_UINT32_MAX + UINT64CONST(1));

    Assert(((m - 1) & m) == 0);


    return val & (m - 1);

}

bloom_free
void bloom_free(bloom_filter *filter)
Definition: bloomfilter.c:126

MAX_HASH_FUNCS
#define MAX_HASH_FUNCS
Definition: bloomfilter.c:42

optimal_k
static int optimal_k(uint64 bitset_bits, int64 total_elems)
Definition: bloomfilter.c:229

bloom_create
bloom_filter * bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
Definition: bloomfilter.c:87

bloom_prop_bits_set
double bloom_prop_bits_set(bloom_filter *filter)
Definition: bloomfilter.c:187

mod_m
static uint32 mod_m(uint32 val, uint64 m)
Definition: bloomfilter.c:288

bloom_lacks_element
bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:157

my_bloom_power
static int my_bloom_power(uint64 target_bitset_bits)
Definition: bloomfilter.c:210

k_hashes
static void k_hashes(bloom_filter *filter, uint32 *hashes, unsigned char *elem, size_t len)
Definition: bloomfilter.c:250

bloom_add_element
void bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:135

bloomfilter.h

Min
#define Min(x, y)
Definition: c.h:975

PG_UINT32_MAX
#define PG_UINT32_MAX
Definition: c.h:561

Max
#define Max(x, y)
Definition: c.h:969

int64
int64_t int64
Definition: c.h:499

FLEXIBLE_ARRAY_MEMBER
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:434

uint64
uint64_t uint64
Definition: c.h:503

uint32
uint32_t uint32
Definition: c.h:502

UINT64CONST
#define UINT64CONST(x)
Definition: c.h:517

hashfn.h

hash_any_extended
static Datum hash_any_extended(const unsigned char *k, int keylen, uint64 seed)
Definition: hashfn.h:37

Assert
Assert(PointerIsAligned(start, uint64))

val
long val
Definition: informix.c:689

y
int y
Definition: isn.c:76

x
int x
Definition: isn.c:75

i
int i
Definition: isn.c:77

pfree
void pfree(void *pointer)
Definition: mcxt.c:1528

palloc0
void * palloc0(Size size)
Definition: mcxt.c:1351

pg_bitutils.h

pg_popcount
static uint64 pg_popcount(const char *buf, int bytes)
Definition: pg_bitutils.h:363

BITS_PER_BYTE
#define BITS_PER_BYTE
Definition: pg_config_manual.h:106

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

postgres.h

DatumGetUInt64
static uint64 DatumGetUInt64(Datum X)
Definition: postgres.h:424

hash
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715

bloom_filter
Definition: bloomfilter.c:45

bloom_filter::bitset
unsigned char bitset[FLEXIBLE_ARRAY_MEMBER]
Definition: bloomfilter.c:51

bloom_filter::seed
uint64 seed
Definition: bloomfilter.c:48

bloom_filter::m
uint64 m
Definition: bloomfilter.c:50

bloom_filter::k_hash_funcs
int k_hash_funcs
Definition: bloomfilter.c:47