sampling_8c_source.html

/*-------------------------------------------------------------------------

 *

 * sampling.c

 *    Relation block sampling routines.

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 *

 * IDENTIFICATION

 *    src/backend/utils/misc/sampling.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include <math.h>


#include "utils/sampling.h"


/*

 * BlockSampler_Init -- prepare for random sampling of blocknumbers

 *

 * BlockSampler provides algorithm for block level sampling of a relation

 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")

 * It selects a random sample of samplesize blocks out of

 * the nblocks blocks in the table. If the table has less than

 * samplesize blocks, all blocks are selected.

 *

 * Since we know the total number of blocks in advance, we can use the

 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's

 * algorithm.

 *

 * Returns the number of blocks that BlockSampler_Next will return.

 */

BlockNumber

BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,

                  uint32 randseed)

{

    bs->N = nblocks;            /* measured table size */


    /*

     * If we decide to reduce samplesize for tables that have less or not much

     * more than samplesize blocks, here is the place to do it.

     */

    bs->n = samplesize;

    bs->t = 0;                  /* blocks scanned so far */

    bs->m = 0;                  /* blocks selected so far */


    sampler_random_init_state(randseed, &bs->randstate);


    return Min(bs->n, bs->N);

}


bool

BlockSampler_HasMore(BlockSampler bs)

{

    return (bs->t < bs->N) && (bs->m < bs->n);

}


BlockNumber

BlockSampler_Next(BlockSampler bs)

{

    BlockNumber K = bs->N - bs->t;  /* remaining blocks */

    int         k = bs->n - bs->m;  /* blocks still to sample */

    double      p;              /* probability to skip block */

    double      V;              /* random */


    Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */


    if ((BlockNumber) k >= K)

    {

        /* need all the rest */

        bs->m++;

        return bs->t++;

    }


    /*----------

     * It is not obvious that this code matches Knuth's Algorithm S.

     * Knuth says to skip the current block with probability 1 - k/K.

     * If we are to skip, we should advance t (hence decrease K), and

     * repeat the same probabilistic test for the next block.  The naive

     * implementation thus requires a sampler_random_fract() call for each

     * block number.  But we can reduce this to one sampler_random_fract()

     * call per selected block, by noting that each time the while-test

     * succeeds, we can reinterpret V as a uniform random number in the range

     * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be

     * the appropriate fraction of its former value, and our next loop

     * makes the appropriate probabilistic test.

     *

     * We have initially K > k > 0.  If the loop reduces K to equal k,

     * the next while-test must fail since p will become exactly zero

     * (we assume there will not be roundoff error in the division).

     * (Note: Knuth suggests a "<=" loop condition, but we use "<" just

     * to be doubly sure about roundoff error.)  Therefore K cannot become

     * less than k, which means that we cannot fail to select enough blocks.

     *----------

     */

    V = sampler_random_fract(&bs->randstate);

    p = 1.0 - (double) k / (double) K;

    while (V < p)

    {

        /* skip */

        bs->t++;

        K--;                    /* keep K == N - t */


        /* adjust p to be new cutoff point in reduced range */

        p *= 1.0 - (double) k / (double) K;

    }


    /* select */

    bs->m++;

    return bs->t++;

}


/*

 * These two routines embody Algorithm Z from "Random sampling with a

 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1

 * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms

 * of the count S of records to skip before processing another record.

 * It is computed primarily based on t, the number of records already read.

 * The only extra state needed between calls is W, a random state variable.

 *

 * reservoir_init_selection_state computes the initial W value.

 *

 * Given that we've already read t records (t >= n), reservoir_get_next_S

 * determines the number of records to skip before the next record is

 * processed.

 */

void

reservoir_init_selection_state(ReservoirState rs, int n)

{

    /*

     * Reservoir sampling is not used anywhere where it would need to return

     * repeatable results so we can initialize it randomly.

     */

    sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),

                              &rs->randstate);


    /* Initial value of W (for use when Algorithm Z is first applied) */

    rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);

}


double

reservoir_get_next_S(ReservoirState rs, double t, int n)

{

    double      S;


    /* The magic constant here is T from Vitter's paper */

    if (t <= (22.0 * n))

    {

        /* Process records using Algorithm X until t is large enough */

        double      V,

                    quot;


        V = sampler_random_fract(&rs->randstate);   /* Generate V */

        S = 0;

        t += 1;

        /* Note: "num" in Vitter's code is always equal to t - n */

        quot = (t - (double) n) / t;

        /* Find min S satisfying (4.1) */

        while (quot > V)

        {

            S += 1;

            t += 1;

            quot *= (t - (double) n) / t;

        }

    }

    else

    {

        /* Now apply Algorithm Z */

        double      W = rs->W;

        double      term = t - (double) n + 1;


        for (;;)

        {

            double      numer,

                        numer_lim,

                        denom;

            double      U,

                        X,

                        lhs,

                        rhs,

                        y,

                        tmp;


            /* Generate U and X */

            U = sampler_random_fract(&rs->randstate);

            X = t * (W - 1.0);

            S = floor(X);       /* S is tentatively set to floor(X) */

            /* Test if U <= h(S)/cg(X) in the manner of (6.3) */

            tmp = (t + 1) / term;

            lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);

            rhs = (((t + X) / (term + S)) * term) / t;

            if (lhs <= rhs)

            {

                W = rhs / lhs;

                break;

            }

            /* Test if U <= f(S)/cg(X) */

            y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);

            if ((double) n < S)

            {

                denom = t;

                numer_lim = term + S;

            }

            else

            {

                denom = t - (double) n + S;

                numer_lim = t + 1;

            }

            for (numer = t + S; numer >= numer_lim; numer -= 1)

            {

                y *= numer / denom;

                denom -= 1;

            }

            W = exp(-log(sampler_random_fract(&rs->randstate)) / n);    /* Generate W in advance */

            if (exp(log(y) / n) <= (t + X) / t)

                break;

        }

        rs->W = W;

    }

    return S;

}


/*----------

 * Random number generator used by sampling

 *----------

 */

void

sampler_random_init_state(uint32 seed, pg_prng_state *randstate)

{

    pg_prng_seed(randstate, (uint64) seed);

}


/* Select a random value R uniformly distributed in (0 - 1) */

double

sampler_random_fract(pg_prng_state *randstate)

{

    double      res;


    /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */

    do

    {

        res = pg_prng_double(randstate);

    } while (unlikely(res == 0.0));

    return res;

}


/*

 * Backwards-compatible API for block sampling

 *

 * This code is now deprecated, but since it's still in use by many FDWs,

 * we should keep it for awhile at least.  The functionality is the same as

 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,

 * except that a common random state is used across all callers.

 */

static ReservoirStateData oldrs;

static bool oldrs_initialized = false;


double

anl_random_fract(void)

{

    /* initialize if first time through */

    if (unlikely(!oldrs_initialized))

    {

        sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),

                                  &oldrs.randstate);

        oldrs_initialized = true;

    }


    /* and compute a random fraction */

    return sampler_random_fract(&oldrs.randstate);

}


double

anl_init_selection_state(int n)

{

    /* initialize if first time through */

    if (unlikely(!oldrs_initialized))

    {

        sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),

                                  &oldrs.randstate);

        oldrs_initialized = true;

    }


    /* Initial value of W (for use when Algorithm Z is first applied) */

    return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);

}


double

anl_get_next_S(double t, int n, double *stateptr)

{

    double      result;


    oldrs.W = *stateptr;

    result = reservoir_get_next_S(&oldrs, t, n);

    *stateptr = oldrs.W;

    return result;

}

BlockNumber
uint32 BlockNumber
Definition: block.h:31

Min
#define Min(x, y)
Definition: c.h:975

uint64
uint64_t uint64
Definition: c.h:503

unlikely
#define unlikely(x)
Definition: c.h:347

uint32
uint32_t uint32
Definition: c.h:502

Assert
Assert(PointerIsAligned(start, uint64))

y
int y
Definition: isn.c:76

pg_prng_double
double pg_prng_double(pg_prng_state *state)
Definition: pg_prng.c:268

pg_prng_uint32
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227

pg_prng_seed
void pg_prng_seed(pg_prng_state *state, uint64 seed)
Definition: pg_prng.c:89

pg_global_prng_state
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34

postgres.h

oldrs
static ReservoirStateData oldrs
Definition: sampling.c:262

oldrs_initialized
static bool oldrs_initialized
Definition: sampling.c:263

BlockSampler_Init
BlockNumber BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, uint32 randseed)
Definition: sampling.c:39

reservoir_init_selection_state
void reservoir_init_selection_state(ReservoirState rs, int n)
Definition: sampling.c:133

anl_get_next_S
double anl_get_next_S(double t, int n, double *stateptr)
Definition: sampling.c:296

sampler_random_fract
double sampler_random_fract(pg_prng_state *randstate)
Definition: sampling.c:241

BlockSampler_HasMore
bool BlockSampler_HasMore(BlockSampler bs)
Definition: sampling.c:58

BlockSampler_Next
BlockNumber BlockSampler_Next(BlockSampler bs)
Definition: sampling.c:64

anl_init_selection_state
double anl_init_selection_state(int n)
Definition: sampling.c:281

sampler_random_init_state
void sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
Definition: sampling.c:234

anl_random_fract
double anl_random_fract(void)
Definition: sampling.c:266

reservoir_get_next_S
double reservoir_get_next_S(ReservoirState rs, double t, int n)
Definition: sampling.c:147

sampling.h

K
#define K(t)
Definition: sha1.c:66

W
#define W(n)
Definition: sha1.c:78

S
#define S(n, x)
Definition: sha1.c:73

BlockSamplerData
Definition: sampling.h:29

BlockSamplerData::N
BlockNumber N
Definition: sampling.h:30

BlockSamplerData::m
int m
Definition: sampling.h:33

BlockSamplerData::n
int n
Definition: sampling.h:31

BlockSamplerData::randstate
pg_prng_state randstate
Definition: sampling.h:34

BlockSamplerData::t
BlockNumber t
Definition: sampling.h:32

ReservoirStateData
Definition: sampling.h:47

ReservoirStateData::W
double W
Definition: sampling.h:48

ReservoirStateData::randstate
pg_prng_state randstate
Definition: sampling.h:49

pg_prng_state
Definition: pg_prng.h:20