PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
sampling.h File Reference
#include "common/pg_prng.h"
#include "storage/block.h"
Include dependency graph for sampling.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BlockSamplerData
 
struct  ReservoirStateData
 

Typedefs

typedef BlockSamplerDataBlockSampler
 
typedef ReservoirStateDataReservoirState
 

Functions

void sampler_random_init_state (uint32 seed, pg_prng_state *randstate)
 
double sampler_random_fract (pg_prng_state *randstate)
 
BlockNumber BlockSampler_Init (BlockSampler bs, BlockNumber nblocks, int samplesize, uint32 randseed)
 
bool BlockSampler_HasMore (BlockSampler bs)
 
BlockNumber BlockSampler_Next (BlockSampler bs)
 
void reservoir_init_selection_state (ReservoirState rs, int n)
 
double reservoir_get_next_S (ReservoirState rs, double t, int n)
 
double anl_random_fract (void)
 
double anl_init_selection_state (int n)
 
double anl_get_next_S (double t, int n, double *stateptr)
 

Typedef Documentation

◆ BlockSampler

Definition at line 37 of file sampling.h.

◆ ReservoirState

Definition at line 52 of file sampling.h.

Function Documentation

◆ anl_get_next_S()

double anl_get_next_S ( double  t,
int  n,
double *  stateptr 
)

Definition at line 296 of file sampling.c.

297{
298 double result;
299
300 oldrs.W = *stateptr;
301 result = reservoir_get_next_S(&oldrs, t, n);
302 *stateptr = oldrs.W;
303 return result;
304}
static ReservoirStateData oldrs
Definition: sampling.c:262
double reservoir_get_next_S(ReservoirState rs, double t, int n)
Definition: sampling.c:147

References oldrs, reservoir_get_next_S(), and ReservoirStateData::W.

◆ anl_init_selection_state()

double anl_init_selection_state ( int  n)

Definition at line 281 of file sampling.c.

282{
283 /* initialize if first time through */
285 {
288 oldrs_initialized = true;
289 }
290
291 /* Initial value of W (for use when Algorithm Z is first applied) */
292 return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
293}
#define unlikely(x)
Definition: c.h:333
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static bool oldrs_initialized
Definition: sampling.c:263
double sampler_random_fract(pg_prng_state *randstate)
Definition: sampling.c:241
void sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
Definition: sampling.c:234
pg_prng_state randstate
Definition: sampling.h:49

References oldrs, oldrs_initialized, pg_global_prng_state, pg_prng_uint32(), ReservoirStateData::randstate, sampler_random_fract(), sampler_random_init_state(), and unlikely.

◆ anl_random_fract()

double anl_random_fract ( void  )

Definition at line 266 of file sampling.c.

267{
268 /* initialize if first time through */
270 {
273 oldrs_initialized = true;
274 }
275
276 /* and compute a random fraction */
278}

References oldrs, oldrs_initialized, pg_global_prng_state, pg_prng_uint32(), ReservoirStateData::randstate, sampler_random_fract(), sampler_random_init_state(), and unlikely.

◆ BlockSampler_HasMore()

bool BlockSampler_HasMore ( BlockSampler  bs)

Definition at line 58 of file sampling.c.

59{
60 return (bs->t < bs->N) && (bs->m < bs->n);
61}
BlockNumber N
Definition: sampling.h:30
BlockNumber t
Definition: sampling.h:32

References BlockSamplerData::m, BlockSamplerData::N, BlockSamplerData::n, and BlockSamplerData::t.

Referenced by block_sampling_read_stream_next(), and BlockSampler_Next().

◆ BlockSampler_Init()

BlockNumber BlockSampler_Init ( BlockSampler  bs,
BlockNumber  nblocks,
int  samplesize,
uint32  randseed 
)

Definition at line 39 of file sampling.c.

41{
42 bs->N = nblocks; /* measured table size */
43
44 /*
45 * If we decide to reduce samplesize for tables that have less or not much
46 * more than samplesize blocks, here is the place to do it.
47 */
48 bs->n = samplesize;
49 bs->t = 0; /* blocks scanned so far */
50 bs->m = 0; /* blocks selected so far */
51
53
54 return Min(bs->n, bs->N);
55}
#define Min(x, y)
Definition: c.h:961
pg_prng_state randstate
Definition: sampling.h:34

References BlockSamplerData::m, Min, BlockSamplerData::N, BlockSamplerData::n, BlockSamplerData::randstate, sampler_random_init_state(), and BlockSamplerData::t.

Referenced by acquire_sample_rows().

◆ BlockSampler_Next()

BlockNumber BlockSampler_Next ( BlockSampler  bs)

Definition at line 64 of file sampling.c.

65{
66 BlockNumber K = bs->N - bs->t; /* remaining blocks */
67 int k = bs->n - bs->m; /* blocks still to sample */
68 double p; /* probability to skip block */
69 double V; /* random */
70
71 Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
72
73 if ((BlockNumber) k >= K)
74 {
75 /* need all the rest */
76 bs->m++;
77 return bs->t++;
78 }
79
80 /*----------
81 * It is not obvious that this code matches Knuth's Algorithm S.
82 * Knuth says to skip the current block with probability 1 - k/K.
83 * If we are to skip, we should advance t (hence decrease K), and
84 * repeat the same probabilistic test for the next block. The naive
85 * implementation thus requires a sampler_random_fract() call for each
86 * block number. But we can reduce this to one sampler_random_fract()
87 * call per selected block, by noting that each time the while-test
88 * succeeds, we can reinterpret V as a uniform random number in the range
89 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
90 * the appropriate fraction of its former value, and our next loop
91 * makes the appropriate probabilistic test.
92 *
93 * We have initially K > k > 0. If the loop reduces K to equal k,
94 * the next while-test must fail since p will become exactly zero
95 * (we assume there will not be roundoff error in the division).
96 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
97 * to be doubly sure about roundoff error.) Therefore K cannot become
98 * less than k, which means that we cannot fail to select enough blocks.
99 *----------
100 */
102 p = 1.0 - (double) k / (double) K;
103 while (V < p)
104 {
105 /* skip */
106 bs->t++;
107 K--; /* keep K == N - t */
108
109 /* adjust p to be new cutoff point in reduced range */
110 p *= 1.0 - (double) k / (double) K;
111 }
112
113 /* select */
114 bs->m++;
115 return bs->t++;
116}
uint32 BlockNumber
Definition: block.h:31
#define Assert(condition)
Definition: c.h:815
bool BlockSampler_HasMore(BlockSampler bs)
Definition: sampling.c:58
#define K(t)
Definition: sha1.c:66

References Assert, BlockSampler_HasMore(), K, BlockSamplerData::m, BlockSamplerData::N, BlockSamplerData::n, BlockSamplerData::randstate, sampler_random_fract(), and BlockSamplerData::t.

Referenced by block_sampling_read_stream_next().

◆ reservoir_get_next_S()

double reservoir_get_next_S ( ReservoirState  rs,
double  t,
int  n 
)

Definition at line 147 of file sampling.c.

148{
149 double S;
150
151 /* The magic constant here is T from Vitter's paper */
152 if (t <= (22.0 * n))
153 {
154 /* Process records using Algorithm X until t is large enough */
155 double V,
156 quot;
157
158 V = sampler_random_fract(&rs->randstate); /* Generate V */
159 S = 0;
160 t += 1;
161 /* Note: "num" in Vitter's code is always equal to t - n */
162 quot = (t - (double) n) / t;
163 /* Find min S satisfying (4.1) */
164 while (quot > V)
165 {
166 S += 1;
167 t += 1;
168 quot *= (t - (double) n) / t;
169 }
170 }
171 else
172 {
173 /* Now apply Algorithm Z */
174 double W = rs->W;
175 double term = t - (double) n + 1;
176
177 for (;;)
178 {
179 double numer,
180 numer_lim,
181 denom;
182 double U,
183 X,
184 lhs,
185 rhs,
186 y,
187 tmp;
188
189 /* Generate U and X */
191 X = t * (W - 1.0);
192 S = floor(X); /* S is tentatively set to floor(X) */
193 /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
194 tmp = (t + 1) / term;
195 lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
196 rhs = (((t + X) / (term + S)) * term) / t;
197 if (lhs <= rhs)
198 {
199 W = rhs / lhs;
200 break;
201 }
202 /* Test if U <= f(S)/cg(X) */
203 y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
204 if ((double) n < S)
205 {
206 denom = t;
207 numer_lim = term + S;
208 }
209 else
210 {
211 denom = t - (double) n + S;
212 numer_lim = t + 1;
213 }
214 for (numer = t + S; numer >= numer_lim; numer -= 1)
215 {
216 y *= numer / denom;
217 denom -= 1;
218 }
219 W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
220 if (exp(log(y) / n) <= (t + X) / t)
221 break;
222 }
223 rs->W = W;
224 }
225 return S;
226}
int y
Definition: isn.c:71
#define W(n)
Definition: sha1.c:78
#define S(n, x)
Definition: sha1.c:73

References ReservoirStateData::randstate, S, sampler_random_fract(), W, ReservoirStateData::W, and y.

Referenced by acquire_sample_rows(), analyze_row_processor(), anl_get_next_S(), and file_acquire_sample_rows().

◆ reservoir_init_selection_state()

void reservoir_init_selection_state ( ReservoirState  rs,
int  n 
)

Definition at line 133 of file sampling.c.

134{
135 /*
136 * Reservoir sampling is not used anywhere where it would need to return
137 * repeatable results so we can initialize it randomly.
138 */
140 &rs->randstate);
141
142 /* Initial value of W (for use when Algorithm Z is first applied) */
143 rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
144}

References pg_global_prng_state, pg_prng_uint32(), ReservoirStateData::randstate, sampler_random_fract(), sampler_random_init_state(), and ReservoirStateData::W.

Referenced by acquire_sample_rows(), file_acquire_sample_rows(), and postgresAcquireSampleRowsFunc().

◆ sampler_random_fract()

double sampler_random_fract ( pg_prng_state randstate)

Definition at line 241 of file sampling.c.

242{
243 double res;
244
245 /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
246 do
247 {
248 res = pg_prng_double(randstate);
249 } while (unlikely(res == 0.0));
250 return res;
251}
double pg_prng_double(pg_prng_state *state)
Definition: pg_prng.c:268

References pg_prng_double(), res, and unlikely.

Referenced by acquire_sample_rows(), analyze_row_processor(), anl_init_selection_state(), anl_random_fract(), BlockSampler_Next(), file_acquire_sample_rows(), random_relative_prime(), reservoir_get_next_S(), reservoir_init_selection_state(), system_rows_nextsampleblock(), and system_time_nextsampleblock().

◆ sampler_random_init_state()

void sampler_random_init_state ( uint32  seed,
pg_prng_state randstate 
)

Definition at line 234 of file sampling.c.

235{
236 pg_prng_seed(randstate, (uint64) seed);
237}
uint64_t uint64
Definition: c.h:489
void pg_prng_seed(pg_prng_state *state, uint64 seed)
Definition: pg_prng.c:89

References pg_prng_seed().

Referenced by anl_init_selection_state(), anl_random_fract(), BlockSampler_Init(), reservoir_init_selection_state(), system_rows_nextsampleblock(), and system_time_nextsampleblock().