PostgreSQL Source Code  git master
sampling.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * sampling.c
4  * Relation block sampling routines.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/misc/sampling.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include <math.h>
19 
20 #include "utils/sampling.h"
21 
22 
23 /*
24  * BlockSampler_Init -- prepare for random sampling of blocknumbers
25  *
26  * BlockSampler provides algorithm for block level sampling of a relation
27  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28  * It selects a random sample of samplesize blocks out of
29  * the nblocks blocks in the table. If the table has less than
30  * samplesize blocks, all blocks are selected.
31  *
32  * Since we know the total number of blocks in advance, we can use the
33  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34  * algorithm.
35  *
36  * Returns the number of blocks that BlockSampler_Next will return.
37  */
39 BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
40  uint32 randseed)
41 {
42  bs->N = nblocks; /* measured table size */
43 
44  /*
45  * If we decide to reduce samplesize for tables that have less or not much
46  * more than samplesize blocks, here is the place to do it.
47  */
48  bs->n = samplesize;
49  bs->t = 0; /* blocks scanned so far */
50  bs->m = 0; /* blocks selected so far */
51 
52  sampler_random_init_state(randseed, &bs->randstate);
53 
54  return Min(bs->n, bs->N);
55 }
56 
57 bool
59 {
60  return (bs->t < bs->N) && (bs->m < bs->n);
61 }
62 
65 {
66  BlockNumber K = bs->N - bs->t; /* remaining blocks */
67  int k = bs->n - bs->m; /* blocks still to sample */
68  double p; /* probability to skip block */
69  double V; /* random */
70 
71  Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
72 
73  if ((BlockNumber) k >= K)
74  {
75  /* need all the rest */
76  bs->m++;
77  return bs->t++;
78  }
79 
80  /*----------
81  * It is not obvious that this code matches Knuth's Algorithm S.
82  * Knuth says to skip the current block with probability 1 - k/K.
83  * If we are to skip, we should advance t (hence decrease K), and
84  * repeat the same probabilistic test for the next block. The naive
85  * implementation thus requires a sampler_random_fract() call for each
86  * block number. But we can reduce this to one sampler_random_fract()
87  * call per selected block, by noting that each time the while-test
88  * succeeds, we can reinterpret V as a uniform random number in the range
89  * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
90  * the appropriate fraction of its former value, and our next loop
91  * makes the appropriate probabilistic test.
92  *
93  * We have initially K > k > 0. If the loop reduces K to equal k,
94  * the next while-test must fail since p will become exactly zero
95  * (we assume there will not be roundoff error in the division).
96  * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
97  * to be doubly sure about roundoff error.) Therefore K cannot become
98  * less than k, which means that we cannot fail to select enough blocks.
99  *----------
100  */
102  p = 1.0 - (double) k / (double) K;
103  while (V < p)
104  {
105  /* skip */
106  bs->t++;
107  K--; /* keep K == N - t */
108 
109  /* adjust p to be new cutoff point in reduced range */
110  p *= 1.0 - (double) k / (double) K;
111  }
112 
113  /* select */
114  bs->m++;
115  return bs->t++;
116 }
117 
118 /*
119  * These two routines embody Algorithm Z from "Random sampling with a
120  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
121  * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
122  * of the count S of records to skip before processing another record.
123  * It is computed primarily based on t, the number of records already read.
124  * The only extra state needed between calls is W, a random state variable.
125  *
126  * reservoir_init_selection_state computes the initial W value.
127  *
128  * Given that we've already read t records (t >= n), reservoir_get_next_S
129  * determines the number of records to skip before the next record is
130  * processed.
131  */
132 void
134 {
135  /*
136  * Reservoir sampling is not used anywhere where it would need to return
137  * repeatable results so we can initialize it randomly.
138  */
140  &rs->randstate);
141 
142  /* Initial value of W (for use when Algorithm Z is first applied) */
143  rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
144 }
145 
146 double
148 {
149  double S;
150 
151  /* The magic constant here is T from Vitter's paper */
152  if (t <= (22.0 * n))
153  {
154  /* Process records using Algorithm X until t is large enough */
155  double V,
156  quot;
157 
158  V = sampler_random_fract(&rs->randstate); /* Generate V */
159  S = 0;
160  t += 1;
161  /* Note: "num" in Vitter's code is always equal to t - n */
162  quot = (t - (double) n) / t;
163  /* Find min S satisfying (4.1) */
164  while (quot > V)
165  {
166  S += 1;
167  t += 1;
168  quot *= (t - (double) n) / t;
169  }
170  }
171  else
172  {
173  /* Now apply Algorithm Z */
174  double W = rs->W;
175  double term = t - (double) n + 1;
176 
177  for (;;)
178  {
179  double numer,
180  numer_lim,
181  denom;
182  double U,
183  X,
184  lhs,
185  rhs,
186  y,
187  tmp;
188 
189  /* Generate U and X */
191  X = t * (W - 1.0);
192  S = floor(X); /* S is tentatively set to floor(X) */
193  /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
194  tmp = (t + 1) / term;
195  lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
196  rhs = (((t + X) / (term + S)) * term) / t;
197  if (lhs <= rhs)
198  {
199  W = rhs / lhs;
200  break;
201  }
202  /* Test if U <= f(S)/cg(X) */
203  y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
204  if ((double) n < S)
205  {
206  denom = t;
207  numer_lim = term + S;
208  }
209  else
210  {
211  denom = t - (double) n + S;
212  numer_lim = t + 1;
213  }
214  for (numer = t + S; numer >= numer_lim; numer -= 1)
215  {
216  y *= numer / denom;
217  denom -= 1;
218  }
219  W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
220  if (exp(log(y) / n) <= (t + X) / t)
221  break;
222  }
223  rs->W = W;
224  }
225  return S;
226 }
227 
228 
229 /*----------
230  * Random number generator used by sampling
231  *----------
232  */
233 void
235 {
236  pg_prng_seed(randstate, (uint64) seed);
237 }
238 
239 /* Select a random value R uniformly distributed in (0 - 1) */
240 double
242 {
243  double res;
244 
245  /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
246  do
247  {
248  res = pg_prng_double(randstate);
249  } while (unlikely(res == 0.0));
250  return res;
251 }
252 
253 
254 /*
255  * Backwards-compatible API for block sampling
256  *
257  * This code is now deprecated, but since it's still in use by many FDWs,
258  * we should keep it for awhile at least. The functionality is the same as
259  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
260  * except that a common random state is used across all callers.
261  */
263 static bool oldrs_initialized = false;
264 
265 double
267 {
268  /* initialize if first time through */
270  {
272  &oldrs.randstate);
273  oldrs_initialized = true;
274  }
275 
276  /* and compute a random fraction */
278 }
279 
280 double
282 {
283  /* initialize if first time through */
285  {
287  &oldrs.randstate);
288  oldrs_initialized = true;
289  }
290 
291  /* Initial value of W (for use when Algorithm Z is first applied) */
292  return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
293 }
294 
295 double
296 anl_get_next_S(double t, int n, double *stateptr)
297 {
298  double result;
299 
300  oldrs.W = *stateptr;
301  result = reservoir_get_next_S(&oldrs, t, n);
302  *stateptr = oldrs.W;
303  return result;
304 }
uint32 BlockNumber
Definition: block.h:31
unsigned int uint32
Definition: c.h:518
#define Min(x, y)
Definition: c.h:1007
#define Assert(condition)
Definition: c.h:861
#define unlikely(x)
Definition: c.h:326
int y
Definition: isn.c:71
double pg_prng_double(pg_prng_state *state)
Definition: pg_prng.c:268
uint32 pg_prng_uint32(pg_prng_state *state)
Definition: pg_prng.c:227
void pg_prng_seed(pg_prng_state *state, uint64 seed)
Definition: pg_prng.c:89
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static ReservoirStateData oldrs
Definition: sampling.c:262
static bool oldrs_initialized
Definition: sampling.c:263
BlockNumber BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, uint32 randseed)
Definition: sampling.c:39
void reservoir_init_selection_state(ReservoirState rs, int n)
Definition: sampling.c:133
double anl_get_next_S(double t, int n, double *stateptr)
Definition: sampling.c:296
double sampler_random_fract(pg_prng_state *randstate)
Definition: sampling.c:241
bool BlockSampler_HasMore(BlockSampler bs)
Definition: sampling.c:58
BlockNumber BlockSampler_Next(BlockSampler bs)
Definition: sampling.c:64
double anl_init_selection_state(int n)
Definition: sampling.c:281
void sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
Definition: sampling.c:234
double anl_random_fract(void)
Definition: sampling.c:266
double reservoir_get_next_S(ReservoirState rs, double t, int n)
Definition: sampling.c:147
#define K(t)
Definition: sha1.c:66
#define W(n)
Definition: sha1.c:78
#define S(n, x)
Definition: sha1.c:73
BlockNumber N
Definition: sampling.h:30
pg_prng_state randstate
Definition: sampling.h:34
BlockNumber t
Definition: sampling.h:32
pg_prng_state randstate
Definition: sampling.h:49