PostgreSQL Source Code  git master
sampling.c File Reference
#include "postgres.h"
#include <math.h>
#include "utils/sampling.h"
Include dependency graph for sampling.c:

Go to the source code of this file.

Functions

void BlockSampler_Init (BlockSampler bs, BlockNumber nblocks, int samplesize, long randseed)
 
bool BlockSampler_HasMore (BlockSampler bs)
 
BlockNumber BlockSampler_Next (BlockSampler bs)
 
void reservoir_init_selection_state (ReservoirState rs, int n)
 
double reservoir_get_next_S (ReservoirState rs, double t, int n)
 
void sampler_random_init_state (long seed, SamplerRandomState randstate)
 
double sampler_random_fract (SamplerRandomState randstate)
 
double anl_random_fract (void)
 
double anl_init_selection_state (int n)
 
double anl_get_next_S (double t, int n, double *stateptr)
 

Variables

static ReservoirStateData oldrs
 

Function Documentation

◆ anl_get_next_S()

double anl_get_next_S ( double  t,
int  n,
double *  stateptr 
)

Definition at line 284 of file sampling.c.

References reservoir_get_next_S(), and ReservoirStateData::W.

285 {
286  double result;
287 
288  oldrs.W = *stateptr;
289  result = reservoir_get_next_S(&oldrs, t, n);
290  *stateptr = oldrs.W;
291  return result;
292 }
static ReservoirStateData oldrs
Definition: sampling.c:259
double reservoir_get_next_S(ReservoirState rs, double t, int n)
Definition: sampling.c:142

◆ anl_init_selection_state()

double anl_init_selection_state ( int  n)

Definition at line 273 of file sampling.c.

References random(), ReservoirStateData::randstate, sampler_random_fract(), and sampler_random_init_state().

274 {
275  /* initialize if first time through */
276  if (oldrs.randstate[0] == 0)
278 
279  /* Initial value of W (for use when Algorithm Z is first applied) */
280  return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281 }
void sampler_random_init_state(long seed, SamplerRandomState randstate)
Definition: sampling.c:229
long random(void)
Definition: random.c:22
double sampler_random_fract(SamplerRandomState randstate)
Definition: sampling.c:238
static ReservoirStateData oldrs
Definition: sampling.c:259
SamplerRandomState randstate
Definition: sampling.h:50

◆ anl_random_fract()

double anl_random_fract ( void  )

Definition at line 262 of file sampling.c.

References random(), ReservoirStateData::randstate, sampler_random_fract(), and sampler_random_init_state().

263 {
264  /* initialize if first time through */
265  if (oldrs.randstate[0] == 0)
267 
268  /* and compute a random fraction */
270 }
void sampler_random_init_state(long seed, SamplerRandomState randstate)
Definition: sampling.c:229
long random(void)
Definition: random.c:22
double sampler_random_fract(SamplerRandomState randstate)
Definition: sampling.c:238
static ReservoirStateData oldrs
Definition: sampling.c:259
SamplerRandomState randstate
Definition: sampling.h:50

◆ BlockSampler_HasMore()

bool BlockSampler_HasMore ( BlockSampler  bs)

Definition at line 54 of file sampling.c.

References BlockSamplerData::m, BlockSamplerData::N, BlockSamplerData::n, and BlockSamplerData::t.

Referenced by acquire_sample_rows(), and BlockSampler_Next().

55 {
56  return (bs->t < bs->N) && (bs->m < bs->n);
57 }
BlockNumber t
Definition: sampling.h:33
BlockNumber N
Definition: sampling.h:31

◆ BlockSampler_Init()

void BlockSampler_Init ( BlockSampler  bs,
BlockNumber  nblocks,
int  samplesize,
long  randseed 
)

Definition at line 37 of file sampling.c.

References BlockSamplerData::m, BlockSamplerData::N, BlockSamplerData::n, BlockSamplerData::randstate, sampler_random_init_state(), and BlockSamplerData::t.

Referenced by acquire_sample_rows().

39 {
40  bs->N = nblocks; /* measured table size */
41 
42  /*
43  * If we decide to reduce samplesize for tables that have less or not much
44  * more than samplesize blocks, here is the place to do it.
45  */
46  bs->n = samplesize;
47  bs->t = 0; /* blocks scanned so far */
48  bs->m = 0; /* blocks selected so far */
49 
50  sampler_random_init_state(randseed, bs->randstate);
51 }
void sampler_random_init_state(long seed, SamplerRandomState randstate)
Definition: sampling.c:229
BlockNumber t
Definition: sampling.h:33
BlockNumber N
Definition: sampling.h:31
SamplerRandomState randstate
Definition: sampling.h:35

◆ BlockSampler_Next()

BlockNumber BlockSampler_Next ( BlockSampler  bs)

Definition at line 60 of file sampling.c.

References Assert, BlockSampler_HasMore(), K, BlockSamplerData::m, BlockSamplerData::N, BlockSamplerData::n, BlockSamplerData::randstate, sampler_random_fract(), and BlockSamplerData::t.

Referenced by acquire_sample_rows().

61 {
62  BlockNumber K = bs->N - bs->t; /* remaining blocks */
63  int k = bs->n - bs->m; /* blocks still to sample */
64  double p; /* probability to skip block */
65  double V; /* random */
66 
67  Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68 
69  if ((BlockNumber) k >= K)
70  {
71  /* need all the rest */
72  bs->m++;
73  return bs->t++;
74  }
75 
76  /*----------
77  * It is not obvious that this code matches Knuth's Algorithm S.
78  * Knuth says to skip the current block with probability 1 - k/K.
79  * If we are to skip, we should advance t (hence decrease K), and
80  * repeat the same probabilistic test for the next block. The naive
81  * implementation thus requires a sampler_random_fract() call for each
82  * block number. But we can reduce this to one sampler_random_fract()
83  * call per selected block, by noting that each time the while-test
84  * succeeds, we can reinterpret V as a uniform random number in the range
85  * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86  * the appropriate fraction of its former value, and our next loop
87  * makes the appropriate probabilistic test.
88  *
89  * We have initially K > k > 0. If the loop reduces K to equal k,
90  * the next while-test must fail since p will become exactly zero
91  * (we assume there will not be roundoff error in the division).
92  * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93  * to be doubly sure about roundoff error.) Therefore K cannot become
94  * less than k, which means that we cannot fail to select enough blocks.
95  *----------
96  */
98  p = 1.0 - (double) k / (double) K;
99  while (V < p)
100  {
101  /* skip */
102  bs->t++;
103  K--; /* keep K == N - t */
104 
105  /* adjust p to be new cutoff point in reduced range */
106  p *= 1.0 - (double) k / (double) K;
107  }
108 
109  /* select */
110  bs->m++;
111  return bs->t++;
112 }
bool BlockSampler_HasMore(BlockSampler bs)
Definition: sampling.c:54
double sampler_random_fract(SamplerRandomState randstate)
Definition: sampling.c:238
uint32 BlockNumber
Definition: block.h:31
BlockNumber t
Definition: sampling.h:33
#define K(t)
Definition: sha1.c:48
#define Assert(condition)
Definition: c.h:670
BlockNumber N
Definition: sampling.h:31
SamplerRandomState randstate
Definition: sampling.h:35

◆ reservoir_get_next_S()

double reservoir_get_next_S ( ReservoirState  rs,
double  t,
int  n 
)

Definition at line 142 of file sampling.c.

References ReservoirStateData::randstate, S, sampler_random_fract(), ReservoirStateData::W, and W.

Referenced by acquire_sample_rows(), analyze_row_processor(), anl_get_next_S(), and file_acquire_sample_rows().

143 {
144  double S;
145 
146  /* The magic constant here is T from Vitter's paper */
147  if (t <= (22.0 * n))
148  {
149  /* Process records using Algorithm X until t is large enough */
150  double V,
151  quot;
152 
153  V = sampler_random_fract(rs->randstate); /* Generate V */
154  S = 0;
155  t += 1;
156  /* Note: "num" in Vitter's code is always equal to t - n */
157  quot = (t - (double) n) / t;
158  /* Find min S satisfying (4.1) */
159  while (quot > V)
160  {
161  S += 1;
162  t += 1;
163  quot *= (t - (double) n) / t;
164  }
165  }
166  else
167  {
168  /* Now apply Algorithm Z */
169  double W = rs->W;
170  double term = t - (double) n + 1;
171 
172  for (;;)
173  {
174  double numer,
175  numer_lim,
176  denom;
177  double U,
178  X,
179  lhs,
180  rhs,
181  y,
182  tmp;
183 
184  /* Generate U and X */
186  X = t * (W - 1.0);
187  S = floor(X); /* S is tentatively set to floor(X) */
188  /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189  tmp = (t + 1) / term;
190  lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191  rhs = (((t + X) / (term + S)) * term) / t;
192  if (lhs <= rhs)
193  {
194  W = rhs / lhs;
195  break;
196  }
197  /* Test if U <= f(S)/cg(X) */
198  y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199  if ((double) n < S)
200  {
201  denom = t;
202  numer_lim = term + S;
203  }
204  else
205  {
206  denom = t - (double) n + S;
207  numer_lim = t + 1;
208  }
209  for (numer = t + S; numer >= numer_lim; numer -= 1)
210  {
211  y *= numer / denom;
212  denom -= 1;
213  }
214  W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215  if (exp(log(y) / n) <= (t + X) / t)
216  break;
217  }
218  rs->W = W;
219  }
220  return S;
221 }
double sampler_random_fract(SamplerRandomState randstate)
Definition: sampling.c:238
#define W(n)
Definition: sha1.c:60
#define S(n, x)
Definition: sha1.c:55
SamplerRandomState randstate
Definition: sampling.h:50

◆ reservoir_init_selection_state()

void reservoir_init_selection_state ( ReservoirState  rs,
int  n 
)

Definition at line 129 of file sampling.c.

References random(), ReservoirStateData::randstate, sampler_random_fract(), sampler_random_init_state(), and ReservoirStateData::W.

Referenced by acquire_sample_rows(), file_acquire_sample_rows(), and postgresAcquireSampleRowsFunc().

130 {
131  /*
132  * Reservoir sampling is not used anywhere where it would need to return
133  * repeatable results so we can initialize it randomly.
134  */
136 
137  /* Initial value of W (for use when Algorithm Z is first applied) */
138  rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139 }
void sampler_random_init_state(long seed, SamplerRandomState randstate)
Definition: sampling.c:229
long random(void)
Definition: random.c:22
double sampler_random_fract(SamplerRandomState randstate)
Definition: sampling.c:238
SamplerRandomState randstate
Definition: sampling.h:50

◆ sampler_random_fract()

double sampler_random_fract ( SamplerRandomState  randstate)

Definition at line 238 of file sampling.c.

References pg_erand48().

Referenced by acquire_sample_rows(), analyze_row_processor(), anl_init_selection_state(), anl_random_fract(), BlockSampler_Next(), file_acquire_sample_rows(), random_relative_prime(), reservoir_get_next_S(), reservoir_init_selection_state(), system_rows_nextsampleblock(), and system_time_nextsampleblock().

239 {
240  double res;
241 
242  /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243  do
244  {
245  res = pg_erand48(randstate);
246  } while (res == 0.0);
247  return res;
248 }
double pg_erand48(unsigned short xseed[3])
Definition: erand48.c:79

◆ sampler_random_init_state()

void sampler_random_init_state ( long  seed,
SamplerRandomState  randstate 
)

Definition at line 229 of file sampling.c.

Referenced by anl_init_selection_state(), anl_random_fract(), BlockSampler_Init(), reservoir_init_selection_state(), system_rows_nextsampleblock(), and system_time_nextsampleblock().

230 {
231  randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
232  randstate[1] = (unsigned short) seed;
233  randstate[2] = (unsigned short) (seed >> 16);
234 }

Variable Documentation

◆ oldrs

ReservoirStateData oldrs
static

Definition at line 259 of file sampling.c.