backend_2access_2tablesample_2system_8c_source.html

/*-------------------------------------------------------------------------

 *

 * system.c

 *    support routines for SYSTEM tablesample method

 *

 * To ensure repeatability of samples, it is necessary that selection of a

 * given tuple be history-independent; otherwise syncscanning would break

 * repeatability, to say nothing of logically-irrelevant maintenance such

 * as physical extension or shortening of the relation.

 *

 * To achieve that, we proceed by hashing each candidate block number together

 * with the active seed, and then selecting it if the hash is less than the

 * cutoff value computed from the selection probability by BeginSampleScan.

 *

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 * IDENTIFICATION

 *    src/backend/access/tablesample/system.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include <math.h>


#include "access/tsmapi.h"

#include "catalog/pg_type.h"

#include "common/hashfn.h"

#include "optimizer/optimizer.h"

#include "utils/fmgrprotos.h"


/*
 * Private per-scan state for the SYSTEM sampling method.
 *
 * Allocated zero-filled by system_initsamplescan and stored in the scan
 * node's tsm_state; (re)initialized by system_beginsamplescan.
 */

typedef struct

{

    uint64      cutoff;         /* select blocks whose 32-bit hash is less
                                 * than this; 64 bits wide because it can be
                                 * as large as PG_UINT32_MAX + 1 */

    uint32      seed;           /* random seed, hashed together with each
                                 * candidate block number */

    BlockNumber nextblock;      /* next block to consider sampling */

    OffsetNumber lt;            /* last tuple returned from current block, or
                                 * InvalidOffsetNumber if none yet */

} SystemSamplerData;


/*
 * Forward declarations of the callbacks that tsm_system_handler installs
 * into the TsmRoutine descriptor.
 */
static void system_samplescangetsamplesize(PlannerInfo *root,
                                           RelOptInfo *baserel,
                                           List *paramexprs,
                                           BlockNumber *pages,
                                           double *tuples);
static void system_initsamplescan(SampleScanState *node, int eflags);
static void system_beginsamplescan(SampleScanState *node,
                                   Datum *params,
                                   int nparams,
                                   uint32 seed);
static BlockNumber system_nextsampleblock(SampleScanState *node,
                                          BlockNumber nblocks);
static OffsetNumber system_nextsampletuple(SampleScanState *node,
                                           BlockNumber blockno,
                                           OffsetNumber maxoffset);


/*

 * Create a TsmRoutine descriptor for the SYSTEM method.

 */

Datum

tsm_system_handler(PG_FUNCTION_ARGS)

{

    TsmRoutine *tsm = makeNode(TsmRoutine);


    tsm->parameterTypes = list_make1_oid(FLOAT4OID);

    tsm->repeatable_across_queries = true;

    tsm->repeatable_across_scans = true;

    tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;

    tsm->InitSampleScan = system_initsamplescan;

    tsm->BeginSampleScan = system_beginsamplescan;

    tsm->NextSampleBlock = system_nextsampleblock;

    tsm->NextSampleTuple = system_nextsampletuple;

    tsm->EndSampleScan = NULL;


    PG_RETURN_POINTER(tsm);

}


/*
 * system_samplescangetsamplesize
 *      Estimate how many pages and tuples the sample scan will visit.
 *
 * root, baserel: planner context and the relation being sampled.
 * paramexprs: TABLESAMPLE argument expressions; the first one is the
 *      sample percentage.
 * pages, tuples: output estimates.
 *
 * If the percentage cannot be reduced to a non-null constant, or the
 * constant is outside 0..100 (or NaN), a default fraction of 0.1 is used.
 */
static void
system_samplescangetsamplesize(PlannerInfo *root,
                               RelOptInfo *baserel,
                               List *paramexprs,
                               BlockNumber *pages,
                               double *tuples)
{
    /* Start from the default; override only with a usable constant */
    float4      fraction = 0.1f;
    Node       *pct_expr;

    /* Try to extract an estimate for the sample percentage */
    pct_expr = estimate_expression_value(root,
                                         (Node *) linitial(paramexprs));

    if (IsA(pct_expr, Const))
    {
        Const      *pct_const = (Const *) pct_expr;

        if (!pct_const->constisnull)
        {
            float4      pct = DatumGetFloat4(pct_const->constvalue);

            /* Bogus values leave the default fraction in place */
            if (!isnan(pct) && pct >= 0 && pct <= 100)
                fraction = pct / 100.0f;
        }
    }

    /* The same fraction is applied to the page and the tuple counts */
    *pages = clamp_row_est(baserel->pages * fraction);
    *tuples = clamp_row_est(baserel->tuples * fraction);
}


/*
 * system_initsamplescan
 *      Executor-setup hook: allocate the sampler's private state.
 *
 * The state is zero-filled here; system_beginsamplescan fills in the
 * real values.  eflags is not used.
 */
static void
system_initsamplescan(SampleScanState *node, int eflags)
{
    SystemSamplerData *state;

    state = (SystemSamplerData *) palloc0(sizeof(SystemSamplerData));
    node->tsm_state = state;
}


/*
 * system_beginsamplescan
 *      Validate the sample percentage and reset the sampler for a new scan.
 *
 * params[0] is the sample percentage as a float4 datum; nparams is not
 * examined.  seed is the random seed to hash block numbers with.
 *
 * Raises ERRCODE_INVALID_TABLESAMPLE_ARGUMENT if the percentage is NaN or
 * lies outside 0..100.
 */
static void
system_beginsamplescan(SampleScanState *node,
                       Datum *params,
                       int nparams,
                       uint32 seed)
{
    SystemSamplerData *state = (SystemSamplerData *) node->tsm_state;
    double      pct = DatumGetFloat4(params[0]);

    if (isnan(pct) || pct < 0 || pct > 100)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
                 errmsg("sample percentage must be between 0 and 100")));

    /* Restart the scan position from the beginning of the relation */
    state->seed = seed;
    state->nextblock = 0;
    state->lt = InvalidOffsetNumber;

    /*
     * A block is selected when its 32-bit hash is below the cutoff, so the
     * cutoff is the selection probability scaled by (PG_UINT32_MAX + 1).
     * That product can equal 2^32, hence the uint64 field.  The scaling
     * gives exact behavior at both extremes: 0% selects nothing and 100%
     * selects every block.
     */
    state->cutoff = (uint64) rint(((double) PG_UINT32_MAX + 1) * pct / 100);

    /*
     * Every tuple on each selected page is scanned, so pagemode visibility
     * checking is appropriate.  The bulkread buffer access strategy
     * probably makes sense unless only a very small fraction of the table
     * is sampled; the 1% threshold is a guess.
     */
    node->use_pagemode = true;
    node->use_bulkread = (pct >= 1);
}


/*
 * system_nextsampleblock
 *      Return the next block number to sample, or InvalidBlockNumber when
 *      the relation (nblocks blocks long) is exhausted.
 *
 * Candidate blocks are examined in order starting at the remembered
 * position.  A block is chosen when the hash of (block number, seed) is
 * below the cutoff stored in the sampler state.
 */
static BlockNumber
system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
{
    SystemSamplerData *state = (SystemSamplerData *) node->tsm_state;
    BlockNumber blk;
    uint32      key[2];

    /*
     * The hash input is an array of two uint32's: the block number and the
     * seed.  This is cheap to set up, and with the current implementation
     * of hash_any it gives machine-independent results, which is a nice
     * property for regression testing.
     *
     * The seed word does not change between candidate blocks, so it is
     * filled in once up front.
     */
    key[1] = state->seed;

    blk = state->nextblock;
    while (blk < nblocks)
    {
        uint32      blockhash;

        key[0] = blk;
        blockhash = DatumGetUInt32(hash_any((const unsigned char *) key,
                                            (int) sizeof(key)));

        if (blockhash < state->cutoff)
        {
            /* Selected: resume after this block on the next call */
            state->nextblock = blk + 1;
            return blk;
        }

        blk++;
    }

    /* Ran off the end; reset the position to 0 for safety */
    state->nextblock = 0;
    return InvalidBlockNumber;
}


/*
 * system_nextsampletuple
 *      Return the next tuple offset to sample in the current block.
 *
 * Block sampling returns every offset on each selected block, so this
 * simply steps from FirstOffsetNumber up to maxoffset.  blockno is not
 * used.
 *
 * It is fine to return an offset without knowing whether a visible tuple
 * (or any tuple at all) lives there; nodeSamplescan.c will deal with that.
 *
 * Once maxoffset has been passed, InvalidOffsetNumber is returned, which
 * tells SampleScan to move on to the next block.  The saved position is
 * cleared at the same time, so the following block starts from the first
 * offset again.
 */
static OffsetNumber
system_nextsampletuple(SampleScanState *node,
                       BlockNumber blockno,
                       OffsetNumber maxoffset)
{
    SystemSamplerData *state = (SystemSamplerData *) node->tsm_state;
    OffsetNumber previous = state->lt;
    OffsetNumber candidate;

    /* Step to the next possible offset on the page */
    candidate = (previous == InvalidOffsetNumber)
        ? FirstOffsetNumber
        : (OffsetNumber) (previous + 1);

    /* Past the end of the page? */
    if (candidate > maxoffset)
        candidate = InvalidOffsetNumber;

    state->lt = candidate;

    return candidate;
}

system_nextsampletuple
static OffsetNumber system_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset)
Definition: system.c:236

system_initsamplescan
static void system_initsamplescan(SampleScanState *node, int eflags)
Definition: system.c:130

system_beginsamplescan
static void system_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed)
Definition: system.c:139

tsm_system_handler
Datum tsm_system_handler(PG_FUNCTION_ARGS)
Definition: system.c:67

system_nextsampleblock
static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
Definition: system.c:178

system_samplescangetsamplesize
static void system_samplescangetsamplesize(PlannerInfo *root, RelOptInfo *baserel, List *paramexprs, BlockNumber *pages, double *tuples)
Definition: system.c:88

BlockNumber
uint32 BlockNumber
Definition: block.h:31

InvalidBlockNumber
#define InvalidBlockNumber
Definition: block.h:33

PG_UINT32_MAX
#define PG_UINT32_MAX
Definition: c.h:561

uint64
uint64_t uint64
Definition: c.h:503

uint32
uint32_t uint32
Definition: c.h:502

float4
float float4
Definition: c.h:600

estimate_expression_value
Node * estimate_expression_value(PlannerInfo *root, Node *node)
Definition: clauses.c:2397

clamp_row_est
double clamp_row_est(double nrows)
Definition: costsize.c:213

errcode
int errcode(int sqlerrcode)
Definition: elog.c:854

errmsg
int errmsg(const char *fmt,...)
Definition: elog.c:1071

ERROR
#define ERROR
Definition: elog.h:39

ereport
#define ereport(elevel,...)
Definition: elog.h:149

PG_RETURN_POINTER
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361

PG_FUNCTION_ARGS
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193

hashfn.h

hash_any
static Datum hash_any(const unsigned char *k, int keylen)
Definition: hashfn.h:31

for
for(;;)
Definition: hashfn_unstable.h:265

if
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81

palloc0
void * palloc0(Size size)
Definition: mcxt.c:1351

IsA
#define IsA(nodeptr, _type_)
Definition: nodes.h:164

makeNode
#define makeNode(_type_)
Definition: nodes.h:161

InvalidOffsetNumber
#define InvalidOffsetNumber
Definition: off.h:26

OffsetNumber
uint16 OffsetNumber
Definition: off.h:24

FirstOffsetNumber
#define FirstOffsetNumber
Definition: off.h:27

optimizer.h

list_make1_oid
#define list_make1_oid(x1)
Definition: pg_list.h:242

linitial
#define linitial(l)
Definition: pg_list.h:178

pg_type.h

postgres.h

DatumGetUInt32
static uint32 DatumGetUInt32(Datum X)
Definition: postgres.h:227

Datum
uintptr_t Datum
Definition: postgres.h:69

DatumGetFloat4
static float4 DatumGetFloat4(Datum X)
Definition: postgres.h:463

root
tree ctl root
Definition: radixtree.h:1857

hash
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715

Const
Definition: primnodes.h:324

List
Definition: pg_list.h:54

Node
Definition: nodes.h:135

PlannerInfo
Definition: pathnodes.h:217

RelOptInfo
Definition: pathnodes.h:884

RelOptInfo::tuples
Cardinality tuples
Definition: pathnodes.h:973

RelOptInfo::pages
BlockNumber pages
Definition: pathnodes.h:972

SampleScanState
Definition: execnodes.h:1633

SampleScanState::use_bulkread
bool use_bulkread
Definition: execnodes.h:1640

SampleScanState::tsm_state
void * tsm_state
Definition: execnodes.h:1639

SampleScanState::use_pagemode
bool use_pagemode
Definition: execnodes.h:1641

SystemSamplerData
Definition: system.c:38

SystemSamplerData::seed
uint32 seed
Definition: system.c:40

SystemSamplerData::nextblock
BlockNumber nextblock
Definition: system.c:41

SystemSamplerData::cutoff
uint64 cutoff
Definition: system.c:39

SystemSamplerData::lt
OffsetNumber lt
Definition: system.c:42

TsmRoutine
Definition: tsmapi.h:57

TsmRoutine::NextSampleTuple
NextSampleTuple_function NextSampleTuple
Definition: tsmapi.h:74

TsmRoutine::repeatable_across_scans
bool repeatable_across_scans
Definition: tsmapi.h:65

TsmRoutine::EndSampleScan
EndSampleScan_function EndSampleScan
Definition: tsmapi.h:75

TsmRoutine::SampleScanGetSampleSize
SampleScanGetSampleSize_function SampleScanGetSampleSize
Definition: tsmapi.h:68

TsmRoutine::BeginSampleScan
BeginSampleScan_function BeginSampleScan
Definition: tsmapi.h:72

TsmRoutine::NextSampleBlock
NextSampleBlock_function NextSampleBlock
Definition: tsmapi.h:73

TsmRoutine::InitSampleScan
InitSampleScan_function InitSampleScan
Definition: tsmapi.h:71

TsmRoutine::parameterTypes
List * parameterTypes
Definition: tsmapi.h:61

TsmRoutine::repeatable_across_queries
bool repeatable_across_queries
Definition: tsmapi.h:64

tsmapi.h