hashsort_8c_source.html

/*-------------------------------------------------------------------------

 *

 * hashsort.c

 *      Sort tuples for insertion into a new hash index.

 *

 * When building a very large hash index, we pre-sort the tuples by bucket

 * number to improve locality of access to the index, and thereby avoid

 * thrashing.  We use tuplesort.c to sort the given index tuples into order.

 *

 * Note: if the number of rows in the table has been underestimated,

 * bucket splits may occur during the index build.  In that case we'd

 * be inserting into two or more buckets for each possible masked-off

 * hash code value.  That's no big problem though, since we'll still have

 * plenty of locality of access.

 *

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 * IDENTIFICATION

 *    src/backend/access/hash/hashsort.c

 *

 *-------------------------------------------------------------------------

 */


#include "postgres.h"


#include "access/hash.h"

#include "commands/progress.h"

#include "miscadmin.h"

#include "pgstat.h"

#include "port/pg_bitutils.h"

#include "utils/tuplesort.h"


/*
 * Status record for spooling/sorting phase.
 *
 * An HSpool is allocated and filled in by _h_spoolinit, loaded one tuple at
 * a time by _h_spool, read back in sorted order by _h_indexbuild, and
 * released by _h_spooldestroy.
 */
struct HSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;          /* index relation passed to tuplesort and to
                                 * _hash_doinsert */

    /*
     * We sort the hash keys based on the buckets they belong to, then by the
     * hash values themselves, to optimize insertions onto hash pages.  The
     * masks below are used in _hash_hashkey2bucket to determine the bucket of
     * a given hash key.
     *
     * All three values are computed once in _h_spoolinit from the number of
     * buckets: low_mask is high_mask shifted right by one bit, and
     * max_buckets holds num_buckets - 1, which is what gets passed as the
     * "maxbucket" argument of _hash_hashkey2bucket.
     */
    uint32      high_mask;
    uint32      low_mask;
    uint32      max_buckets;
};


/*
 * _h_spoolinit -- allocate a spool and start the tuplesort behind it.
 *
 * "heap" and "index" are handed through to tuplesort_begin_index_hash;
 * "num_buckets" is the number of buckets currently in the index and is
 * used to derive the bucket masks.
 *
 * Returns a zero-initialized, palloc'd HSpool with every field filled in.
 * Release it with _h_spooldestroy.
 */
HSpool *
_h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
{
    HSpool     *spool;
    uint32      highmask;
    uint32      lowmask;
    uint32      maxbucket;

    spool = (HSpool *) palloc0(sizeof(HSpool));
    spool->index = index;

    /*
     * Work out the bitmasks applied to hash code values, given that the
     * index presently has num_buckets buckets.
     *
     * NOTE : This hash mask calculation should be in sync with similar
     * calculation in _hash_init_metabuffer.
     */
    highmask = pg_nextpower2_32(num_buckets + 1) - 1;
    lowmask = highmask >> 1;
    maxbucket = num_buckets - 1;

    spool->high_mask = highmask;
    spool->low_mask = lowmask;
    spool->max_buckets = maxbucket;

    /*
     * The sort area is sized by maintenance_work_mem, not work_mem, so that
     * index creation goes faster.  That should be OK because a single
     * backend can't run multiple index creations in parallel.
     */
    spool->sortstate = tuplesort_begin_index_hash(heap, index,
                                                  highmask, lowmask,
                                                  maxbucket,
                                                  maintenance_work_mem,
                                                  NULL,
                                                  TUPLESORT_NONE);

    return spool;
}


/*
 * _h_spooldestroy -- tear down a spool.
 *
 * Ends the tuplesort held by the spool, then frees the HSpool itself.  The
 * pointer must not be used again after this call.
 */
void
_h_spooldestroy(HSpool *hspool)
{
    Tuplesortstate *sortstate = hspool->sortstate;

    /* shut the sort down before releasing the struct that points at it */
    tuplesort_end(sortstate);
    pfree(hspool);
}


/*
 * _h_spool -- add one index entry to the spool's sort.
 *
 * "self", "values" and "isnull" are forwarded unchanged, together with the
 * spool's index relation, to tuplesort_putindextuplevalues.
 */
void
_h_spool(HSpool *hspool, ItemPointer self, const Datum *values, const bool *isnull)
{
    Tuplesortstate *sortstate = hspool->sortstate;
    Relation    indexRel = hspool->index;

    tuplesort_putindextuplevalues(sortstate, indexRel, self, values, isnull);
}


/*
 * _h_indexbuild -- build the index from a fully loaded spool.
 *
 * The spool must already have been filled by successive calls to _h_spool.
 * We run the sort, then read the tuples back in sorted order and insert
 * each one into the spool's index, reporting progress after every tuple.
 */
void
_h_indexbuild(HSpool *hspool, Relation heapRel)
{
    IndexTuple  itup;
    int64       ninserted = 0;
#ifdef USE_ASSERT_CHECKING
    uint32      prev_bucket = 0;
#endif

    tuplesort_performsort(hspool->sortstate);

    for (;;)
    {
        /* fetch the next tuple, reading forward; NULL means we are done */
        itup = tuplesort_getindextuple(hspool->sortstate, true);
        if (itup == NULL)
            break;

        /*
         * Strictly speaking nothing breaks if tuples arrive out of order:
         * the sort exists only to improve locality of access, as a
         * performance optimization.  Even so, an assertion here is a cheap
         * way to exercise tuplesort.c's handling of hash index tuple sorts,
         * so check that bucket numbers never decrease.
         */
#ifdef USE_ASSERT_CHECKING
        {
            uint32      bucket;

            bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                          hspool->max_buckets,
                                          hspool->high_mask,
                                          hspool->low_mask);
            Assert(bucket >= prev_bucket);
            prev_bucket = bucket;
        }
#endif

        /* input is sorted by hashkey, hence 'sorted' = true */
        _hash_doinsert(hspool->index, itup, heapRel, true);

        /* keep the insertion phase interruptible, and report progress */
        CHECK_FOR_INTERRUPTS();

        ninserted++;
        pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
                                     ninserted);
    }
}

pgstat_progress_update_param
void pgstat_progress_update_param(int index, int64 val)
Definition: backend_progress.c:48

values
static Datum values[MAXATTR]
Definition: bootstrap.c:151

int64
int64_t int64
Definition: c.h:499

uint32
uint32_t uint32
Definition: c.h:502

maintenance_work_mem
int maintenance_work_mem
Definition: globals.c:133

hash.h

Assert
Assert(PointerIsAligned(start, uint64))

_hash_doinsert
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel, bool sorted)
Definition: hashinsert.c:38

_h_spool
void _h_spool(HSpool *hspool, ItemPointer self, const Datum *values, const bool *isnull)
Definition: hashsort.c:109

_h_indexbuild
void _h_indexbuild(HSpool *hspool, Relation heapRel)
Definition: hashsort.c:120

_h_spoolinit
HSpool * _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
Definition: hashsort.c:60

_h_spooldestroy
void _h_spooldestroy(HSpool *hspool)
Definition: hashsort.c:99

_hash_get_indextuple_hashkey
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
Definition: hashutil.c:291

_hash_hashkey2bucket
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:125

pfree
void pfree(void *pointer)
Definition: mcxt.c:1528

palloc0
void * palloc0(Size size)
Definition: mcxt.c:1351

miscadmin.h

CHECK_FOR_INTERRUPTS
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122

pg_bitutils.h

pg_nextpower2_32
static uint32 pg_nextpower2_32(uint32 num)
Definition: pg_bitutils.h:189

pgstat.h

postgres.h

Datum
uintptr_t Datum
Definition: postgres.h:69

progress.h

PROGRESS_CREATEIDX_TUPLES_DONE
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition: progress.h:90

HSpool
Definition: hashsort.c:40

HSpool::low_mask
uint32 low_mask
Definition: hashsort.c:51

HSpool::sortstate
Tuplesortstate * sortstate
Definition: hashsort.c:41

HSpool::high_mask
uint32 high_mask
Definition: hashsort.c:50

HSpool::max_buckets
uint32 max_buckets
Definition: hashsort.c:52

HSpool::index
Relation index
Definition: hashsort.c:42

IndexTupleData
Definition: itup.h:36

ItemPointerData
Definition: itemptr.h:37

RelationData
Definition: rel.h:56

Tuplesortstate
Definition: tuplesort.c:186

index
Definition: type.h:96

tuplesort_performsort
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1363

tuplesort_end
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:951

tuplesort.h

TUPLESORT_NONE
#define TUPLESORT_NONE
Definition: tuplesort.h:94

tuplesort_getindextuple
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Definition: tuplesortvariants.c:1045

tuplesort_putindextuplevalues
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, const Datum *values, const bool *isnull)
Definition: tuplesortvariants.c:817

tuplesort_begin_index_hash
Tuplesortstate * tuplesort_begin_index_hash(Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesortvariants.c:439