PostgreSQL Source Code git master
nbtsort.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/sortsupport.h"
#include "utils/tuplesort.h"
Include dependency graph for nbtsort.c:

Go to the source code of this file.

Data Structures

struct  BTSpool
 
struct  BTShared
 
struct  BTLeader
 
struct  BTBuildState
 
struct  BTPageState
 
struct  BTWriteState
 

Macros

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)
 
#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)
 
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
 
#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)
 
#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)
 
#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)
 
#define ParallelTableScanFromBTShared(shared)    (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 

Typedefs

typedef struct BTSpool BTSpool
 
typedef struct BTShared BTShared
 
typedef struct BTLeader BTLeader
 
typedef struct BTBuildState BTBuildState
 
typedef struct BTPageState BTPageState
 
typedef struct BTWriteState BTWriteState
 

Functions

static double _bt_spools_heapscan (Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
 
static void _bt_spooldestroy (BTSpool *btspool)
 
static void _bt_spool (BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
 
static void _bt_leafbuild (BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_build_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 
static BulkWriteBuffer _bt_blnewpage (BTWriteState *wstate, uint32 level)
 
static BTPageState * _bt_pagestate (BTWriteState *wstate, uint32 level)
 
static void _bt_slideleft (Page rightmostpage)
 
static void _bt_sortaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_buildadd (BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
 
static void _bt_sort_dedup_finish_pending (BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
 
static void _bt_uppershutdown (BTWriteState *wstate, BTPageState *state)
 
static void _bt_load (BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_begin_parallel (BTBuildState *buildstate, bool isconcurrent, int request)
 
static void _bt_end_parallel (BTLeader *btleader)
 
static Size _bt_parallel_estimate_shared (Relation heap, Snapshot snapshot)
 
static double _bt_parallel_heapscan (BTBuildState *buildstate, bool *brokenhotchain)
 
static void _bt_leader_participate_as_worker (BTBuildState *buildstate)
 
static void _bt_parallel_scan_and_sort (BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
 
IndexBuildResult * btbuild (Relation heap, Relation index, IndexInfo *indexInfo)
 
static void _bt_blwritepage (BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ PARALLEL_KEY_BTREE_SHARED

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)

Definition at line 61 of file nbtsort.c.

◆ PARALLEL_KEY_BUFFER_USAGE

#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)

Definition at line 66 of file nbtsort.c.

◆ PARALLEL_KEY_QUERY_TEXT

#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)

Definition at line 64 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT

#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)

Definition at line 62 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT_SPOOL2

#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)

Definition at line 63 of file nbtsort.c.

◆ PARALLEL_KEY_WAL_USAGE

#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)

Definition at line 65 of file nbtsort.c.

◆ ParallelTableScanFromBTShared

#define ParallelTableScanFromBTShared (   shared)     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))

Definition at line 161 of file nbtsort.c.

Typedef Documentation

◆ BTBuildState

typedef struct BTBuildState BTBuildState

◆ BTLeader

typedef struct BTLeader BTLeader

◆ BTPageState

typedef struct BTPageState BTPageState

◆ BTShared

typedef struct BTShared BTShared

◆ BTSpool

typedef struct BTSpool BTSpool

◆ BTWriteState

typedef struct BTWriteState BTWriteState

Function Documentation

◆ _bt_begin_parallel()

static void _bt_begin_parallel ( BTBuildState *  buildstate,
bool  isconcurrent,
int  request 
)
static

Definition at line 1397 of file nbtsort.c.

1398{
1399 ParallelContext *pcxt;
1400 int scantuplesortstates;
1401 Snapshot snapshot;
1402 Size estbtshared;
1403 Size estsort;
1404 BTShared *btshared;
1405 Sharedsort *sharedsort;
1406 Sharedsort *sharedsort2;
1407 BTSpool *btspool = buildstate->spool;
1408 BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1409 WalUsage *walusage;
1410 BufferUsage *bufferusage;
1411 bool leaderparticipates = true;
1412 int querylen;
1413
1414#ifdef DISABLE_LEADER_PARTICIPATION
1415 leaderparticipates = false;
1416#endif
1417
1418 /*
1419 * Enter parallel mode, and create context for parallel build of btree
1420 * index
1421 */
1422 EnterParallelMode();
1423 Assert(request > 0);
1424 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1425 request);
1426
1427 scantuplesortstates = leaderparticipates ? request + 1 : request;
1428
1429 /*
1430 * Prepare for scan of the base relation. In a normal index build, we use
1431 * SnapshotAny because we must retrieve all tuples and do our own time
1432 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1433 * concurrent build, we take a regular MVCC snapshot and index whatever's
1434 * live according to that.
1435 */
1436 if (!isconcurrent)
1437 snapshot = SnapshotAny;
1438 else
1439 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1440
1441 /*
1442 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1443 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1444 */
1445 estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1446 shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1447 estsort = tuplesort_estimate_shared(scantuplesortstates);
1448 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1449
1450 /*
1451 * Unique case requires a second spool, and so we may have to account for
1452 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1453 */
1454 if (!btspool->isunique)
1455 shm_toc_estimate_keys(&pcxt->estimator, 2);
1456 else
1457 {
1458 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1459 shm_toc_estimate_keys(&pcxt->estimator, 3);
1460 }
1461
1462 /*
1463 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1464 * and PARALLEL_KEY_BUFFER_USAGE.
1465 *
1466 * If there are no extensions loaded that care, we could skip this. We
1467 * have no way of knowing whether anyone's looking at pgWalUsage or
1468 * pgBufferUsage, so do it unconditionally.
1469 */
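1470 shm_toc_estimate_chunk(&pcxt->estimator,
1471 mul_size(sizeof(WalUsage), pcxt->nworkers));
1472 shm_toc_estimate_keys(&pcxt->estimator, 1);
1473 shm_toc_estimate_chunk(&pcxt->estimator,
1474 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1475 shm_toc_estimate_keys(&pcxt->estimator, 1);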
1476
1477 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1478 if (debug_query_string)
1479 {
1480 querylen = strlen(debug_query_string);
1481 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1482 shm_toc_estimate_keys(&pcxt->estimator, 1);
1483 }
1484 else
1485 querylen = 0; /* keep compiler quiet */
1486
1487 /* Everyone's had a chance to ask for space, so now create the DSM */
1488 InitializeParallelDSM(pcxt);
1489
1490 /* If no DSM segment was available, back out (do serial build) */
1491 if (pcxt->seg == NULL)
1492 {
1493 if (IsMVCCSnapshot(snapshot))
1494 UnregisterSnapshot(snapshot);
1495 DestroyParallelContext(pcxt);
1496 ExitParallelMode();
1497 return;
1498 }
1499
1500 /* Store shared build state, for which we reserved space */
1501 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1502 /* Initialize immutable state */
1503 btshared->heaprelid = RelationGetRelid(btspool->heap);
1504 btshared->indexrelid = RelationGetRelid(btspool->index);
1505 btshared->isunique = btspool->isunique;
1506 btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1507 btshared->isconcurrent = isconcurrent;
1508 btshared->scantuplesortstates = scantuplesortstates;
1509 btshared->queryid = pgstat_get_my_query_id();
1510 ConditionVariableInit(&btshared->workersdonecv);
1511 SpinLockInit(&btshared->mutex);
1512 /* Initialize mutable state */
1513 btshared->nparticipantsdone = 0;
1514 btshared->reltuples = 0.0;
1515 btshared->havedead = false;
1516 btshared->indtuples = 0.0;
1517 btshared->brokenhotchain = false;
1518 table_parallelscan_initialize(btspool->heap,
1519 ParallelTableScanFromBTShared(btshared),
1520 snapshot);
1521
1522 /*
1523 * Store shared tuplesort-private state, for which we reserved space.
1524 * Then, initialize opaque state using tuplesort routine.
1525 */
1526 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1527 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1528 pcxt->seg);
1529
1530 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1531 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1532
1533 /* Unique case requires a second spool, and associated shared state */
1534 if (!btspool->isunique)
1535 sharedsort2 = NULL;
1536 else
1537 {
1538 /*
1539 * Store additional shared tuplesort-private state, for which we
1540 * reserved space. Then, initialize opaque state using tuplesort
1541 * routine.
1542 */
1543 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1544 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1545 pcxt->seg);
1546
1547 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1548 }
1549
1550 /* Store query string for workers */
1551 if (debug_query_string)
1552 {
1553 char *sharedquery;
1554
1555 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1556 memcpy(sharedquery, debug_query_string, querylen + 1);
1557 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1558 }
1559
1560 /*
1561 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1562 * initialize.
1563 */
1564 walusage = shm_toc_allocate(pcxt->toc,
1565 mul_size(sizeof(WalUsage), pcxt->nworkers));
1566 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1567 bufferusage = shm_toc_allocate(pcxt->toc,
1568 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1569 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1570
1571 /* Launch workers, saving status for leader/caller */
1572 LaunchParallelWorkers(pcxt);
1573 btleader->pcxt = pcxt;
1574 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1575 if (leaderparticipates)
1576 btleader->nparticipanttuplesorts++;
1577 btleader->btshared = btshared;
1578 btleader->sharedsort = sharedsort;
1579 btleader->sharedsort2 = sharedsort2;
1580 btleader->snapshot = snapshot;
1581 btleader->walusage = walusage;
1582 btleader->bufferusage = bufferusage;
1583
1584 /* If no workers were successfully launched, back out (do serial build) */
1585 if (pcxt->nworkers_launched == 0)
1586 {
1587 _bt_end_parallel(btleader);
1588 return;
1589 }
1590
1591 /* Save leader state now that it's clear build will be parallel */
1592 buildstate->btleader = btleader;
1593
1594 /* Join heap scan ourselves */
1595 if (leaderparticipates)
1596 _bt_leader_participate_as_worker(buildstate);
1597
1598 /*
1599 * Caller needs to wait for all launched workers when we return. Make
1600 * sure that the failure-to-start case will not hang forever.
1601 */
1602 WaitForParallelWorkersToAttach(pcxt);
1603}
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:211
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:573
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:950
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:173
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:693
uint64 pgstat_get_my_query_id(void)
size_t Size
Definition: c.h:576
void ConditionVariableInit(ConditionVariable *cv)
Assert(PointerIsAligned(start, uint64))
void * palloc0(Size size)
Definition: mcxt.c:1347
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:66
#define ParallelTableScanFromBTShared(shared)
Definition: nbtsort.c:161
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1609
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:63
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: nbtsort.c:1635
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:61
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:62
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:64
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:65
static void _bt_leader_participate_as_worker(BTBuildState *buildstate)
Definition: nbtsort.c:1689
const char * debug_query_string
Definition: postgres.c:88
#define RelationGetRelid(relation)
Definition: rel.h:512
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size mul_size(Size s1, Size s2)
Definition: shmem.c:505
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:271
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:853
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:811
#define SnapshotAny
Definition: snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55
#define SpinLockInit(lock)
Definition: spin.h:57
BTSpool * spool
Definition: nbtsort.c:210
BTLeader * btleader
Definition: nbtsort.c:224
ParallelContext * pcxt
Definition: nbtsort.c:170
BTShared * btshared
Definition: nbtsort.c:190
Sharedsort * sharedsort
Definition: nbtsort.c:191
Sharedsort * sharedsort2
Definition: nbtsort.c:192
int nparticipanttuplesorts
Definition: nbtsort.c:178
BufferUsage * bufferusage
Definition: nbtsort.c:195
Snapshot snapshot
Definition: nbtsort.c:193
WalUsage * walusage
Definition: nbtsort.c:194
slock_t mutex
Definition: nbtsort.c:124
bool isconcurrent
Definition: nbtsort.c:104
double indtuples
Definition: nbtsort.c:145
double reltuples
Definition: nbtsort.c:143
Oid heaprelid
Definition: nbtsort.c:100
bool brokenhotchain
Definition: nbtsort.c:146
bool isunique
Definition: nbtsort.c:102
int nparticipantsdone
Definition: nbtsort.c:142
ConditionVariable workersdonecv
Definition: nbtsort.c:116
int scantuplesortstates
Definition: nbtsort.c:105
uint64 queryid
Definition: nbtsort.c:108
Oid indexrelid
Definition: nbtsort.c:101
bool havedead
Definition: nbtsort.c:144
bool nulls_not_distinct
Definition: nbtsort.c:103
bool isunique
Definition: nbtsort.c:84
bool nulls_not_distinct
Definition: nbtsort.c:85
Relation heap
Definition: nbtsort.c:82
Relation index
Definition: nbtsort.c:83
dsm_segment * seg
Definition: parallel.h:42
shm_toc_estimator estimator
Definition: parallel.h:41
shm_toc * toc
Definition: parallel.h:44
int nworkers_launched
Definition: parallel.h:37
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition: tableam.c:146
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2938
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2917
void ExitParallelMode(void)
Definition: xact.c:1064
void EnterParallelMode(void)
Definition: xact.c:1051

References _bt_end_parallel(), _bt_leader_participate_as_worker(), _bt_parallel_estimate_shared(), Assert(), BTShared::brokenhotchain, BTBuildState::btleader, BTLeader::btshared, BTLeader::bufferusage, ConditionVariableInit(), CreateParallelContext(), debug_query_string, DestroyParallelContext(), EnterParallelMode(), ParallelContext::estimator, ExitParallelMode(), GetTransactionSnapshot(), BTShared::havedead, BTSpool::heap, BTShared::heaprelid, BTSpool::index, BTShared::indexrelid, BTShared::indtuples, InitializeParallelDSM(), BTShared::isconcurrent, IsMVCCSnapshot, BTSpool::isunique, BTShared::isunique, LaunchParallelWorkers(), mul_size(), BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, ParallelContext::nworkers, ParallelContext::nworkers_launched, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelTableScanFromBTShared, BTLeader::pcxt, pgstat_get_my_query_id(), BTShared::queryid, RegisterSnapshot(), RelationGetRelid, BTShared::reltuples, BTShared::scantuplesortstates, ParallelContext::seg, BTLeader::sharedsort, BTLeader::sharedsort2, shm_toc_allocate(), shm_toc_estimate_chunk, shm_toc_estimate_keys, shm_toc_insert(), BTLeader::snapshot, SnapshotAny, SpinLockInit, BTBuildState::spool, table_parallelscan_initialize(), ParallelContext::toc, tuplesort_estimate_shared(), tuplesort_initialize_shared(), UnregisterSnapshot(), WaitForParallelWorkersToAttach(), BTLeader::walusage, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

◆ _bt_blnewpage()

static BulkWriteBuffer _bt_blnewpage ( BTWriteState *  wstate,
uint32  level 
)
static

Definition at line 608 of file nbtsort.c.

609{
610 BulkWriteBuffer buf;
611 Page page;
612 BTPageOpaque opaque;
613
614 buf = smgr_bulk_get_buf(wstate->bulkstate);
615 page = (Page) buf;
616
617 /* Zero the page and set up standard page header info */
618 _bt_pageinit(page, BLCKSZ);
619
620 /* Initialize BT opaque state */
621 opaque = BTPageGetOpaque(page);
622 opaque->btpo_prev = opaque->btpo_next = P_NONE;
623 opaque->btpo_level = level;
624 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
625 opaque->btpo_cycleid = 0;
626
627 /* Make the P_HIKEY line pointer appear allocated */
628 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
629
630 return buf;
631}
PageHeaderData * PageHeader
Definition: bufpage.h:174
PageData * Page
Definition: bufpage.h:82
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
struct ItemIdData ItemIdData
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
#define BTP_LEAF
Definition: nbtree.h:76
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_NONE
Definition: nbtree.h:212
static char * buf
Definition: pg_test_fsync.c:72
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
BulkWriteState * bulkstate
Definition: nbtsort.c:250

References _bt_pageinit(), BTP_LEAF, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, BTWriteState::bulkstate, P_NONE, and smgr_bulk_get_buf().

Referenced by _bt_buildadd(), and _bt_pagestate().

◆ _bt_blwritepage()

static void _bt_blwritepage ( BTWriteState *  wstate,
BulkWriteBuffer  buf,
BlockNumber  blkno 
)
static

Definition at line 637 of file nbtsort.c.

638{
639 smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
640 /* smgr_bulk_write took ownership of 'buf' */
641}
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323

References buf, BTWriteState::bulkstate, and smgr_bulk_write().

Referenced by _bt_buildadd(), and _bt_uppershutdown().

◆ _bt_build_callback()

static void _bt_build_callback ( Relation  index,
ItemPointer  tid,
Datum *  values,
bool *  isnull,
bool  tupleIsAlive,
void *  state 
)
static

Definition at line 579 of file nbtsort.c.

585{
586 BTBuildState *buildstate = (BTBuildState *) state;
587
588 /*
589 * insert the index tuple into the appropriate spool file for subsequent
590 * processing
591 */
592 if (tupleIsAlive || buildstate->spool2 == NULL)
593 _bt_spool(buildstate->spool, tid, values, isnull);
594 else
595 {
596 /* dead tuples are put into spool2 */
597 buildstate->havedead = true;
598 _bt_spool(buildstate->spool2, tid, values, isnull);
599 }
600
601 buildstate->indtuples += 1;
602}
static Datum values[MAXATTR]
Definition: bootstrap.c:151
static void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
Definition: nbtsort.c:527
bool havedead
Definition: nbtsort.c:208
BTSpool * spool2
Definition: nbtsort.c:216
double indtuples
Definition: nbtsort.c:217
Definition: regguts.h:323

References _bt_spool(), BTBuildState::havedead, BTBuildState::indtuples, BTBuildState::spool, BTBuildState::spool2, and values.

Referenced by _bt_parallel_scan_and_sort(), and _bt_spools_heapscan().

◆ _bt_buildadd()

static void _bt_buildadd ( BTWriteState *  wstate,
BTPageState *  state,
IndexTuple  itup,
Size  truncextra 
)
static

Definition at line 786 of file nbtsort.c.

788{
789 BulkWriteBuffer nbuf;
790 Page npage;
791 BlockNumber nblkno;
792 OffsetNumber last_off;
793 Size last_truncextra;
794 Size pgspc;
795 Size itupsz;
796 bool isleaf;
797
798 /*
799 * This is a handy place to check for cancel interrupts during the btree
800 * load phase of index creation.
801 */
802 CHECK_FOR_INTERRUPTS();
803
804 nbuf = state->btps_buf;
805 npage = (Page) nbuf;
806 nblkno = state->btps_blkno;
807 last_off = state->btps_lastoff;
808 last_truncextra = state->btps_lastextra;
809 state->btps_lastextra = truncextra;
810
811 pgspc = PageGetFreeSpace(npage);
812 itupsz = IndexTupleSize(itup);
813 itupsz = MAXALIGN(itupsz);
814 /* Leaf case has slightly different rules due to suffix truncation */
815 isleaf = (state->btps_level == 0);
816
817 /*
818 * Check whether the new item can fit on a btree page on current level at
819 * all.
820 *
821 * Every newly built index will treat heap TID as part of the keyspace,
822 * which imposes the requirement that new high keys must occasionally have
823 * a heap TID appended within _bt_truncate(). That may leave a new pivot
824 * tuple one or two MAXALIGN() quantums larger than the original
825 * firstright tuple it's derived from. v4 deals with the problem by
826 * decreasing the limit on the size of tuples inserted on the leaf level
827 * by the same small amount. Enforce the new v4+ limit on the leaf level,
828 * and the old limit on internal levels, since pivot tuples may need to
829 * make use of the reserved space. This should never fail on internal
830 * pages.
831 */
832 if (unlikely(itupsz > BTMaxItemSize))
833 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
834 itup);
835
836 /*
837 * Check to see if current page will fit new item, with space left over to
838 * append a heap TID during suffix truncation when page is a leaf page.
839 *
840 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
841 * high key with heap TID when finishing off a leaf page, since we rely on
842 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
843 * internal pages we can always fit 3 pivot tuples with larger internal
844 * page tuple limit (includes page high key).
845 *
846 * Most of the time, a page is only "full" in the sense that the soft
847 * fillfactor-wise limit has been exceeded. However, we must always leave
848 * at least two items plus a high key on each page before starting a new
849 * page. Disregard fillfactor and insert on "full" current page if we
850 * don't have the minimum number of items yet. (Note that we deliberately
851 * assume that suffix truncation neither enlarges nor shrinks new high key
852 * when applying soft limit, except when last tuple has a posting list.)
853 */
854 Assert(last_truncextra == 0 || isleaf);
855 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
856 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
857 {
858 /*
859 * Finish off the page and write it out.
860 */
861 BulkWriteBuffer obuf = nbuf;
862 Page opage = npage;
863 BlockNumber oblkno = nblkno;
864 ItemId ii;
865 ItemId hii;
866 IndexTuple oitup;
867
868 /* Create new page of same level */
869 nbuf = _bt_blnewpage(wstate, state->btps_level);
870 npage = (Page) nbuf;
871
872 /* and assign it a page position */
873 nblkno = wstate->btws_pages_alloced++;
874
875 /*
876 * We copy the last item on the page into the new page, and then
877 * rearrange the old page so that the 'last item' becomes its high key
878 * rather than a true data item. There had better be at least two
879 * items on the page already, else the page would be empty of useful
880 * data.
881 */
882 Assert(last_off > P_FIRSTKEY);
883 ii = PageGetItemId(opage, last_off);
884 oitup = (IndexTuple) PageGetItem(opage, ii);
885 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
886 !isleaf);
887
888 /*
889 * Move 'last' into the high key position on opage. _bt_blnewpage()
890 * allocated empty space for a line pointer when opage was first
891 * created, so this is a matter of rearranging already-allocated space
892 * on page, and initializing high key line pointer. (Actually, leaf
893 * pages must also swap oitup with a truncated version of oitup, which
894 * is sometimes larger than oitup, though never by more than the space
895 * needed to append a heap TID.)
896 */
897 hii = PageGetItemId(opage, P_HIKEY);
898 *hii = *ii;
899 ItemIdSetUnused(ii); /* redundant */
900 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
901
902 if (isleaf)
903 {
904 IndexTuple lastleft;
905 IndexTuple truncated;
906
907 /*
908 * Truncate away any unneeded attributes from high key on leaf
909 * level. This is only done at the leaf level because downlinks
910 * in internal pages are either negative infinity items, or get
911 * their contents from copying from one level down. See also:
912 * _bt_split().
913 *
914 * We don't try to bias our choice of split point to make it more
915 * likely that _bt_truncate() can truncate away more attributes,
916 * whereas the split point used within _bt_split() is chosen much
917 * more delicately. Even still, the lastleft and firstright
918 * tuples passed to _bt_truncate() here are at least not fully
919 * equal to each other when deduplication is used, unless there is
920 * a large group of duplicates (also, unique index builds usually
921 * have few or no spool2 duplicates). When the split point is
922 * between two unequal tuples, _bt_truncate() will avoid including
923 * a heap TID in the new high key, which is the most important
924 * benefit of suffix truncation.
925 *
926 * Overwrite the old item with new truncated high key directly.
927 * oitup is already located at the physical beginning of tuple
928 * space, so this should directly reuse the existing tuple space.
929 */
930 ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
931 lastleft = (IndexTuple) PageGetItem(opage, ii);
932
933 Assert(IndexTupleSize(oitup) > last_truncextra);
934 truncated = _bt_truncate(wstate->index, lastleft, oitup,
935 wstate->inskey);
936 if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
937 IndexTupleSize(truncated)))
938 elog(ERROR, "failed to add high key to the index page");
939 pfree(truncated);
940
941 /* oitup should continue to point to the page's high key */
942 hii = PageGetItemId(opage, P_HIKEY);
943 oitup = (IndexTuple) PageGetItem(opage, hii);
944 }
945
946 /*
947 * Link the old page into its parent, using its low key. If we don't
948 * have a parent, we have to create one; this adds a new btree level.
949 */
950 if (state->btps_next == NULL)
951 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
952
953 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
954 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
955 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
956 P_LEFTMOST(BTPageGetOpaque(opage)));
957 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
958 !P_LEFTMOST(BTPageGetOpaque(opage)));
959 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
960 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
961 pfree(state->btps_lowkey);
962
963 /*
964 * Save a copy of the high key from the old page. It is also the low
965 * key for the new page.
966 */
967 state->btps_lowkey = CopyIndexTuple(oitup);
968
969 /*
970 * Set the sibling links for both pages.
971 */
972 {
973 BTPageOpaque oopaque = BTPageGetOpaque(opage);
974 BTPageOpaque nopaque = BTPageGetOpaque(npage);
975
976 oopaque->btpo_next = nblkno;
977 nopaque->btpo_prev = oblkno;
978 nopaque->btpo_next = P_NONE; /* redundant */
979 }
980
981 /*
982 * Write out the old page. _bt_blwritepage takes ownership of the
983 * 'opage' buffer.
984 */
985 _bt_blwritepage(wstate, obuf, oblkno);
986
987 /*
988 * Reset last_off to point to new page
989 */
990 last_off = P_FIRSTKEY;
991 }
992
993 /*
994 * By here, either original page is still the current page, or a new page
995 * was created that became the current page. Either way, the current page
996 * definitely has space for new item.
997 *
998 * If the new item is the first for its page, it must also be the first
999 * item on its entire level. On later same-level pages, a low key for a
1000 * page will be copied from the prior page in the code above. Generate a
1001 * minus infinity low key here instead.
1002 */
1003 if (last_off == P_HIKEY)
1004 {
1005 Assert(state->btps_lowkey == NULL);
1006 state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1007 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1008 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1009 }
1010
1011 /*
1012 * Add the new item into the current page.
1013 */
1014 last_off = OffsetNumberNext(last_off);
1015 _bt_sortaddtup(npage, itupsz, itup, last_off,
1016 !isleaf && last_off == P_FIRSTKEY);
1017
1018 state->btps_buf = nbuf;
1019 state->btps_blkno = nblkno;
1020 state->btps_lastoff = last_off;
1021}
uint32 BlockNumber
Definition: block.h:31
Size PageGetFreeSpace(const PageData *page)
Definition: bufpage.c:896
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1394
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
#define MAXALIGN(LEN)
Definition: c.h:782
#define unlikely(x)
Definition: c.h:347
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
Pointer Item
Definition: item.h:17
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
IndexTupleData * IndexTuple
Definition: itup.h:53
struct IndexTupleData IndexTupleData
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
void pfree(void *pointer)
Definition: mcxt.c:1524
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define P_HIKEY
Definition: nbtree.h:367
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:562
#define P_FIRSTKEY
Definition: nbtree.h:368
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:595
#define BTMaxItemSize
Definition: nbtree.h:164
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
static void _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
Definition: nbtsort.c:637
static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtsort.c:716
static BTPageState * _bt_pagestate(BTWriteState *wstate, uint32 level)
Definition: nbtsort.c:648
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
Definition: nbtsort.c:786
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level)
Definition: nbtsort.c:608
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:3239
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2813
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:531
Relation heap
Definition: nbtsort.c:248
Relation index
Definition: nbtsort.c:249
BlockNumber btws_pages_alloced
Definition: nbtsort.c:252
BTScanInsert inskey
Definition: nbtsort.c:251

References _bt_blnewpage(), _bt_blwritepage(), _bt_buildadd(), _bt_check_third_page(), _bt_pagestate(), _bt_sortaddtup(), _bt_truncate(), Assert(), BTMaxItemSize, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), BTWriteState::btws_pages_alloced, CHECK_FOR_INTERRUPTS, CopyIndexTuple(), elog, ERROR, BTWriteState::heap, BTWriteState::index, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), BTWriteState::inskey, ItemIdGetLength, ItemIdSetUnused, MAXALIGN, OffsetNumberNext, OffsetNumberPrev, P_FIRSTKEY, P_HIKEY, P_LEFTMOST, P_NONE, PageGetFreeSpace(), PageGetItem(), PageGetItemId(), PageIndexTupleOverwrite(), palloc0(), pfree(), and unlikely.

Referenced by _bt_buildadd(), _bt_load(), _bt_sort_dedup_finish_pending(), and _bt_uppershutdown().

◆ _bt_end_parallel()

static void _bt_end_parallel ( BTLeader btleader)
static

Definition at line 1609 of file nbtsort.c.

1610{
1611 int i;
1612
1613 /* Shutdown worker processes */
1614 WaitForParallelWorkersToFinish(btleader->pcxt);
1615
1616 /*
1617 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1618 * or we might get incomplete data.)
1619 */
1620 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1621 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1622
1623 /* Free last reference to MVCC snapshot, if one was used */
1624 if (IsMVCCSnapshot(btleader->snapshot))
1625 UnregisterSnapshot(btleader->snapshot);
1626 DestroyParallelContext(btleader->pcxt);
1627 ExitParallelMode();
1628}
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:796
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
int i
Definition: isn.c:74

References BTLeader::bufferusage, DestroyParallelContext(), ExitParallelMode(), i, InstrAccumParallelQuery(), IsMVCCSnapshot, ParallelContext::nworkers_launched, BTLeader::pcxt, BTLeader::snapshot, UnregisterSnapshot(), WaitForParallelWorkersToFinish(), and BTLeader::walusage.

Referenced by _bt_begin_parallel(), and btbuild().

◆ _bt_leader_participate_as_worker()

static void _bt_leader_participate_as_worker ( BTBuildState buildstate)
static

Definition at line 1689 of file nbtsort.c.

1690{
1691 BTLeader *btleader = buildstate->btleader;
1692 BTSpool *leaderworker;
1693 BTSpool *leaderworker2;
1694 int sortmem;
1695
1696 /* Allocate memory and initialize private spool */
1697 leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1698 leaderworker->heap = buildstate->spool->heap;
1699 leaderworker->index = buildstate->spool->index;
1700 leaderworker->isunique = buildstate->spool->isunique;
1701 leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1702
1703 /* Initialize second spool, if required */
1704 if (!btleader->btshared->isunique)
1705 leaderworker2 = NULL;
1706 else
1707 {
1708 /* Allocate memory for worker's own private secondary spool */
1709 leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1710
1711 /* Initialize worker's own secondary spool */
1712 leaderworker2->heap = leaderworker->heap;
1713 leaderworker2->index = leaderworker->index;
1714 leaderworker2->isunique = false;
1715 }
1716
1717 /*
1718 * Might as well use reliable figure when doling out maintenance_work_mem
1719 * (when requested number of workers were not launched, this will be
1720 * somewhat higher than it is for other workers).
1721 */
1722 sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1723
1724 /* Perform work common to all participants */
1725 _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1726 btleader->sharedsort, btleader->sharedsort2,
1727 sortmem, true);
1728
1729#ifdef BTREE_BUILD_STATS
1730 if (log_btree_build_stats)
1731 {
1732 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1733 ResetUsage();
1734 }
1735#endif /* BTREE_BUILD_STATS */
1736}
int maintenance_work_mem
Definition: globals.c:132
bool log_btree_build_stats
Definition: guc_tables.c:512
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1867
void ShowUsage(const char *title)
Definition: postgres.c:5026
void ResetUsage(void)
Definition: postgres.c:5019

References _bt_parallel_scan_and_sort(), BTBuildState::btleader, BTLeader::btshared, BTSpool::heap, BTSpool::index, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, BTLeader::nparticipanttuplesorts, BTSpool::nulls_not_distinct, palloc0(), ResetUsage(), BTLeader::sharedsort, BTLeader::sharedsort2, ShowUsage(), and BTBuildState::spool.

Referenced by _bt_begin_parallel().

◆ _bt_leafbuild()

static void _bt_leafbuild ( BTSpool btspool,
BTSpool btspool2 
)
static

Definition at line 538 of file nbtsort.c.

539{
540 BTWriteState wstate;
541
542#ifdef BTREE_BUILD_STATS
543 if (log_btree_build_stats)
544 {
545 ShowUsage("BTREE BUILD (Spool) STATISTICS");
546 ResetUsage();
547 }
548#endif /* BTREE_BUILD_STATS */
549
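550 /* Execute the sort */
551 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
552 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
553 tuplesort_performsort(btspool->sortstate);
554 if (btspool2)
555 {
556 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
557 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
558 tuplesort_performsort(btspool2->sortstate);
559 }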
560
561 wstate.heap = btspool->heap;
562 wstate.index = btspool->index;
563 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564 /* _bt_mkscankey() won't set allequalimage without metapage */
565 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566
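567 /* reserve the metapage */
568 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
569
570 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
571 PROGRESS_BTREE_PHASE_LEAF_LOAD);
572 _bt_load(&wstate, btspool, btspool2);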
573}
void pgstat_progress_update_param(int index, int64 val)
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1154
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1155
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1153
#define BTREE_METAPAGE
Definition: nbtree.h:148
static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:1137
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:80
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:3296
#define PROGRESS_CREATEIDX_SUBPHASE
Definition: progress.h:88
bool allequalimage
Definition: nbtree.h:792
Tuplesortstate * sortstate
Definition: nbtsort.c:81
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1363

References _bt_allequalimage(), _bt_load(), _bt_mkscankey(), BTScanInsertData::allequalimage, BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, BTWriteState::inskey, log_btree_build_stats, pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, ResetUsage(), ShowUsage(), BTSpool::sortstate, and tuplesort_performsort().

Referenced by btbuild().

◆ _bt_load()

static void _bt_load ( BTWriteState wstate,
BTSpool btspool,
BTSpool btspool2 
)
static

Definition at line 1137 of file nbtsort.c.

1138{
1139 BTPageState *state = NULL;
1140 bool merge = (btspool2 != NULL);
1141 IndexTuple itup,
1142 itup2 = NULL;
1143 bool load1;
1144 TupleDesc tupdes = RelationGetDescr(wstate->index);
1145 int i,
1146 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1147 SortSupport sortKeys;
1148 int64 tuples_done = 0;
1149 bool deduplicate;
1150
1151 wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1152
1153 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1154 BTGetDeduplicateItems(wstate->index);
1155
1156 if (merge)
1157 {
1158 /*
1159 * Another BTSpool for dead tuples exists. Now we have to merge
1160 * btspool and btspool2.
1161 */
1162
1163 /* the preparation of merge */
1164 itup = tuplesort_getindextuple(btspool->sortstate, true);
1165 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1166
1167 /* Prepare SortSupport data for each column */
1168 sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1169
1170 for (i = 0; i < keysz; i++)
1171 {
1172 SortSupport sortKey = sortKeys + i;
1173 ScanKey scanKey = wstate->inskey->scankeys + i;
1174 bool reverse;
1175
1176 sortKey->ssup_cxt = CurrentMemoryContext;
1177 sortKey->ssup_collation = scanKey->sk_collation;
1178 sortKey->ssup_nulls_first =
1179 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1180 sortKey->ssup_attno = scanKey->sk_attno;
1181 /* Abbreviation is not supported here */
1182 sortKey->abbreviate = false;
1183
1184 Assert(sortKey->ssup_attno != 0);
1185
1186 reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1187
1188 PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
1189 }
1190
1191 for (;;)
1192 {
1193 load1 = true; /* load BTSpool next ? */
1194 if (itup2 == NULL)
1195 {
1196 if (itup == NULL)
1197 break;
1198 }
1199 else if (itup != NULL)
1200 {
1201 int32 compare = 0;
1202
1203 for (i = 1; i <= keysz; i++)
1204 {
1205 SortSupport entry;
1206 Datum attrDatum1,
1207 attrDatum2;
1208 bool isNull1,
1209 isNull2;
1210
1211 entry = sortKeys + i - 1;
1212 attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1213 attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1214
1215 compare = ApplySortComparator(attrDatum1, isNull1,
1216 attrDatum2, isNull2,
1217 entry);
1218 if (compare > 0)
1219 {
1220 load1 = false;
1221 break;
1222 }
1223 else if (compare < 0)
1224 break;
1225 }
1226
1227 /*
1228 * If key values are equal, we sort on ItemPointer. This is
1229 * required for btree indexes, since heap TID is treated as an
1230 * implicit last key attribute in order to ensure that all
1231 * keys in the index are physically unique.
1232 */
1233 if (compare == 0)
1234 {
1235 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1236 Assert(compare != 0);
1237 if (compare > 0)
1238 load1 = false;
1239 }
1240 }
1241 else
1242 load1 = false;
1243
1244 /* When we see first tuple, create first index page */
1245 if (state == NULL)
1246 state = _bt_pagestate(wstate, 0);
1247
1248 if (load1)
1249 {
1250 _bt_buildadd(wstate, state, itup, 0);
1251 itup = tuplesort_getindextuple(btspool->sortstate, true);
1252 }
1253 else
1254 {
1255 _bt_buildadd(wstate, state, itup2, 0);
1256 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1257 }
1258
1259 /* Report progress */
1260 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1261 ++tuples_done);
1262 }
1263 pfree(sortKeys);
1264 }
1265 else if (deduplicate)
1266 {
1267 /* merge is unnecessary, deduplicate into posting lists */
1268 BTDedupState dstate;
1269
1270 dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1271 dstate->deduplicate = true; /* unused */
1272 dstate->nmaxitems = 0; /* unused */
1273 dstate->maxpostingsize = 0; /* set later */
1274 /* Metadata about base tuple of current pending posting list */
1275 dstate->base = NULL;
1276 dstate->baseoff = InvalidOffsetNumber; /* unused */
1277 dstate->basetupsize = 0;
1278 /* Metadata about current pending posting list TIDs */
1279 dstate->htids = NULL;
1280 dstate->nhtids = 0;
1281 dstate->nitems = 0;
1282 dstate->phystupsize = 0; /* unused */
1283 dstate->nintervals = 0; /* unused */
1284
1285 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1286 true)) != NULL)
1287 {
1288 /* When we see first tuple, create first index page */
1289 if (state == NULL)
1290 {
1291 state = _bt_pagestate(wstate, 0);
1292
1293 /*
1294 * Limit size of posting list tuples to 1/10 space we want to
1295 * leave behind on the page, plus space for final item's line
1296 * pointer. This is equal to the space that we'd like to
1297 * leave behind on each leaf page when fillfactor is 90,
1298 * allowing us to get close to fillfactor% space utilization
1299 * when there happen to be a great many duplicates. (This
1300 * makes higher leaf fillfactor settings ineffective when
1301 * building indexes that have many duplicates, but packing
1302 * leaf pages full with few very large tuples doesn't seem
1303 * like a useful goal.)
1304 */
1305 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1306 sizeof(ItemIdData);
1307 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1308 dstate->maxpostingsize <= INDEX_SIZE_MASK);
1309 dstate->htids = palloc(dstate->maxpostingsize);
1310
1311 /* start new pending posting list with itup copy */
1312 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1313 InvalidOffsetNumber);
1314 }
1315 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1316 itup) > keysz &&
1317 _bt_dedup_save_htid(dstate, itup))
1318 {
1319 /*
1320 * Tuple is equal to base tuple of pending posting list. Heap
1321 * TID from itup has been saved in state.
1322 */
1323 }
1324 else
1325 {
1326 /*
1327 * Tuple is not equal to pending posting list tuple, or
1328 * _bt_dedup_save_htid() opted to not merge current item into
1329 * pending posting list.
1330 */
1331 _bt_sort_dedup_finish_pending(wstate, state, dstate);
1332 pfree(dstate->base);
1333
1334 /* start new pending posting list with itup copy */
1335 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1336 InvalidOffsetNumber);
1337 }
1338
1339 /* Report progress */
1340 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1341 ++tuples_done);
1342 }
1343
1344 if (state)
1345 {
1346 /*
1347 * Handle the last item (there must be a last item when the
1348 * tuplesort returned one or more tuples)
1349 */
1350 _bt_sort_dedup_finish_pending(wstate, state, dstate);
1351 pfree(dstate->base);
1352 pfree(dstate->htids);
1353 }
1354
1355 pfree(dstate);
1356 }
1357 else
1358 {
1359 /* merging and deduplication are both unnecessary */
1360 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1361 true)) != NULL)
1362 {
1363 /* When we see first tuple, create first index page */
1364 if (state == NULL)
1365 state = _bt_pagestate(wstate, 0);
1366
1367 _bt_buildadd(wstate, state, itup, 0);
1368
1369 /* Report progress */
1370 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1371 ++tuples_done);
1372 }
1373 }
1374
1375 /* Close down final pages and write the metapage */
1376 _bt_uppershutdown(wstate, state);
1377 smgr_bulk_finish(wstate->bulkstate);
1378}
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
#define MAXALIGN_DOWN(LEN)
Definition: c.h:794
int64_t int64
Definition: c.h:499
int32_t int32
Definition: c.h:498
static int compare(const void *arg1, const void *arg2)
Definition: geqo_pool.c:145
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
#define INDEX_SIZE_MASK
Definition: itup.h:65
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1141
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1123
#define SK_BT_DESC
Definition: nbtree.h:1122
BTDedupStateData * BTDedupState
Definition: nbtree.h:898
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
Definition: nbtsort.c:1031
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Definition: nbtsort.c:1065
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:3032
#define InvalidOffsetNumber
Definition: off.h:26
static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
Definition: pairingheap.c:79
uintptr_t Datum
Definition: postgres.h:69
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition: progress.h:90
#define RelationGetDescr(relation)
Definition: rel.h:538
@ MAIN_FORKNUM
Definition: relpath.h:58
void PrepareSortSupportFromIndexRel(Relation indexRel, bool reverse, SortSupport ssup)
Definition: sortsupport.c:161
struct SortSupportData * SortSupport
Definition: sortsupport.h:58
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:200
Size maxpostingsize
Definition: nbtree.h:875
ItemPointer htids
Definition: nbtree.h:883
bool deduplicate
Definition: nbtree.h:873
OffsetNumber baseoff
Definition: nbtree.h:879
Size basetupsize
Definition: nbtree.h:880
IndexTuple base
Definition: nbtree.h:878
Size phystupsize
Definition: nbtree.h:886
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:798
ItemPointerData t_tid
Definition: itup.h:37
int sk_flags
Definition: skey.h:66
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67
AttrNumber ssup_attno
Definition: sortsupport.h:81
bool ssup_nulls_first
Definition: sortsupport.h:75
MemoryContext ssup_cxt
Definition: sortsupport.h:66
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)

References _bt_buildadd(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_keep_natts_fast(), _bt_pagestate(), _bt_sort_dedup_finish_pending(), _bt_uppershutdown(), SortSupportData::abbreviate, BTScanInsertData::allequalimage, ApplySortComparator(), Assert(), BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTGetDeduplicateItems, BTMaxItemSize, BTWriteState::bulkstate, compare(), CopyIndexTuple(), CurrentMemoryContext, BTDedupStateData::deduplicate, BTDedupStateData::htids, i, BTWriteState::index, index_getattr(), INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, BTWriteState::inskey, InvalidOffsetNumber, BTSpool::isunique, ItemPointerCompare(), MAIN_FORKNUM, MAXALIGN_DOWN, BTDedupStateData::maxpostingsize, merge(), BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, palloc(), palloc0(), pfree(), pgstat_progress_update_param(), BTDedupStateData::phystupsize, PrepareSortSupportFromIndexRel(), PROGRESS_CREATEIDX_TUPLES_DONE, RelationGetDescr, BTScanInsertData::scankeys, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, smgr_bulk_finish(), smgr_bulk_start_rel(), BTSpool::sortstate, SortSupportData::ssup_attno, SortSupportData::ssup_collation, SortSupportData::ssup_cxt, SortSupportData::ssup_nulls_first, IndexTupleData::t_tid, and tuplesort_getindextuple().

Referenced by _bt_leafbuild().

◆ _bt_pagestate()

static BTPageState * _bt_pagestate ( BTWriteState wstate,
uint32  level 
)
static

Definition at line 648 of file nbtsort.c.

649{
650 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
651
652 /* create initial page for level */
653 state->btps_buf = _bt_blnewpage(wstate, level);
654
655 /* and assign it a page position */
656 state->btps_blkno = wstate->btws_pages_alloced++;
657
658 state->btps_lowkey = NULL;
659 /* initialize lastoff so first item goes into P_FIRSTKEY */
660 state->btps_lastoff = P_HIKEY;
661 state->btps_lastextra = 0;
662 state->btps_level = level;
663 /* set "full" threshold based on level. See notes at head of file. */
664 if (level > 0)
665 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
666 else
667 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
668
669 /* no parent level, yet */
670 state->btps_next = NULL;
671
672 return state;
673}
#define BTGetTargetPageFreeSpace(relation)
Definition: nbtree.h:1139
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201

References _bt_blnewpage(), BTGetTargetPageFreeSpace, BTREE_NONLEAF_FILLFACTOR, BTWriteState::btws_pages_alloced, BTWriteState::index, P_HIKEY, and palloc0().

Referenced by _bt_buildadd(), and _bt_load().

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)

Definition at line 1742 of file nbtsort.c.

1743{
1744 char *sharedquery;
1745 BTSpool *btspool;
1746 BTSpool *btspool2;
1747 BTShared *btshared;
1748 Sharedsort *sharedsort;
1749 Sharedsort *sharedsort2;
1750 Relation heapRel;
1751 Relation indexRel;
1752 LOCKMODE heapLockmode;
1753 LOCKMODE indexLockmode;
1754 WalUsage *walusage;
1755 BufferUsage *bufferusage;
1756 int sortmem;
1757
1758#ifdef BTREE_BUILD_STATS
1759 if (log_btree_build_stats)
1760 ResetUsage();
1761#endif /* BTREE_BUILD_STATS */
1762
1763 /*
1764 * The only possible status flag that can be set to the parallel worker is
1765 * PROC_IN_SAFE_IC.
1766 */
1767 Assert((MyProc->statusFlags == 0) ||
1768 (MyProc->statusFlags == PROC_IN_SAFE_IC));
1769
1770 /* Set debug_query_string for individual workers first */
1771 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1772 debug_query_string = sharedquery;
1773
1774 /* Report the query string from leader */
1775 pgstat_report_activity(STATE_RUNNING, debug_query_string);
1776
1777 /* Look up nbtree shared state */
1778 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1779
1780 /* Open relations using lock modes known to be obtained by index.c */
1781 if (!btshared->isconcurrent)
1782 {
1783 heapLockmode = ShareLock;
1784 indexLockmode = AccessExclusiveLock;
1785 }
1786 else
1787 {
1788 heapLockmode = ShareUpdateExclusiveLock;
1789 indexLockmode = RowExclusiveLock;
1790 }
1791
1792 /* Track query ID */
1793 pgstat_report_query_id(btshared->queryid, false);
1794
1795 /* Open relations within worker */
1796 heapRel = table_open(btshared->heaprelid, heapLockmode);
1797 indexRel = index_open(btshared->indexrelid, indexLockmode);
1798
1799 /* Initialize worker's own spool */
1800 btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1801 btspool->heap = heapRel;
1802 btspool->index = indexRel;
1803 btspool->isunique = btshared->isunique;
1804 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1805
1806 /* Look up shared state private to tuplesort.c */
1807 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1808 tuplesort_attach_shared(sharedsort, seg);
1809 if (!btshared->isunique)
1810 {
1811 btspool2 = NULL;
1812 sharedsort2 = NULL;
1813 }
1814 else
1815 {
1816 /* Allocate memory for worker's own private secondary spool */
1817 btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1818
1819 /* Initialize worker's own secondary spool */
1820 btspool2->heap = btspool->heap;
1821 btspool2->index = btspool->index;
1822 btspool2->isunique = false;
1823 /* Look up shared state private to tuplesort.c */
1824 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1825 tuplesort_attach_shared(sharedsort2, seg);
1826 }
1827
1828 /* Prepare to track buffer usage during parallel execution */
1829 InstrStartParallelQuery();
1830
1831 /* Perform sorting of spool, and possibly a spool2 */
1832 sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1833 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1834 sharedsort2, sortmem, false);
1835
1836 /* Report WAL/buffer usage during parallel execution */
1837 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1838 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1839 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1840 &walusage[ParallelWorkerNumber]);
1841
1842#ifdef BTREE_BUILD_STATS
1843 if (log_btree_build_stats)
1844 {
1845 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1846 ResetUsage();
1847 }
1848#endif /* BTREE_BUILD_STATS */
1849
1850 index_close(indexRel, indexLockmode);
1851 table_close(heapRel, heapLockmode);
1852}
int ParallelWorkerNumber
Definition: parallel.c:115
void pgstat_report_query_id(uint64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
#define PROC_IN_SAFE_IC
Definition: proc.h:59
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
PGPROC * MyProc
Definition: proc.c:66
uint8 statusFlags
Definition: proc.h:243
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2961

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert(), debug_query_string, BTSpool::heap, BTShared::heaprelid, BTSpool::index, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_estimate_shared()

static Size _bt_parallel_estimate_shared ( Relation  heap,
Snapshot  snapshot 
)
static

Definition at line 1635 of file nbtsort.c.

1636{
1637 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1638 return add_size(BUFFERALIGN(sizeof(BTShared)),
1639 table_parallelscan_estimate(heap, snapshot));
1640}
#define BUFFERALIGN(LEN)
Definition: c.h:784
Size add_size(Size s1, Size s2)
Definition: shmem.c:488
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition: tableam.c:131

References add_size(), BUFFERALIGN, and table_parallelscan_estimate().

Referenced by _bt_begin_parallel().

◆ _bt_parallel_heapscan()

static double _bt_parallel_heapscan ( BTBuildState buildstate,
bool *  brokenhotchain 
)
static

Definition at line 1655 of file nbtsort.c.

1656{
1657 BTShared *btshared = buildstate->btleader->btshared;
1658 int nparticipanttuplesorts;
1659 double reltuples;
1660
1661 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1662 for (;;)
1663 {
1664 SpinLockAcquire(&btshared->mutex);
1665 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1666 {
1667 buildstate->havedead = btshared->havedead;
1668 buildstate->indtuples = btshared->indtuples;
1669 *brokenhotchain = btshared->brokenhotchain;
1670 reltuples = btshared->reltuples;
1671 SpinLockRelease(&btshared->mutex);
1672 break;
1673 }
1674 SpinLockRelease(&btshared->mutex);
1675
1676 ConditionVariableSleep(&btshared->workersdonecv,
1677 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1678 }
1679
1680 ConditionVariableCancelSleep();
1681
1682 return reltuples;
1683}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59

References BTShared::brokenhotchain, BTBuildState::btleader, BTLeader::btshared, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTShared::havedead, BTBuildState::havedead, BTShared::indtuples, BTBuildState::indtuples, BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, BTShared::reltuples, SpinLockAcquire, SpinLockRelease, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

◆ _bt_parallel_scan_and_sort()

static void _bt_parallel_scan_and_sort ( BTSpool btspool,
BTSpool btspool2,
BTShared btshared,
Sharedsort sharedsort,
Sharedsort sharedsort2,
int  sortmem,
bool  progress 
)
static

Definition at line 1867 of file nbtsort.c.

1870{
1871 SortCoordinate coordinate;
1872 BTBuildState buildstate;
1873 TableScanDesc scan;
1874 double reltuples;
1875 IndexInfo *indexInfo;
1876
1877 /* Initialize local tuplesort coordination state */
1878 coordinate = palloc0(sizeof(SortCoordinateData));
1879 coordinate->isWorker = true;
1880 coordinate->nParticipants = -1;
1881 coordinate->sharedsort = sharedsort;
1882
1883 /* Begin "partial" tuplesort */
1884 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1885 btspool->index,
1886 btspool->isunique,
1887 btspool->nulls_not_distinct,
1888 sortmem, coordinate,
1889 TUPLESORT_NONE);
1890
1891 /*
1892 * Just as with serial case, there may be a second spool. If so, a
1893 * second, dedicated spool2 partial tuplesort is required.
1894 */
1895 if (btspool2)
1896 {
1897 SortCoordinate coordinate2;
1898
1899 /*
1900 * We expect that the second one (for dead tuples) won't get very
1901 * full, so we give it only work_mem (unless sortmem is less for
1902 * worker). Worker processes are generally permitted to allocate
1903 * work_mem independently.
1904 */
1905 coordinate2 = palloc0(sizeof(SortCoordinateData));
1906 coordinate2->isWorker = true;
1907 coordinate2->nParticipants = -1;
1908 coordinate2->sharedsort = sharedsort2;
1909 btspool2->sortstate =
1910 tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1911 Min(sortmem, work_mem), coordinate2,
1912 false);
1913 }
1914
1915 /* Fill in buildstate for _bt_build_callback() */
1916 buildstate.isunique = btshared->isunique;
1917 buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1918 buildstate.havedead = false;
1919 buildstate.heap = btspool->heap;
1920 buildstate.spool = btspool;
1921 buildstate.spool2 = btspool2;
1922 buildstate.indtuples = 0;
1923 buildstate.btleader = NULL;
1924
1925 /* Join parallel scan */
1926 indexInfo = BuildIndexInfo(btspool->index);
1927 indexInfo->ii_Concurrent = btshared->isconcurrent;
1928 scan = table_beginscan_parallel(btspool->heap,
1929 ParallelTableScanFromBTShared(btshared));
1930 reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1931 true, progress, _bt_build_callback,
1932 &buildstate, scan);
1933
1934 /* Execute this worker's part of the sort */
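1935 if (progress)
1936 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1937 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1938 tuplesort_performsort(btspool->sortstate);
1939 if (btspool2)
1940 {
1941 if (progress)
1942 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1943 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1944 tuplesort_performsort(btspool2->sortstate);
1945 }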
1946
1947 /*
1948 * Done. Record ambuild statistics, and whether we encountered a broken
1949 * HOT chain.
1950 */
1951 SpinLockAcquire(&btshared->mutex);
1952 btshared->nparticipantsdone++;
1953 btshared->reltuples += reltuples;
1954 if (buildstate.havedead)
1955 btshared->havedead = true;
1956 btshared->indtuples += buildstate.indtuples;
1957 if (indexInfo->ii_BrokenHotChain)
1958 btshared->brokenhotchain = true;
1959 SpinLockRelease(&btshared->mutex);
1960
1961 /* Notify leader */
1962 ConditionVariableSignal(&btshared->workersdonecv);
1963
1964 /* We can end tuplesorts immediately */
1965 tuplesort_end(btspool->sortstate);
1966 if (btspool2)
1967 tuplesort_end(btspool2->sortstate);
1968}
#define Min(x, y)
Definition: c.h:975
void ConditionVariableSignal(ConditionVariable *cv)
int work_mem
Definition: globals.c:130
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2428
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: nbtsort.c:579
static int progress
Definition: pgbench.c:262
bool isunique
Definition: nbtsort.c:206
bool nulls_not_distinct
Definition: nbtsort.c:207
Relation heap
Definition: nbtsort.c:209
bool ii_BrokenHotChain
Definition: execnodes.h:215
bool ii_Concurrent
Definition: execnodes.h:214
Sharedsort * sharedsort
Definition: tuplesort.h:59
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition: tableam.c:166
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1745
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:951
#define TUPLESORT_NONE
Definition: tuplesort.h:94
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)

References _bt_build_callback(), BTShared::brokenhotchain, BTBuildState::btleader, BuildIndexInfo(), ConditionVariableSignal(), BTShared::havedead, BTBuildState::havedead, BTSpool::heap, BTBuildState::heap, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, BTSpool::index, BTShared::indtuples, BTBuildState::indtuples, BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, BTBuildState::isunique, SortCoordinateData::isWorker, Min, BTShared::mutex, SortCoordinateData::nParticipants, BTShared::nparticipantsdone, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, BTBuildState::nulls_not_distinct, palloc0(), ParallelTableScanFromBTShared, pgstat_progress_update_param(), progress, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, BTShared::reltuples, SortCoordinateData::sharedsort, BTSpool::sortstate, SpinLockAcquire, SpinLockRelease, BTBuildState::spool, BTBuildState::spool2, table_beginscan_parallel(), table_index_build_scan(), tuplesort_begin_index_btree(), tuplesort_end(), TUPLESORT_NONE, tuplesort_performsort(), work_mem, and BTShared::workersdonecv.

Referenced by _bt_leader_participate_as_worker(), and _bt_parallel_build_main().

◆ _bt_slideleft()

static void _bt_slideleft ( Page  rightmostpage)
static

Definition at line 685 of file nbtsort.c.

686{
687 OffsetNumber off;
688 OffsetNumber maxoff;
689 ItemId previi;
690
691 maxoff = PageGetMaxOffsetNumber(rightmostpage);
692 Assert(maxoff >= P_FIRSTKEY);
693 previi = PageGetItemId(rightmostpage, P_HIKEY);
694 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
695 {
696 ItemId thisii = PageGetItemId(rightmostpage, off);
697
698 *previi = *thisii;
699 previi = thisii;
700 }
701 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
702}
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372

References Assert(), OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_uppershutdown().

◆ _bt_sort_dedup_finish_pending()

static void _bt_sort_dedup_finish_pending ( BTWriteState * wstate,
BTPageState * state,
BTDedupState  dstate 
)
static

Definition at line 1031 of file nbtsort.c.

1033{
1034 Assert(dstate->nitems > 0);
1035
1036 if (dstate->nitems == 1)
1037 _bt_buildadd(wstate, state, dstate->base, 0);
1038 else
1039 {
1040 IndexTuple postingtuple;
1041 Size truncextra;
1042
1043 /* form a tuple with a posting list */
1044 postingtuple = _bt_form_posting(dstate->base,
1045 dstate->htids,
1046 dstate->nhtids);
1047 /* Calculate posting list overhead */
1048 truncextra = IndexTupleSize(postingtuple) -
1049 BTreeTupleGetPostingOffset(postingtuple);
1050
1051 _bt_buildadd(wstate, state, postingtuple, truncextra);
1052 pfree(postingtuple);
1053 }
1054
1055 dstate->nmaxitems = 0;
1056 dstate->nhtids = 0;
1057 dstate->nitems = 0;
1058 dstate->phystupsize = 0;
1059}
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529

References _bt_buildadd(), _bt_form_posting(), Assert(), BTDedupStateData::base, BTreeTupleGetPostingOffset(), BTDedupStateData::htids, IndexTupleSize(), BTDedupStateData::nhtids, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, pfree(), and BTDedupStateData::phystupsize.

Referenced by _bt_load().

◆ _bt_sortaddtup()

static void _bt_sortaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
static

Definition at line 716 of file nbtsort.c.

721{
722 IndexTupleData trunctuple;
723
724 if (newfirstdataitem)
725 {
726 trunctuple = *itup;
727 trunctuple.t_info = sizeof(IndexTupleData);
728 BTreeTupleSetNAtts(&trunctuple, 0, false);
729 itup = &trunctuple;
730 itemsize = sizeof(IndexTupleData);
731 }
732
733 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
734 false, false) == InvalidOffsetNumber)
735 elog(ERROR, "failed to add item to the index page");
736}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
unsigned short t_info
Definition: itup.h:49

References BTreeTupleSetNAtts(), elog, ERROR, InvalidOffsetNumber, PageAddItem, and IndexTupleData::t_info.

Referenced by _bt_buildadd().

◆ _bt_spool()

static void _bt_spool ( BTSpool * btspool,
ItemPointer  self,
Datum * values,
bool *  isnull 
)
static

Definition at line 527 of file nbtsort.c.

528{
530 self, values, isnull);
531}
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, const Datum *values, const bool *isnull)

References BTSpool::index, BTSpool::sortstate, tuplesort_putindextuplevalues(), and values.

Referenced by _bt_build_callback().

◆ _bt_spooldestroy()

static void _bt_spooldestroy ( BTSpool * btspool)
static

Definition at line 517 of file nbtsort.c.

518{
519 tuplesort_end(btspool->sortstate);
520 pfree(btspool);
521}

References pfree(), BTSpool::sortstate, and tuplesort_end().

Referenced by _bt_spools_heapscan(), and btbuild().

◆ _bt_spools_heapscan()

static double _bt_spools_heapscan ( Relation  heap,
Relation  index,
BTBuildState * buildstate,
IndexInfo * indexInfo 
)
static

Definition at line 365 of file nbtsort.c.

367{
368 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
369 SortCoordinate coordinate = NULL;
370 double reltuples = 0;
371
372 /*
373 * We size the sort area as maintenance_work_mem rather than work_mem to
374 * speed index creation. This should be OK since a single backend can't
375 * run multiple index creations in parallel (see also: notes on
376 * parallelism and maintenance_work_mem below).
377 */
378 btspool->heap = heap;
379 btspool->index = index;
380 btspool->isunique = indexInfo->ii_Unique;
381 btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
382
383 /* Save as primary spool */
384 buildstate->spool = btspool;
385
386 /* Report table scan phase started */
389
390 /* Attempt to launch parallel worker scan when required */
391 if (indexInfo->ii_ParallelWorkers > 0)
392 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
393 indexInfo->ii_ParallelWorkers);
394
395 /*
396 * If parallel build requested and at least one worker process was
397 * successfully launched, set up coordination state
398 */
399 if (buildstate->btleader)
400 {
401 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
402 coordinate->isWorker = false;
403 coordinate->nParticipants =
404 buildstate->btleader->nparticipanttuplesorts;
405 coordinate->sharedsort = buildstate->btleader->sharedsort;
406 }
407
408 /*
409 * Begin serial/leader tuplesort.
410 *
411 * In cases where parallelism is involved, the leader receives the same
412 * share of maintenance_work_mem as a serial sort (it is generally treated
413 * in the same way as a serial sort once we return). Parallel worker
414 * Tuplesortstates will have received only a fraction of
415 * maintenance_work_mem, though.
416 *
417 * We rely on the lifetime of the Leader Tuplesortstate almost not
418 * overlapping with any worker Tuplesortstate's lifetime. There may be
419 * some small overlap, but that's okay because we rely on leader
420 * Tuplesortstate only allocating a small, fixed amount of memory here.
421 * When its tuplesort_performsort() is called (by our caller), and
422 * significant amounts of memory are likely to be used, all workers must
423 * have already freed almost all memory held by their Tuplesortstates
424 * (they are about to go away completely, too). The overall effect is
425 * that maintenance_work_mem always represents an absolute high watermark
426 * on the amount of memory used by a CREATE INDEX operation, regardless of
427 * the use of parallelism or any other factor.
428 */
429 buildstate->spool->sortstate =
430 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
431 buildstate->nulls_not_distinct,
432 maintenance_work_mem, coordinate,
434
435 /*
436 * If building a unique index, put dead tuples in a second spool to keep
437 * them out of the uniqueness check. We expect that the second spool (for
438 * dead tuples) won't get very full, so we give it only work_mem.
439 */
440 if (indexInfo->ii_Unique)
441 {
442 BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
443 SortCoordinate coordinate2 = NULL;
444
445 /* Initialize secondary spool */
446 btspool2->heap = heap;
447 btspool2->index = index;
448 btspool2->isunique = false;
449 /* Save as secondary spool */
450 buildstate->spool2 = btspool2;
451
452 if (buildstate->btleader)
453 {
454 /*
455 * Set up non-private state that is passed to
456 * tuplesort_begin_index_btree() about the basic high level
457 * coordination of a parallel sort.
458 */
459 coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
460 coordinate2->isWorker = false;
461 coordinate2->nParticipants =
462 buildstate->btleader->nparticipanttuplesorts;
463 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464 }
465
466 /*
467 * We expect that the second one (for dead tuples) won't get very
468 * full, so we give it only work_mem
469 */
470 buildstate->spool2->sortstate =
471 tuplesort_begin_index_btree(heap, index, false, false, work_mem,
472 coordinate2, TUPLESORT_NONE);
473 }
474
475 /* Fill spool using either serial or parallel heap scan */
476 if (!buildstate->btleader)
477 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
478 _bt_build_callback, buildstate,
479 NULL);
480 else
481 reltuples = _bt_parallel_heapscan(buildstate,
482 &indexInfo->ii_BrokenHotChain);
483
484 /*
485 * Set the progress target for the next phase. Reset the block number
486 * values set by table_index_build_scan
487 */
488 {
489 const int progress_index[] = {
493 };
494 const int64 progress_vals[] = {
495 buildstate->indtuples,
496 0, 0
497 };
498
499 pgstat_progress_update_multi_param(3, progress_index, progress_vals);
500 }
501
502 /* okay, all heap tuples are spooled */
503 if (buildstate->spool2 && !buildstate->havedead)
504 {
505 /* spool2 turns out to be unnecessary */
506 _bt_spooldestroy(buildstate->spool2);
507 buildstate->spool2 = NULL;
508 }
509
510 return reltuples;
511}
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1152
static double _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
Definition: nbtsort.c:1655
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Definition: nbtsort.c:1397
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:517
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:89
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:125
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:124
bool ii_Unique
Definition: execnodes.h:209
bool ii_NullsNotDistinct
Definition: execnodes.h:210
int ii_ParallelWorkers
Definition: execnodes.h:218
Definition: type.h:96
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:62

References _bt_begin_parallel(), _bt_build_callback(), _bt_parallel_heapscan(), _bt_spooldestroy(), BTBuildState::btleader, BTBuildState::havedead, BTSpool::heap, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_ParallelWorkers, IndexInfo::ii_Unique, BTSpool::index, BTBuildState::indtuples, BTSpool::isunique, BTBuildState::isunique, SortCoordinateData::isWorker, maintenance_work_mem, SortCoordinateData::nParticipants, BTLeader::nparticipanttuplesorts, BTSpool::nulls_not_distinct, BTBuildState::nulls_not_distinct, palloc0(), pgstat_progress_update_multi_param(), pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_CREATEIDX_TUPLES_TOTAL, PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, BTLeader::sharedsort, SortCoordinateData::sharedsort, BTLeader::sharedsort2, BTSpool::sortstate, BTBuildState::spool, BTBuildState::spool2, table_index_build_scan(), tuplesort_begin_index_btree(), TUPLESORT_NONE, and work_mem.

Referenced by btbuild().

◆ _bt_uppershutdown()

static void _bt_uppershutdown ( BTWriteState * wstate,
BTPageState * state 
)
static

Definition at line 1065 of file nbtsort.c.

1066{
1067 BTPageState *s;
1068 BlockNumber rootblkno = P_NONE;
1069 uint32 rootlevel = 0;
1070 BulkWriteBuffer metabuf;
1071
1072 /*
1073 * Each iteration of this loop completes one more level of the tree.
1074 */
1075 for (s = state; s != NULL; s = s->btps_next)
1076 {
1077 BlockNumber blkno;
1078 BTPageOpaque opaque;
1079
1080 blkno = s->btps_blkno;
1081 opaque = BTPageGetOpaque((Page) s->btps_buf);
1082
1083 /*
1084 * We have to link the last page on this level to somewhere.
1085 *
1086 * If we're at the top, it's the root, so attach it to the metapage.
1087 * Otherwise, add an entry for it to its parent using its low key.
1088 * This may cause the last page of the parent level to split, but
1089 * that's not a problem -- we haven't gotten to it yet.
1090 */
1091 if (s->btps_next == NULL)
1092 {
1093 opaque->btpo_flags |= BTP_ROOT;
1094 rootblkno = blkno;
1095 rootlevel = s->btps_level;
1096 }
1097 else
1098 {
1101 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1102 P_LEFTMOST(opaque));
1103 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1104 !P_LEFTMOST(opaque));
1106 _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1107 pfree(s->btps_lowkey);
1108 s->btps_lowkey = NULL;
1109 }
1110
1111 /*
1112 * This is the rightmost page, so the ItemId array needs to be slid
1113 * back one slot. Then we can dump out the page.
1114 */
1116 _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1117 s->btps_buf = NULL; /* writepage took ownership of the buffer */
1118 }
1119
1120 /*
1121 * As the last step in the process, construct the metapage and make it
1122 * point to the new root (unless we had no data at all, in which case it's
1123 * set to point to "P_NONE"). This changes the index to the "valid" state
1124 * by filling in a valid magic number in the metapage.
1125 */
1126 metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1127 _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1128 wstate->inskey->allequalimage);
1129 _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1130}
uint32_t uint32
Definition: c.h:502
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
#define BTP_ROOT
Definition: nbtree.h:77
static void _bt_slideleft(Page rightmostpage)
Definition: nbtsort.c:685
IndexTuple btps_lowkey
Definition: nbtsort.c:235
BulkWriteBuffer btps_buf
Definition: nbtsort.c:233
BlockNumber btps_blkno
Definition: nbtsort.c:234
struct BTPageState * btps_next
Definition: nbtsort.c:240
uint32 btps_level
Definition: nbtsort.c:238

References _bt_blwritepage(), _bt_buildadd(), _bt_initmetapage(), _bt_slideleft(), BTScanInsertData::allequalimage, Assert(), BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageState::btps_blkno, BTPageState::btps_buf, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTREE_METAPAGE, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTWriteState::bulkstate, BTWriteState::index, IndexRelationGetNumberOfKeyAttributes, BTWriteState::inskey, P_LEFTMOST, P_NONE, pfree(), and smgr_bulk_get_buf().

Referenced by _bt_load().

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
IndexInfo * indexInfo 
)

Definition at line 295 of file nbtsort.c.

296{
297 IndexBuildResult *result;
298 BTBuildState buildstate;
299 double reltuples;
300
301#ifdef BTREE_BUILD_STATS
303 ResetUsage();
304#endif /* BTREE_BUILD_STATS */
305
306 buildstate.isunique = indexInfo->ii_Unique;
307 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
308 buildstate.havedead = false;
309 buildstate.heap = heap;
310 buildstate.spool = NULL;
311 buildstate.spool2 = NULL;
312 buildstate.indtuples = 0;
313 buildstate.btleader = NULL;
314
315 /*
316 * We expect to be called exactly once for any index relation. If that's
317 * not the case, big trouble's what we have.
318 */
320 elog(ERROR, "index \"%s\" already contains data",
322
323 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
324
325 /*
326 * Finish the build by (1) completing the sort of the spool file, (2)
327 * inserting the sorted tuples into btree pages and (3) building the upper
328 * levels. Finally, it may also be necessary to end use of parallelism.
329 */
330 _bt_leafbuild(buildstate.spool, buildstate.spool2);
331 _bt_spooldestroy(buildstate.spool);
332 if (buildstate.spool2)
333 _bt_spooldestroy(buildstate.spool2);
334 if (buildstate.btleader)
335 _bt_end_parallel(buildstate.btleader);
336
337 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
338
339 result->heap_tuples = reltuples;
340 result->index_tuples = buildstate.indtuples;
341
342#ifdef BTREE_BUILD_STATS
344 {
345 ShowUsage("BTREE BUILD STATS");
346 ResetUsage();
347 }
348#endif /* BTREE_BUILD_STATS */
349
350 return result;
351}
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:275
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:538
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:365
#define RelationGetRelationName(relation)
Definition: rel.h:546
double heap_tuples
Definition: genam.h:55
double index_tuples
Definition: genam.h:56

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), BTBuildState::btleader, elog, ERROR, BTBuildState::havedead, BTBuildState::heap, IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, BTBuildState::indtuples, BTBuildState::isunique, log_btree_build_stats, BTBuildState::nulls_not_distinct, palloc(), RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

Referenced by bthandler().