PostgreSQL Source Code  git master
nbtsort.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/sortsupport.h"
#include "utils/tuplesort.h"
Include dependency graph for nbtsort.c:

Go to the source code of this file.

Data Structures

struct  BTSpool
 
struct  BTShared
 
struct  BTLeader
 
struct  BTBuildState
 
struct  BTPageState
 
struct  BTWriteState
 

Macros

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)
 
#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)
 
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
 
#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)
 
#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)
 
#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)
 
#define ParallelTableScanFromBTShared(shared)   (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 

Typedefs

typedef struct BTSpool BTSpool
 
typedef struct BTShared BTShared
 
typedef struct BTLeader BTLeader
 
typedef struct BTBuildState BTBuildState
 
typedef struct BTPageState BTPageState
 
typedef struct BTWriteState BTWriteState
 

Functions

static double _bt_spools_heapscan (Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
 
static void _bt_spooldestroy (BTSpool *btspool)
 
static void _bt_spool (BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
 
static void _bt_leafbuild (BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_build_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 
static Page _bt_blnewpage (uint32 level)
 
static BTPageState * _bt_pagestate (BTWriteState *wstate, uint32 level)
 
static void _bt_slideleft (Page rightmostpage)
 
static void _bt_sortaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_buildadd (BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
 
static void _bt_sort_dedup_finish_pending (BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
 
static void _bt_uppershutdown (BTWriteState *wstate, BTPageState *state)
 
static void _bt_load (BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_begin_parallel (BTBuildState *buildstate, bool isconcurrent, int request)
 
static void _bt_end_parallel (BTLeader *btleader)
 
static Size _bt_parallel_estimate_shared (Relation heap, Snapshot snapshot)
 
static double _bt_parallel_heapscan (BTBuildState *buildstate, bool *brokenhotchain)
 
static void _bt_leader_participate_as_worker (BTBuildState *buildstate)
 
static void _bt_parallel_scan_and_sort (BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
 
IndexBuildResult * btbuild (Relation heap, Relation index, IndexInfo *indexInfo)
 
static void _bt_blwritepage (BTWriteState *wstate, Page page, BlockNumber blkno)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ PARALLEL_KEY_BTREE_SHARED

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)

Definition at line 69 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ PARALLEL_KEY_BUFFER_USAGE

#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)

Definition at line 74 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ PARALLEL_KEY_QUERY_TEXT

#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)

Definition at line 72 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ PARALLEL_KEY_TUPLESORT

#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)

Definition at line 70 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ PARALLEL_KEY_TUPLESORT_SPOOL2

#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)

Definition at line 71 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ PARALLEL_KEY_WAL_USAGE

#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)

Definition at line 73 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_build_main().

◆ ParallelTableScanFromBTShared

#define ParallelTableScanFromBTShared(shared)   (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))

Definition at line 164 of file nbtsort.c.

Referenced by _bt_begin_parallel(), and _bt_parallel_scan_and_sort().
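Usage sketch (hypothetical, not verbatim from nbtsort.c): a parallel worker that has attached to the build's DSM segment can look up the shared state by key and then derive its parallel scan descriptor with this macro, which is the pattern _bt_parallel_build_main() and _bt_parallel_scan_and_sort() follow:

    BTShared   *btshared;
    ParallelTableScanDesc pscan;

    btshared = (BTShared *) shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    pscan = ParallelTableScanFromBTShared(btshared);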

Typedef Documentation

◆ BTBuildState

typedef struct BTBuildState BTBuildState

◆ BTLeader

typedef struct BTLeader BTLeader

◆ BTPageState

typedef struct BTPageState BTPageState

◆ BTShared

typedef struct BTShared BTShared

◆ BTSpool

typedef struct BTSpool BTSpool

◆ BTWriteState

typedef struct BTWriteState BTWriteState

Function Documentation

◆ _bt_begin_parallel()

static void _bt_begin_parallel ( BTBuildState *buildstate,
bool  isconcurrent,
int  request 
)
static

Definition at line 1454 of file nbtsort.c.

References _bt_end_parallel(), _bt_leader_participate_as_worker(), _bt_parallel_estimate_shared(), Assert, BTShared::brokenhotchain, BTBuildState::btleader, BTLeader::btshared, BTLeader::bufferusage, ConditionVariableInit(), CreateParallelContext(), debug_query_string, DestroyParallelContext(), EnterParallelMode(), ParallelContext::estimator, ExitParallelMode(), GetTransactionSnapshot(), BTShared::havedead, BTSpool::heap, BTShared::heaprelid, BTSpool::index, BTShared::indexrelid, BTShared::indtuples, InitializeParallelDSM(), BTShared::isconcurrent, IsMVCCSnapshot, BTSpool::isunique, BTShared::isunique, LaunchParallelWorkers(), mul_size(), BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, ParallelContext::nworkers, ParallelContext::nworkers_launched, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelTableScanFromBTShared, BTLeader::pcxt, RegisterSnapshot(), RelationGetRelid, BTShared::reltuples, BTShared::scantuplesortstates, ParallelContext::seg, BTLeader::sharedsort, BTLeader::sharedsort2, shm_toc_allocate(), shm_toc_estimate_chunk, shm_toc_estimate_keys, shm_toc_insert(), BTLeader::snapshot, SnapshotAny, SpinLockInit, BTBuildState::spool, table_parallelscan_initialize(), ParallelContext::toc, tuplesort_estimate_shared(), tuplesort_initialize_shared(), UnregisterSnapshot(), WaitForParallelWorkersToAttach(), BTLeader::walusage, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

1455 {
1456  ParallelContext *pcxt;
1457  int scantuplesortstates;
1458  Snapshot snapshot;
1459  Size estbtshared;
1460  Size estsort;
1461  BTShared *btshared;
1462  Sharedsort *sharedsort;
1463  Sharedsort *sharedsort2;
1464  BTSpool *btspool = buildstate->spool;
1465  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1466  WalUsage *walusage;
1467  BufferUsage *bufferusage;
1468  bool leaderparticipates = true;
1469  char *sharedquery;
1470  int querylen;
1471 
1472 #ifdef DISABLE_LEADER_PARTICIPATION
1473  leaderparticipates = false;
1474 #endif
1475 
1476  /*
1477  * Enter parallel mode, and create context for parallel build of btree
1478  * index
1479  */
1480  EnterParallelMode();
1481  Assert(request > 0);
1482  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1483  request);
1484 
1485  scantuplesortstates = leaderparticipates ? request + 1 : request;
1486 
1487  /*
1488  * Prepare for scan of the base relation. In a normal index build, we use
1489  * SnapshotAny because we must retrieve all tuples and do our own time
1490  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1491  * concurrent build, we take a regular MVCC snapshot and index whatever's
1492  * live according to that.
1493  */
1494  if (!isconcurrent)
1495  snapshot = SnapshotAny;
1496  else
1497  snapshot = RegisterSnapshot(GetTransactionSnapshot());
1498 
1499  /*
1500  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1501  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1502  */
1503  estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1504  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1505  estsort = tuplesort_estimate_shared(scantuplesortstates);
1506  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1507 
1508  /*
1509  * Unique case requires a second spool, and so we may have to account for
1510  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1511  */
1512  if (!btspool->isunique)
1513  shm_toc_estimate_keys(&pcxt->estimator, 2);
1514  else
1515  {
1516  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1517  shm_toc_estimate_keys(&pcxt->estimator, 3);
1518  }
1519 
1520  /*
1521  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1522  * and PARALLEL_KEY_BUFFER_USAGE.
1523  *
1524  * If there are no extensions loaded that care, we could skip this. We
1525  * have no way of knowing whether anyone's looking at pgWalUsage or
1526  * pgBufferUsage, so do it unconditionally.
1527  */
1528  shm_toc_estimate_chunk(&pcxt->estimator,
1529  mul_size(sizeof(WalUsage), pcxt->nworkers));
1530  shm_toc_estimate_keys(&pcxt->estimator, 1);
1531  shm_toc_estimate_chunk(&pcxt->estimator,
1532  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1533  shm_toc_estimate_keys(&pcxt->estimator, 1);
1534 
1535  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1536  querylen = strlen(debug_query_string);
1537  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1538  shm_toc_estimate_keys(&pcxt->estimator, 1);
1539 
1540  /* Everyone's had a chance to ask for space, so now create the DSM */
1541  InitializeParallelDSM(pcxt);
1542 
1543  /* If no DSM segment was available, back out (do serial build) */
1544  if (pcxt->seg == NULL)
1545  {
1546  if (IsMVCCSnapshot(snapshot))
1547  UnregisterSnapshot(snapshot);
1548  DestroyParallelContext(pcxt);
1549  ExitParallelMode();
1550  return;
1551  }
1552 
1553  /* Store shared build state, for which we reserved space */
1554  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1555  /* Initialize immutable state */
1556  btshared->heaprelid = RelationGetRelid(btspool->heap);
1557  btshared->indexrelid = RelationGetRelid(btspool->index);
1558  btshared->isunique = btspool->isunique;
1559  btshared->isconcurrent = isconcurrent;
1560  btshared->scantuplesortstates = scantuplesortstates;
1561  ConditionVariableInit(&btshared->workersdonecv);
1562  SpinLockInit(&btshared->mutex);
1563  /* Initialize mutable state */
1564  btshared->nparticipantsdone = 0;
1565  btshared->reltuples = 0.0;
1566  btshared->havedead = false;
1567  btshared->indtuples = 0.0;
1568  btshared->brokenhotchain = false;
1569  table_parallelscan_initialize(btspool->heap,
1570  ParallelTableScanFromBTShared(btshared),
1571  snapshot);
1572 
1573  /*
1574  * Store shared tuplesort-private state, for which we reserved space.
1575  * Then, initialize opaque state using tuplesort routine.
1576  */
1577  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1578  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1579  pcxt->seg);
1580 
1581  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1582  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1583 
1584  /* Unique case requires a second spool, and associated shared state */
1585  if (!btspool->isunique)
1586  sharedsort2 = NULL;
1587  else
1588  {
1589  /*
1590  * Store additional shared tuplesort-private state, for which we
1591  * reserved space. Then, initialize opaque state using tuplesort
1592  * routine.
1593  */
1594  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1595  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1596  pcxt->seg);
1597 
1598  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1599  }
1600 
1601  /* Store query string for workers */
1602  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1603  memcpy(sharedquery, debug_query_string, querylen + 1);
1604  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1605 
1606  /*
1607  * Allocate space for each worker's WalUsage and BufferUsage; no need to
1608  * initialize.
1609  */
1610  walusage = shm_toc_allocate(pcxt->toc,
1611  mul_size(sizeof(WalUsage), pcxt->nworkers));
1612  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1613  bufferusage = shm_toc_allocate(pcxt->toc,
1614  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1615  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1616 
1617  /* Launch workers, saving status for leader/caller */
1618  LaunchParallelWorkers(pcxt);
1619  btleader->pcxt = pcxt;
1620  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1621  if (leaderparticipates)
1622  btleader->nparticipanttuplesorts++;
1623  btleader->btshared = btshared;
1624  btleader->sharedsort = sharedsort;
1625  btleader->sharedsort2 = sharedsort2;
1626  btleader->snapshot = snapshot;
1627  btleader->walusage = walusage;
1628  btleader->bufferusage = bufferusage;
1629 
1630  /* If no workers were successfully launched, back out (do serial build) */
1631  if (pcxt->nworkers_launched == 0)
1632  {
1633  _bt_end_parallel(btleader);
1634  return;
1635  }
1636 
1637  /* Save leader state now that it's clear build will be parallel */
1638  buildstate->btleader = btleader;
1639 
1640  /* Join heap scan ourselves */
1641  if (leaderparticipates)
1642  _bt_leader_participate_as_worker(buildstate);
1643 
1644  /*
1645  * Caller needs to wait for all launched workers when we return. Make
1646  * sure that the failure-to-start case will not hang forever.
1647  */
1648  WaitForParallelWorkersToAttach(pcxt);
1649 }
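The function follows the standard shm_toc protocol: every chunk and key is estimated before InitializeParallelDSM() sizes the DSM segment, after which each chunk is allocated and published under its PARALLEL_KEY_* key. A condensed sketch of that pattern for the main chunk (variable names as in the listing above; error paths omitted):

    /* estimation phase -- must complete before InitializeParallelDSM() */
    shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    InitializeParallelDSM(pcxt);    /* creates the segment and its table of contents */

    /* allocation phase -- carve out the chunk and publish it under its key */
    btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);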

◆ _bt_blnewpage()

static Page _bt_blnewpage ( uint32  level)
static

Definition at line 611 of file nbtsort.c.

References _bt_pageinit(), BTP_LEAF, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTPageOpaqueData::level, P_NONE, PageGetSpecialPointer, and palloc().

Referenced by _bt_buildadd(), and _bt_pagestate().

612 {
613  Page page;
614  BTPageOpaque opaque;
615 
616  page = (Page) palloc(BLCKSZ);
617 
618  /* Zero the page and set up standard page header info */
619  _bt_pageinit(page, BLCKSZ);
620 
621  /* Initialize BT opaque state */
622  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
623  opaque->btpo_prev = opaque->btpo_next = P_NONE;
624  opaque->btpo.level = level;
625  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
626  opaque->btpo_cycleid = 0;
627 
628  /* Make the P_HIKEY line pointer appear allocated */
629  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
630 
631  return page;
632 }
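A page returned by this function can be sanity-checked as follows (a minimal sketch that merely restates what the listing establishes):

    Page         page = _bt_blnewpage(0);    /* level 0 => leaf page */
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    Assert(opaque->btpo_flags & BTP_LEAF);
    Assert(opaque->btpo.level == 0);
    /* pd_lower already covers the reserved P_HIKEY line pointer, so the
     * first real tuple added by _bt_sortaddtup() lands at P_FIRSTKEY */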

◆ _bt_blwritepage()

static void _bt_blwritepage ( BTWriteState *wstate,
Page  page,
BlockNumber  blkno 
)
static

Definition at line 638 of file nbtsort.c.

References BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTWriteState::index, log_newpage(), MAIN_FORKNUM, PageSetChecksumInplace(), palloc0(), pfree(), RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, smgrextend(), and smgrwrite().

Referenced by _bt_buildadd(), and _bt_uppershutdown().

639 {
640  /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
641  RelationOpenSmgr(wstate->index);
642 
643  /* XLOG stuff */
644  if (wstate->btws_use_wal)
645  {
646  /* We use the XLOG_FPI record type for this */
647  log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
648  }
649 
650  /*
651  * If we have to write pages nonsequentially, fill in the space with
652  * zeroes until we come back and overwrite. This is not logically
653  * necessary on standard Unix filesystems (unwritten space will read as
654  * zeroes anyway), but it should help to avoid fragmentation. The dummy
655  * pages aren't WAL-logged though.
656  */
657  while (blkno > wstate->btws_pages_written)
658  {
659  if (!wstate->btws_zeropage)
660  wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
661  /* don't set checksum for all-zero page */
662  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
663  wstate->btws_pages_written++,
664  (char *) wstate->btws_zeropage,
665  true);
666  }
667 
668  PageSetChecksumInplace(page, blkno);
669 
670  /*
671  * Now write the page. There's no need for smgr to schedule an fsync for
672  * this write; we'll do it ourselves before ending the build.
673  */
674  if (blkno == wstate->btws_pages_written)
675  {
676  /* extending the file... */
677  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
678  (char *) page, true);
679  wstate->btws_pages_written++;
680  }
681  else
682  {
683  /* overwriting a block we zero-filled before */
684  smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
685  (char *) page, true);
686  }
687 
688  pfree(page);
689 }

◆ _bt_build_callback()

static void _bt_build_callback ( Relation  index,
ItemPointer  tid,
Datum *values,
bool *isnull,
bool  tupleIsAlive,
void *  state 
)
static

Definition at line 582 of file nbtsort.c.

References _bt_spool(), BTBuildState::havedead, BTBuildState::indtuples, BTBuildState::spool, and BTBuildState::spool2.

Referenced by _bt_parallel_scan_and_sort(), and _bt_spools_heapscan().

588 {
589  BTBuildState *buildstate = (BTBuildState *) state;
590 
591  /*
592  * insert the index tuple into the appropriate spool file for subsequent
593  * processing
594  */
595  if (tupleIsAlive || buildstate->spool2 == NULL)
596  _bt_spool(buildstate->spool, tid, values, isnull);
597  else
598  {
599  /* dead tuples are put into spool2 */
600  buildstate->havedead = true;
601  _bt_spool(buildstate->spool2, tid, values, isnull);
602  }
603 
604  buildstate->indtuples += 1;
605 }
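This callback is not invoked directly by nbtsort.c; _bt_spools_heapscan() hands it to table_index_build_scan(), which calls it once per heap tuple. A simplified sketch of that wiring (progress reporting and the parallel case omitted):

    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       _bt_build_callback, (void *) buildstate,
                                       NULL);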

◆ _bt_buildadd()

static void _bt_buildadd ( BTWriteState *wstate,
BTPageState *state,
IndexTuple  itup,
Size  truncextra 
)
static

Definition at line 834 of file nbtsort.c.

References _bt_blnewpage(), _bt_blwritepage(), _bt_check_third_page(), _bt_pagestate(), _bt_sortaddtup(), _bt_truncate(), Assert, BTMaxItemSize, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTPageState::btps_blkno, BTPageState::btps_lastextra, BTPageState::btps_lastoff, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTPageState::btps_page, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), BTWriteState::btws_pages_alloced, CHECK_FOR_INTERRUPTS, CopyIndexTuple(), elog, ERROR, BTWriteState::heap, BTWriteState::index, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, BTWriteState::inskey, ItemIdGetLength, ItemIdSetUnused, MAXALIGN, OffsetNumberNext, OffsetNumberPrev, P_FIRSTKEY, P_HIKEY, P_LEFTMOST, P_NONE, PageGetFreeSpace(), PageGetItem, PageGetItemId, PageGetSpecialPointer, PageIndexTupleOverwrite(), palloc0(), pfree(), IndexTupleData::t_info, and unlikely.

Referenced by _bt_load(), _bt_sort_dedup_finish_pending(), and _bt_uppershutdown().

836 {
837  Page npage;
838  BlockNumber nblkno;
839  OffsetNumber last_off;
840  Size last_truncextra;
841  Size pgspc;
842  Size itupsz;
843  bool isleaf;
844 
845  /*
846  * This is a handy place to check for cancel interrupts during the btree
847  * load phase of index creation.
848  */
849  CHECK_FOR_INTERRUPTS();
850 
851  npage = state->btps_page;
852  nblkno = state->btps_blkno;
853  last_off = state->btps_lastoff;
854  last_truncextra = state->btps_lastextra;
855  state->btps_lastextra = truncextra;
856 
857  pgspc = PageGetFreeSpace(npage);
858  itupsz = IndexTupleSize(itup);
859  itupsz = MAXALIGN(itupsz);
860  /* Leaf case has slightly different rules due to suffix truncation */
861  isleaf = (state->btps_level == 0);
862 
863  /*
864  * Check whether the new item can fit on a btree page on current level at
865  * all.
866  *
867  * Every newly built index will treat heap TID as part of the keyspace,
868  * which imposes the requirement that new high keys must occasionally have
869  * a heap TID appended within _bt_truncate(). That may leave a new pivot
870  * tuple one or two MAXALIGN() quantums larger than the original
871  * firstright tuple it's derived from. v4 deals with the problem by
872  * decreasing the limit on the size of tuples inserted on the leaf level
873  * by the same small amount. Enforce the new v4+ limit on the leaf level,
874  * and the old limit on internal levels, since pivot tuples may need to
875  * make use of the reserved space. This should never fail on internal
876  * pages.
877  */
878  if (unlikely(itupsz > BTMaxItemSize(npage)))
879  _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
880  itup);
881 
882  /*
883  * Check to see if current page will fit new item, with space left over to
884  * append a heap TID during suffix truncation when page is a leaf page.
885  *
886  * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
887  * high key with heap TID when finishing off a leaf page, since we rely on
888  * _bt_check_third_page() rejecting oversized non-pivot tuples. On
889  * internal pages we can always fit 3 pivot tuples with larger internal
890  * page tuple limit (includes page high key).
891  *
892  * Most of the time, a page is only "full" in the sense that the soft
893  * fillfactor-wise limit has been exceeded. However, we must always leave
894  * at least two items plus a high key on each page before starting a new
895  * page. Disregard fillfactor and insert on "full" current page if we
896  * don't have the minimum number of items yet. (Note that we deliberately
897  * assume that suffix truncation neither enlarges nor shrinks new high key
898  * when applying soft limit, except when last tuple has a posting list.)
899  */
900  Assert(last_truncextra == 0 || isleaf);
901  if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
902  (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
903  {
904  /*
905  * Finish off the page and write it out.
906  */
907  Page opage = npage;
908  BlockNumber oblkno = nblkno;
909  ItemId ii;
910  ItemId hii;
911  IndexTuple oitup;
912 
913  /* Create new page of same level */
914  npage = _bt_blnewpage(state->btps_level);
915 
916  /* and assign it a page position */
917  nblkno = wstate->btws_pages_alloced++;
918 
919  /*
920  * We copy the last item on the page into the new page, and then
921  * rearrange the old page so that the 'last item' becomes its high key
922  * rather than a true data item. There had better be at least two
923  * items on the page already, else the page would be empty of useful
924  * data.
925  */
926  Assert(last_off > P_FIRSTKEY);
927  ii = PageGetItemId(opage, last_off);
928  oitup = (IndexTuple) PageGetItem(opage, ii);
929  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
930  !isleaf);
931 
932  /*
933  * Move 'last' into the high key position on opage. _bt_blnewpage()
934  * allocated empty space for a line pointer when opage was first
935  * created, so this is a matter of rearranging already-allocated space
936  * on page, and initializing high key line pointer. (Actually, leaf
937  * pages must also swap oitup with a truncated version of oitup, which
938  * is sometimes larger than oitup, though never by more than the space
939  * needed to append a heap TID.)
940  */
941  hii = PageGetItemId(opage, P_HIKEY);
942  *hii = *ii;
943  ItemIdSetUnused(ii); /* redundant */
944  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
945 
946  if (isleaf)
947  {
948  IndexTuple lastleft;
949  IndexTuple truncated;
950 
951  /*
952  * Truncate away any unneeded attributes from high key on leaf
953  * level. This is only done at the leaf level because downlinks
954  * in internal pages are either negative infinity items, or get
955  * their contents from copying from one level down. See also:
956  * _bt_split().
957  *
958  * We don't try to bias our choice of split point to make it more
959  * likely that _bt_truncate() can truncate away more attributes,
960  * whereas the split point used within _bt_split() is chosen much
961  * more delicately. Even still, the lastleft and firstright
962  * tuples passed to _bt_truncate() here are at least not fully
963  * equal to each other when deduplication is used, unless there is
964  * a large group of duplicates (also, unique index builds usually
965  * have few or no spool2 duplicates). When the split point is
966  * between two unequal tuples, _bt_truncate() will avoid including
967  * a heap TID in the new high key, which is the most important
968  * benefit of suffix truncation.
969  *
970  * Overwrite the old item with new truncated high key directly.
971  * oitup is already located at the physical beginning of tuple
972  * space, so this should directly reuse the existing tuple space.
973  */
974  ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
975  lastleft = (IndexTuple) PageGetItem(opage, ii);
976 
977  Assert(IndexTupleSize(oitup) > last_truncextra);
978  truncated = _bt_truncate(wstate->index, lastleft, oitup,
979  wstate->inskey);
980  if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
981  IndexTupleSize(truncated)))
982  elog(ERROR, "failed to add high key to the index page");
983  pfree(truncated);
984 
985  /* oitup should continue to point to the page's high key */
986  hii = PageGetItemId(opage, P_HIKEY);
987  oitup = (IndexTuple) PageGetItem(opage, hii);
988  }
989 
990  /*
991  * Link the old page into its parent, using its low key. If we don't
992  * have a parent, we have to create one; this adds a new btree level.
993  */
994  if (state->btps_next == NULL)
995  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
996 
997  Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
998  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
999  BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
1000  P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(state->btps_page)));
1001  Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1002  !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(state->btps_page)));
1003  BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1004  _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1005  pfree(state->btps_lowkey);
1006 
1007  /*
1008  * Save a copy of the high key from the old page. It is also the low
1009  * key for the new page.
1010  */
1011  state->btps_lowkey = CopyIndexTuple(oitup);
1012 
1013  /*
1014  * Set the sibling links for both pages.
1015  */
1016  {
1017  BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1018  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1019 
1020  oopaque->btpo_next = nblkno;
1021  nopaque->btpo_prev = oblkno;
1022  nopaque->btpo_next = P_NONE; /* redundant */
1023  }
1024 
1025  /*
1026  * Write out the old page. We never need to touch it again, so we can
1027  * free the opage workspace too.
1028  */
1029  _bt_blwritepage(wstate, opage, oblkno);
1030 
1031  /*
1032  * Reset last_off to point to new page
1033  */
1034  last_off = P_FIRSTKEY;
1035  }
1036 
1037  /*
1038  * By here, either original page is still the current page, or a new page
1039  * was created that became the current page. Either way, the current page
1040  * definitely has space for new item.
1041  *
1042  * If the new item is the first for its page, it must also be the first
1043  * item on its entire level. On later same-level pages, a low key for a
1044  * page will be copied from the prior page in the code above. Generate a
1045  * minus infinity low key here instead.
1046  */
1047  if (last_off == P_HIKEY)
1048  {
1049  Assert(state->btps_lowkey == NULL);
1050  state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1051  state->btps_lowkey->t_info = sizeof(IndexTupleData);
1052  BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1053  }
1054 
1055  /*
1056  * Add the new item into the current page.
1057  */
1058  last_off = OffsetNumberNext(last_off);
1059  _bt_sortaddtup(npage, itupsz, itup, last_off,
1060  !isleaf && last_off == P_FIRSTKEY);
1061 
1062  state->btps_page = npage;
1063  state->btps_blkno = nblkno;
1064  state->btps_lastoff = last_off;
1065 }
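The decision to finish off the current page distills to two conditions, shown here as a hypothetical boolean for clarity (names as in the listing; the first disjunct is the hard fit test with leaf-only space reserved for a heap TID, the second the soft fillfactor limit):

    bool must_finish_page =
        pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
        (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY);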

◆ _bt_end_parallel()

static void _bt_end_parallel ( BTLeader *btleader)
static

Definition at line 1655 of file nbtsort.c.

References BTLeader::bufferusage, DestroyParallelContext(), ExitParallelMode(), i, InstrAccumParallelQuery(), IsMVCCSnapshot, ParallelContext::nworkers_launched, BTLeader::pcxt, BTLeader::snapshot, UnregisterSnapshot(), WaitForParallelWorkersToFinish(), and BTLeader::walusage.

Referenced by _bt_begin_parallel(), and btbuild().

1656 {
1657  int i;
1658 
1659  /* Shutdown worker processes */
1660  WaitForParallelWorkersToFinish(btleader->pcxt);
1661 
1662  /*
1663  * Next, accumulate WAL usage. (This must wait for the workers to finish,
1664  * or we might get incomplete data.)
1665  */
1666  for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1667  InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1668 
1669  /* Free last reference to MVCC snapshot, if one was used */
1670  if (IsMVCCSnapshot(btleader->snapshot))
1671  UnregisterSnapshot(btleader->snapshot);
1672  DestroyParallelContext(btleader->pcxt);
1673  ExitParallelMode();
1674 }

◆ _bt_leader_participate_as_worker()

static void _bt_leader_participate_as_worker ( BTBuildState *buildstate)
static

Definition at line 1735 of file nbtsort.c.

References _bt_parallel_scan_and_sort(), BTBuildState::btleader, BTLeader::btshared, BTSpool::heap, BTSpool::index, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, BTLeader::nparticipanttuplesorts, palloc0(), ResetUsage(), BTLeader::sharedsort, BTLeader::sharedsort2, ShowUsage(), and BTBuildState::spool.

Referenced by _bt_begin_parallel().

1736 {
1737  BTLeader *btleader = buildstate->btleader;
1738  BTSpool *leaderworker;
1739  BTSpool *leaderworker2;
1740  int sortmem;
1741 
1742  /* Allocate memory and initialize private spool */
1743  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1744  leaderworker->heap = buildstate->spool->heap;
1745  leaderworker->index = buildstate->spool->index;
1746  leaderworker->isunique = buildstate->spool->isunique;
1747 
1748  /* Initialize second spool, if required */
1749  if (!btleader->btshared->isunique)
1750  leaderworker2 = NULL;
1751  else
1752  {
1753  /* Allocate memory for worker's own private secondary spool */
1754  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1755 
1756  /* Initialize worker's own secondary spool */
1757  leaderworker2->heap = leaderworker->heap;
1758  leaderworker2->index = leaderworker->index;
1759  leaderworker2->isunique = false;
1760  }
1761 
1762  /*
1763  * Might as well use reliable figure when doling out maintenance_work_mem
1764  * (when requested number of workers were not launched, this will be
1765  * somewhat higher than it is for other workers).
1766  */
1767  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1768 
1769  /* Perform work common to all participants */
1770  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1771  btleader->sharedsort, btleader->sharedsort2,
1772  sortmem, true);
1773 
1774 #ifdef BTREE_BUILD_STATS
1775  if (log_btree_build_stats)
1776  {
1777  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1778  ResetUsage();
1779  }
1780 #endif /* BTREE_BUILD_STATS */
1781 }
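Worked example of the sortmem split, with assumed figures: if maintenance_work_mem is 65536 kB and nparticipanttuplesorts is 3 (two launched workers plus the participating leader), each participant's tuplesort is given 65536 / 3 = 21845 kB:

    /* hypothetical figures illustrating the division above */
    int  sortmem = 65536 / 3;    /* => 21845 kB per participant */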

◆ _bt_leafbuild()

static void _bt_leafbuild ( BTSpool *btspool,
BTSpool *btspool2 
)
static

Definition at line 539 of file nbtsort.c.

References _bt_allequalimage(), _bt_load(), _bt_mkscankey(), BTScanInsertData::allequalimage, BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, BTWriteState::inskey, log_btree_build_stats, pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, RelationNeedsWAL, ResetUsage(), ShowUsage(), BTSpool::sortstate, and tuplesort_performsort().

Referenced by btbuild().

540 {
541  BTWriteState wstate;
542 
543 #ifdef BTREE_BUILD_STATS
544  if (log_btree_build_stats)
545  {
546  ShowUsage("BTREE BUILD (Spool) STATISTICS");
547  ResetUsage();
548  }
549 #endif /* BTREE_BUILD_STATS */
550 
551  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
552  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
553  tuplesort_performsort(btspool->sortstate);
554  if (btspool2)
555  {
556  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
557  PROGRESS_BTREE_PHASE_PERFORMSORT_2);
558  tuplesort_performsort(btspool2->sortstate);
559  }
560 
561  wstate.heap = btspool->heap;
562  wstate.index = btspool->index;
563  wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564  /* _bt_mkscankey() won't set allequalimage without metapage */
565  wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566  wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
567 
568  /* reserve the metapage */
569  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
570  wstate.btws_pages_written = 0;
571  wstate.btws_zeropage = NULL; /* until needed */
572 
573  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
574  PROGRESS_BTREE_PHASE_LEAF_LOAD);
575  _bt_load(&wstate, btspool, btspool2);
576 }
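For context, a condensed sketch of how btbuild() drives this function in a serial build (error handling, statistics, and progress reporting omitted):

    BTBuildState buildstate;
    double       reltuples;

    reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
    _bt_leafbuild(buildstate.spool, buildstate.spool2);
    _bt_spooldestroy(buildstate.spool);
    if (buildstate.spool2)
        _bt_spooldestroy(buildstate.spool2);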

◆ _bt_load()

static void _bt_load ( BTWriteState *wstate,
BTSpool *btspool,
BTSpool *btspool2 
)
static

Definition at line 1181 of file nbtsort.c.

References _bt_buildadd(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_keep_natts_fast(), _bt_pagestate(), _bt_sort_dedup_finish_pending(), _bt_uppershutdown(), SortSupportData::abbreviate, BTScanInsertData::allequalimage, ApplySortComparator(), Assert, AssertState, BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTGetDeduplicateItems, BTGreaterStrategyNumber, BTLessStrategyNumber, BTMaxItemSize, BTPageState::btps_page, BTWriteState::btws_use_wal, compare(), CopyIndexTuple(), CurrentMemoryContext, BTDedupStateData::deduplicate, BTDedupStateData::htids, i, BTWriteState::index, index_getattr, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, BTWriteState::inskey, InvalidOffsetNumber, BTSpool::isunique, ItemPointerCompare(), MAIN_FORKNUM, MAXALIGN_DOWN, BTDedupStateData::maxpostingsize, merge(), BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, palloc(), palloc0(), pfree(), pgstat_progress_update_param(), BTDedupStateData::phystupsize, PrepareSortSupportFromIndexRel(), PROGRESS_CREATEIDX_TUPLES_DONE, RelationData::rd_smgr, RelationGetDescr, RelationOpenSmgr, BTScanInsertData::scankeys, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, smgrimmedsync(), BTSpool::sortstate, SortSupportData::ssup_attno, SortSupportData::ssup_collation, SortSupportData::ssup_cxt, SortSupportData::ssup_nulls_first, IndexTupleData::t_tid, and tuplesort_getindextuple().

Referenced by _bt_leafbuild().

1182 {
1183  BTPageState *state = NULL;
1184  bool merge = (btspool2 != NULL);
1185  IndexTuple itup,
1186  itup2 = NULL;
1187  bool load1;
1188  TupleDesc tupdes = RelationGetDescr(wstate->index);
1189  int i,
1190  keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1191  SortSupport sortKeys;
1192  int64 tuples_done = 0;
1193  bool deduplicate;
1194 
1195  deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1196  BTGetDeduplicateItems(wstate->index);
1197 
1198  if (merge)
1199  {
1200  /*
1201  * Another BTSpool for dead tuples exists. Now we have to merge
1202  * btspool and btspool2.
1203  */
1204 
1205  /* the preparation of merge */
1206  itup = tuplesort_getindextuple(btspool->sortstate, true);
1207  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1208 
1209  /* Prepare SortSupport data for each column */
1210  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1211 
1212  for (i = 0; i < keysz; i++)
1213  {
1214  SortSupport sortKey = sortKeys + i;
1215  ScanKey scanKey = wstate->inskey->scankeys + i;
1216  int16 strategy;
1217 
1218  sortKey->ssup_cxt = CurrentMemoryContext;
1219  sortKey->ssup_collation = scanKey->sk_collation;
1220  sortKey->ssup_nulls_first =
1221  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1222  sortKey->ssup_attno = scanKey->sk_attno;
1223  /* Abbreviation is not supported here */
1224  sortKey->abbreviate = false;
1225 
1226  AssertState(sortKey->ssup_attno != 0);
1227 
1228  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1229  BTGreaterStrategyNumber : BTLessStrategyNumber;
1230 
1231  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1232  }
1233 
1234  for (;;)
1235  {
1236  load1 = true; /* load BTSpool next ? */
1237  if (itup2 == NULL)
1238  {
1239  if (itup == NULL)
1240  break;
1241  }
1242  else if (itup != NULL)
1243  {
1244  int32 compare = 0;
1245 
1246  for (i = 1; i <= keysz; i++)
1247  {
1248  SortSupport entry;
1249  Datum attrDatum1,
1250  attrDatum2;
1251  bool isNull1,
1252  isNull2;
1253 
1254  entry = sortKeys + i - 1;
1255  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1256  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1257 
1258  compare = ApplySortComparator(attrDatum1, isNull1,
1259  attrDatum2, isNull2,
1260  entry);
1261  if (compare > 0)
1262  {
1263  load1 = false;
1264  break;
1265  }
1266  else if (compare < 0)
1267  break;
1268  }
1269 
1270  /*
1271  * If key values are equal, we sort on ItemPointer. This is
1272  * required for btree indexes, since heap TID is treated as an
1273  * implicit last key attribute in order to ensure that all
1274  * keys in the index are physically unique.
1275  */
1276  if (compare == 0)
1277  {
1278  compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1279  Assert(compare != 0);
1280  if (compare > 0)
1281  load1 = false;
1282  }
1283  }
1284  else
1285  load1 = false;
1286 
1287  /* When we see first tuple, create first index page */
1288  if (state == NULL)
1289  state = _bt_pagestate(wstate, 0);
1290 
1291  if (load1)
1292  {
1293  _bt_buildadd(wstate, state, itup, 0);
1294  itup = tuplesort_getindextuple(btspool->sortstate, true);
1295  }
1296  else
1297  {
1298  _bt_buildadd(wstate, state, itup2, 0);
1299  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1300  }
1301 
1302  /* Report progress */
1303  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1304  ++tuples_done);
1305  }
1306  pfree(sortKeys);
1307  }
1308  else if (deduplicate)
1309  {
1310  /* merge is unnecessary, deduplicate into posting lists */
1311  BTDedupState dstate;
1312 
1313  dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1314  dstate->deduplicate = true; /* unused */
1315  dstate->nmaxitems = 0; /* unused */
1316  dstate->maxpostingsize = 0; /* set later */
1317  /* Metadata about base tuple of current pending posting list */
1318  dstate->base = NULL;
1319  dstate->baseoff = InvalidOffsetNumber; /* unused */
1320  dstate->basetupsize = 0;
1321  /* Metadata about current pending posting list TIDs */
1322  dstate->htids = NULL;
1323  dstate->nhtids = 0;
1324  dstate->nitems = 0;
1325  dstate->phystupsize = 0; /* unused */
1326  dstate->nintervals = 0; /* unused */
1327 
1328  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1329  true)) != NULL)
1330  {
1331  /* When we see first tuple, create first index page */
1332  if (state == NULL)
1333  {
1334  state = _bt_pagestate(wstate, 0);
1335 
1336  /*
1337  * Limit size of posting list tuples to 1/10 space we want to
1338  * leave behind on the page, plus space for final item's line
1339  * pointer. This is equal to the space that we'd like to
1340  * leave behind on each leaf page when fillfactor is 90,
1341  * allowing us to get close to fillfactor% space utilization
1342  * when there happen to be a great many duplicates. (This
1343  * makes higher leaf fillfactor settings ineffective when
1344  * building indexes that have many duplicates, but packing
1345  * leaf pages full with few very large tuples doesn't seem
1346  * like a useful goal.)
1347  */
1348  dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1349  sizeof(ItemIdData);
1350  Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1351  dstate->maxpostingsize <= INDEX_SIZE_MASK);
1352  dstate->htids = palloc(dstate->maxpostingsize);
1353 
1354  /* start new pending posting list with itup copy */
1355  _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1356  InvalidOffsetNumber);
1357  }
1358  else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1359  itup) > keysz &&
1360  _bt_dedup_save_htid(dstate, itup))
1361  {
1362  /*
1363  * Tuple is equal to base tuple of pending posting list. Heap
1364  * TID from itup has been saved in state.
1365  */
1366  }
1367  else
1368  {
1369  /*
1370  * Tuple is not equal to pending posting list tuple, or
1371  * _bt_dedup_save_htid() opted to not merge current item into
1372  * pending posting list.
1373  */
1374  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1375  pfree(dstate->base);
1376 
1377  /* start new pending posting list with itup copy */
1378  _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1379  InvalidOffsetNumber);
1380  }
1381 
1382  /* Report progress */
1383  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1384  ++tuples_done);
1385  }
1386 
1387  if (state)
1388  {
1389  /*
1390  * Handle the last item (there must be a last item when the
1391  * tuplesort returned one or more tuples)
1392  */
1393  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1394  pfree(dstate->base);
1395  pfree(dstate->htids);
1396  }
1397 
1398  pfree(dstate);
1399  }
1400  else
1401  {
1402  /* merging and deduplication are both unnecessary */
1403  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1404  true)) != NULL)
1405  {
1406  /* When we see first tuple, create first index page */
1407  if (state == NULL)
1408  state = _bt_pagestate(wstate, 0);
1409 
1410  _bt_buildadd(wstate, state, itup, 0);
1411 
1412  /* Report progress */
1413  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1414  ++tuples_done);
1415  }
1416  }
1417 
1418  /* Close down final pages and write the metapage */
1419  _bt_uppershutdown(wstate, state);
1420 
1421  /*
1422  * When we WAL-logged index pages, we must nonetheless fsync index files.
1423  * Since we're building outside shared buffers, a CHECKPOINT occurring
1424  * during the build has no way to flush the previously written data to
1425  * disk (indeed it won't know the index even exists). A crash later on
1426  * would replay WAL from the checkpoint, therefore it wouldn't replay our
1427  * earlier WAL entries. If we do not fsync those pages here, they might
1428  * still not be on disk when the crash occurs.
1429  */
1430  if (wstate->btws_use_wal)
1431  {
1432  RelationOpenSmgr(wstate->index);
1433  smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1434  }
1435 }
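Worked example of the posting-list size cap set in the deduplication branch, assuming the default BLCKSZ of 8192 and 8-byte MAXALIGN: BLCKSZ * 10 / 100 = 819, MAXALIGN_DOWN(819) = 816, and subtracting sizeof(ItemIdData) = 4 leaves a maxpostingsize of 812 bytes, i.e. roughly the free space a leaf page retains at fillfactor 90:

    /* assuming BLCKSZ = 8192 and 8-byte MAXALIGN */
    Size  maxpostingsize = MAXALIGN_DOWN(8192 * 10 / 100) - sizeof(ItemIdData);
    /* = 816 - 4 = 812 bytes */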

◆ _bt_pagestate()

static BTPageState * _bt_pagestate ( BTWriteState *  wstate,
uint32  level 
)
static

Definition at line 696 of file nbtsort.c.

References _bt_blnewpage(), BTGetTargetPageFreeSpace, BTPageState::btps_blkno, BTPageState::btps_full, BTPageState::btps_lastextra, BTPageState::btps_lastoff, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTPageState::btps_page, BTREE_NONLEAF_FILLFACTOR, BTWriteState::btws_pages_alloced, BTWriteState::index, P_HIKEY, and palloc0().

Referenced by _bt_buildadd(), and _bt_load().

697 {
698  BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
699 
700  /* create initial page for level */
701  state->btps_page = _bt_blnewpage(level);
702 
703  /* and assign it a page position */
704  state->btps_blkno = wstate->btws_pages_alloced++;
705 
706  state->btps_lowkey = NULL;
707  /* initialize lastoff so first item goes into P_FIRSTKEY */
708  state->btps_lastoff = P_HIKEY;
709  state->btps_lastextra = 0;
710  state->btps_level = level;
711  /* set "full" threshold based on level. See notes at head of file. */
712  if (level > 0)
713  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
714  else
715  state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
716 
717  /* no parent level, yet */
718  state->btps_next = NULL;
719 
720  return state;
721 }
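As a worked example of the two thresholds, here is a standalone sketch (assuming the stock 8192-byte BLCKSZ, BTREE_NONLEAF_FILLFACTOR of 70, and the default leaf fillfactor of 90 that BTGetTargetPageFreeSpace() falls back to when no reloption is set):

#include <stdio.h>

#define BLCKSZ 8192
#define BTREE_NONLEAF_FILLFACTOR 70
#define LEAF_FILLFACTOR 90		/* default applied by BTGetTargetPageFreeSpace() */

int
main(void)
{
	/* level > 0: keep 30% of the page free on internal levels */
	int		nonleaf_full = BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;

	/* level 0: the leaf fillfactor (reloption or default) decides */
	int		leaf_full = BLCKSZ * (100 - LEAF_FILLFACTOR) / 100;

	printf("nonleaf btps_full = %d bytes\n", nonleaf_full); /* 2457 */
	printf("leaf btps_full = %d bytes\n", leaf_full);		/* 819 */
	return 0;
}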

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment *  seg,
shm_toc *  toc 
)

Definition at line 1787 of file nbtsort.c.

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, debug_query_string, BTSpool::heap, BTShared::heaprelid, BTSpool::index, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, table_close(), table_open(), and tuplesort_attach_shared().

1788 {
1789  char *sharedquery;
1790  BTSpool *btspool;
1791  BTSpool *btspool2;
1792  BTShared *btshared;
1793  Sharedsort *sharedsort;
1794  Sharedsort *sharedsort2;
1795  Relation heapRel;
1796  Relation indexRel;
1797  LOCKMODE heapLockmode;
1798  LOCKMODE indexLockmode;
1799  WalUsage *walusage;
1800  BufferUsage *bufferusage;
1801  int sortmem;
1802 
1803 #ifdef BTREE_BUILD_STATS
1804  if (log_btree_build_stats)
1805  ResetUsage();
1806 #endif /* BTREE_BUILD_STATS */
1807 
1808  /* Set debug_query_string for individual workers first */
1809  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
1810  debug_query_string = sharedquery;
1811 
1812  /* Report the query string from leader */
1813  pgstat_report_activity(STATE_RUNNING, debug_query_string);
1814 
1815  /* Look up nbtree shared state */
1816  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1817 
1818  /* Open relations using lock modes known to be obtained by index.c */
1819  if (!btshared->isconcurrent)
1820  {
1821  heapLockmode = ShareLock;
1822  indexLockmode = AccessExclusiveLock;
1823  }
1824  else
1825  {
1826  heapLockmode = ShareUpdateExclusiveLock;
1827  indexLockmode = RowExclusiveLock;
1828  }
1829 
1830  /* Open relations within worker */
1831  heapRel = table_open(btshared->heaprelid, heapLockmode);
1832  indexRel = index_open(btshared->indexrelid, indexLockmode);
1833 
1834  /* Initialize worker's own spool */
1835  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1836  btspool->heap = heapRel;
1837  btspool->index = indexRel;
1838  btspool->isunique = btshared->isunique;
1839 
1840  /* Look up shared state private to tuplesort.c */
1841  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1842  tuplesort_attach_shared(sharedsort, seg);
1843  if (!btshared->isunique)
1844  {
1845  btspool2 = NULL;
1846  sharedsort2 = NULL;
1847  }
1848  else
1849  {
1850  /* Allocate memory for worker's own private secondary spool */
1851  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1852 
1853  /* Initialize worker's own secondary spool */
1854  btspool2->heap = btspool->heap;
1855  btspool2->index = btspool->index;
1856  btspool2->isunique = false;
1857  /* Look up shared state private to tuplesort.c */
1858  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1859  tuplesort_attach_shared(sharedsort2, seg);
1860  }
1861 
1862  /* Prepare to track buffer usage during parallel execution */
1863  InstrStartParallelQuery();
1864 
1865  /* Perform sorting of spool, and possibly a spool2 */
1866  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1867  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1868  sharedsort2, sortmem, false);
1869 
1870  /* Report WAL/buffer usage during parallel execution */
1871  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1872  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1873  InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1874  &walusage[ParallelWorkerNumber]);
1875 
1876 #ifdef BTREE_BUILD_STATS
1877  if (log_btree_build_stats)
1878  {
1879  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1880  ResetUsage();
1881  }
1882 #endif /* BTREE_BUILD_STATS */
1883 
1884  index_close(indexRel, indexLockmode);
1885  table_close(heapRel, heapLockmode);
1886 }
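Note how sortmem spreads the leader's budget evenly across all participant tuplesorts. A standalone worked example (65536 kB is the stock maintenance_work_mem default; the participant count is illustrative):

#include <stdio.h>

int
main(void)
{
	int		maintenance_work_mem = 65536;	/* in kB; server default */
	int		scantuplesortstates = 4;		/* leader + 3 workers, say */
	int		sortmem = maintenance_work_mem / scantuplesortstates;

	printf("per-participant sortmem = %d kB\n", sortmem);	/* 16384 */
	return 0;
}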

◆ _bt_parallel_estimate_shared()

static Size _bt_parallel_estimate_shared ( Relation  heap,
Snapshot  snapshot 
)
static

Definition at line 1681 of file nbtsort.c.

References add_size(), BUFFERALIGN, and table_parallelscan_estimate().

Referenced by _bt_begin_parallel().

1682 {
1683  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1684  return add_size(BUFFERALIGN(sizeof(BTShared)),
1685  table_parallelscan_estimate(heap, snapshot));
1686 }
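To see what the BUFFERALIGN round-up contributes to the estimate, a standalone sketch (TYPEALIGN follows the c.h definition; the 32-byte buffer alignment and both sizes are assumptions for illustration only):

#include <stdio.h>
#include <stdint.h>

#define ALIGNOF_BUFFER 32		/* platform-configured in PostgreSQL */
#define TYPEALIGN(ALIGNVAL, LEN) \
	(((uintptr_t) (LEN) + ((ALIGNVAL) - 1)) & ~((uintptr_t) ((ALIGNVAL) - 1)))
#define BUFFERALIGN(LEN) TYPEALIGN(ALIGNOF_BUFFER, (LEN))

int
main(void)
{
	size_t		btshared_size = 200;	/* stand-in for sizeof(BTShared) */
	size_t		scan_size = 120;		/* stand-in for the scan estimate */

	/* 200 rounds up to 224, so the segment needs 344 bytes */
	printf("estimate = %zu\n", (size_t) BUFFERALIGN(btshared_size) + scan_size);
	return 0;
}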

◆ _bt_parallel_heapscan()

static double _bt_parallel_heapscan ( BTBuildState *  buildstate,
bool *  brokenhotchain 
)
static

Definition at line 1701 of file nbtsort.c.

References BTShared::brokenhotchain, BTBuildState::btleader, BTLeader::btshared, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTShared::havedead, BTBuildState::havedead, BTShared::indtuples, BTBuildState::indtuples, BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, BTShared::reltuples, SpinLockAcquire, SpinLockRelease, WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

1702 {
1703  BTShared *btshared = buildstate->btleader->btshared;
1704  int nparticipanttuplesorts;
1705  double reltuples;
1706 
1707  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1708  for (;;)
1709  {
1710  SpinLockAcquire(&btshared->mutex);
1711  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1712  {
1713  buildstate->havedead = btshared->havedead;
1714  buildstate->indtuples = btshared->indtuples;
1715  *brokenhotchain = btshared->brokenhotchain;
1716  reltuples = btshared->reltuples;
1717  SpinLockRelease(&btshared->mutex);
1718  break;
1719  }
1720  SpinLockRelease(&btshared->mutex);
1721 
1722  ConditionVariableSleep(&btshared->workersdonecv,
1723  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1724  }
1725 
1726  ConditionVariableCancelSleep();
1727 
1728  return reltuples;
1729 }
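This loop is the leader's half of a spinlock-plus-condition-variable handshake. The worker half, shown in full in _bt_parallel_scan_and_sort() below, publishes its results under the same mutex and then signals the leader; condensed here for reference:

/* worker side: publish results under the mutex, then wake the leader */
SpinLockAcquire(&btshared->mutex);
btshared->nparticipantsdone++;
btshared->reltuples += reltuples;
SpinLockRelease(&btshared->mutex);

ConditionVariableSignal(&btshared->workersdonecv);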

◆ _bt_parallel_scan_and_sort()

static void _bt_parallel_scan_and_sort ( BTSpool *  btspool,
BTSpool *  btspool2,
BTShared *  btshared,
Sharedsort *  sharedsort,
Sharedsort *  sharedsort2,
int  sortmem,
bool  progress 
)
static

Definition at line 1901 of file nbtsort.c.

References _bt_build_callback(), BTShared::brokenhotchain, BTBuildState::btleader, BuildIndexInfo(), ConditionVariableSignal(), BTShared::havedead, BTBuildState::havedead, BTSpool::heap, BTBuildState::heap, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, BTSpool::index, BTShared::indtuples, BTBuildState::indtuples, BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, BTBuildState::isunique, SortCoordinateData::isWorker, Min, BTShared::mutex, SortCoordinateData::nParticipants, BTShared::nparticipantsdone, palloc0(), ParallelTableScanFromBTShared, BTShared::reltuples, SortCoordinateData::sharedsort, BTSpool::sortstate, SpinLockAcquire, SpinLockRelease, BTBuildState::spool, BTBuildState::spool2, table_beginscan_parallel(), table_index_build_scan(), tuplesort_begin_index_btree(), tuplesort_end(), tuplesort_performsort(), work_mem, and BTShared::workersdonecv.

Referenced by _bt_leader_participate_as_worker(), and _bt_parallel_build_main().

1904 {
1905  SortCoordinate coordinate;
1906  BTBuildState buildstate;
1907  TableScanDesc scan;
1908  double reltuples;
1909  IndexInfo *indexInfo;
1910 
1911  /* Initialize local tuplesort coordination state */
1912  coordinate = palloc0(sizeof(SortCoordinateData));
1913  coordinate->isWorker = true;
1914  coordinate->nParticipants = -1;
1915  coordinate->sharedsort = sharedsort;
1916 
1917  /* Begin "partial" tuplesort */
1918  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1919  btspool->index,
1920  btspool->isunique,
1921  sortmem, coordinate,
1922  false);
1923 
1924  /*
1925  * Just as with serial case, there may be a second spool. If so, a
1926  * second, dedicated spool2 partial tuplesort is required.
1927  */
1928  if (btspool2)
1929  {
1930  SortCoordinate coordinate2;
1931 
1932  /*
1933  * We expect that the second one (for dead tuples) won't get very
1934  * full, so we give it only work_mem (unless sortmem is less for
1935  * worker). Worker processes are generally permitted to allocate
1936  * work_mem independently.
1937  */
1938  coordinate2 = palloc0(sizeof(SortCoordinateData));
1939  coordinate2->isWorker = true;
1940  coordinate2->nParticipants = -1;
1941  coordinate2->sharedsort = sharedsort2;
1942  btspool2->sortstate =
1943  tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1944  Min(sortmem, work_mem), coordinate2,
1945  false);
1946  }
1947 
1948  /* Fill in buildstate for _bt_build_callback() */
1949  buildstate.isunique = btshared->isunique;
1950  buildstate.havedead = false;
1951  buildstate.heap = btspool->heap;
1952  buildstate.spool = btspool;
1953  buildstate.spool2 = btspool2;
1954  buildstate.indtuples = 0;
1955  buildstate.btleader = NULL;
1956 
1957  /* Join parallel scan */
1958  indexInfo = BuildIndexInfo(btspool->index);
1959  indexInfo->ii_Concurrent = btshared->isconcurrent;
1960  scan = table_beginscan_parallel(btspool->heap,
1961  ParallelTableScanFromBTShared(btshared));
1962  reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1963  true, progress, _bt_build_callback,
1964  (void *) &buildstate, scan);
1965 
1966  /*
1967  * Execute this worker's part of the sort.
1968  *
1969  * Unlike leader and serial cases, we cannot avoid calling
1970  * tuplesort_performsort() for spool2 if it ends up containing no dead
1971  * tuples (this is disallowed for workers by tuplesort).
1972  */
1973  tuplesort_performsort(btspool->sortstate);
1974  if (btspool2)
1975  tuplesort_performsort(btspool2->sortstate);
1976 
1977  /*
1978  * Done. Record ambuild statistics, and whether we encountered a broken
1979  * HOT chain.
1980  */
1981  SpinLockAcquire(&btshared->mutex);
1982  btshared->nparticipantsdone++;
1983  btshared->reltuples += reltuples;
1984  if (buildstate.havedead)
1985  btshared->havedead = true;
1986  btshared->indtuples += buildstate.indtuples;
1987  if (indexInfo->ii_BrokenHotChain)
1988  btshared->brokenhotchain = true;
1989  SpinLockRelease(&btshared->mutex);
1990 
1991  /* Notify leader */
1992  ConditionVariableSignal(&btshared->workersdonecv);
1993 
1994  /* We can end tuplesorts immediately */
1995  tuplesort_end(btspool->sortstate);
1996  if (btspool2)
1997  tuplesort_end(btspool2->sortstate);
1998 }
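A hedged side-by-side of the two SortCoordinate configurations used in this file (the worker values appear in the code above; the leader values come from _bt_spools_heapscan(); nworkers_launched stands in for the actual participant count):

SortCoordinate coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));

/* worker: contributes sorted runs, does not know the head count */
coordinate->isWorker = true;
coordinate->nParticipants = -1;
coordinate->sharedsort = sharedsort;

/*
 * leader (cf. _bt_spools_heapscan):
 *		coordinate->isWorker = false;
 *		coordinate->nParticipants = nworkers_launched;
 */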

◆ _bt_slideleft()

static void _bt_slideleft ( Page  rightmostpage)
static

Definition at line 733 of file nbtsort.c.

References Assert, OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetItemId, and PageGetMaxOffsetNumber.

Referenced by _bt_uppershutdown().

734 {
735  OffsetNumber off;
736  OffsetNumber maxoff;
737  ItemId previi;
738 
739  maxoff = PageGetMaxOffsetNumber(rightmostpage);
740  Assert(maxoff >= P_FIRSTKEY);
741  previi = PageGetItemId(rightmostpage, P_HIKEY);
742  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
743  {
744  ItemId thisii = PageGetItemId(rightmostpage, off);
745 
746  *previi = *thisii;
747  previi = thisii;
748  }
749  ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
750 }
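The slide can be pictured with a plain array standing in for the ItemId array: the rightmost page needs no high key, so the slot reserved at P_HIKEY is reclaimed by moving every line pointer one position left (a standalone illustration, not PostgreSQL code):

#include <stdio.h>

int
main(void)
{
	/* slot 0 plays P_HIKEY; 11..44 are the data items */
	int		items[] = {0, 11, 22, 33, 44};
	int		n = 5;

	for (int off = 1; off < n; off++)	/* P_FIRSTKEY .. maxoff */
		items[off - 1] = items[off];
	n--;								/* pd_lower -= sizeof(ItemIdData) */

	for (int off = 0; off < n; off++)
		printf("%d ", items[off]);		/* prints: 11 22 33 44 */
	printf("\n");
	return 0;
}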

◆ _bt_sort_dedup_finish_pending()

static void _bt_sort_dedup_finish_pending ( BTWriteState *  wstate,
BTPageState *  state,
BTDedupState  dstate 
)
static

Definition at line 1075 of file nbtsort.c.

References _bt_buildadd(), _bt_form_posting(), Assert, BTDedupStateData::base, BTreeTupleGetPostingOffset(), BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::nhtids, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, pfree(), and BTDedupStateData::phystupsize.

Referenced by _bt_load().

1077 {
1078  Assert(dstate->nitems > 0);
1079 
1080  if (dstate->nitems == 1)
1081  _bt_buildadd(wstate, state, dstate->base, 0);
1082  else
1083  {
1084  IndexTuple postingtuple;
1085  Size truncextra;
1086 
1087  /* form a tuple with a posting list */
1088  postingtuple = _bt_form_posting(dstate->base,
1089  dstate->htids,
1090  dstate->nhtids);
1091  /* Calculate posting list overhead */
1092  truncextra = IndexTupleSize(postingtuple) -
1093  BTreeTupleGetPostingOffset(postingtuple);
1094 
1095  _bt_buildadd(wstate, state, postingtuple, truncextra);
1096  pfree(postingtuple);
1097  }
1098 
1099  dstate->nmaxitems = 0;
1100  dstate->nhtids = 0;
1101  dstate->nitems = 0;
1102  dstate->phystupsize = 0;
1103 }
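The truncextra computed here is just the byte size of the posting list, i.e. the part a pivot tuple later sheds. A standalone worked example (the 16-byte posting offset is a hypothetical header-plus-key size; heap TIDs are 6 bytes each):

#include <stdio.h>

int
main(void)
{
	int		nhtids = 3;				/* TIDs merged into the posting list */
	int		itemptr_size = 6;		/* sizeof(ItemPointerData) */
	int		posting_offset = 16;	/* hypothetical key + header bytes */
	int		tuple_size = posting_offset + nhtids * itemptr_size;

	/* posting list overhead: 34 - 16 = 18 bytes */
	printf("truncextra = %d bytes\n", tuple_size - posting_offset);
	return 0;
}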

◆ _bt_sortaddtup()

static void _bt_sortaddtup ( Page  page,
Size  itemsize,
IndexTuple  itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
static

Definition at line 764 of file nbtsort.c.

References BTreeTupleSetNAtts(), elog, ERROR, InvalidOffsetNumber, PageAddItem, and IndexTupleData::t_info.

Referenced by _bt_buildadd().

769 {
770  IndexTupleData trunctuple;
771 
772  if (newfirstdataitem)
773  {
774  trunctuple = *itup;
775  trunctuple.t_info = sizeof(IndexTupleData);
776  BTreeTupleSetNAtts(&trunctuple, 0, false);
777  itup = &trunctuple;
778  itemsize = sizeof(IndexTupleData);
779  }
780 
781  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
782  false, false) == InvalidOffsetNumber)
783  elog(ERROR, "failed to add item to the index page");
784 }

◆ _bt_spool()

static void _bt_spool ( BTSpool *  btspool,
ItemPointer  self,
Datum *  values,
bool *  isnull 
)
static

Definition at line 528 of file nbtsort.c.

References BTSpool::index, BTSpool::sortstate, and tuplesort_putindextuplevalues().

Referenced by _bt_build_callback().

529 {
530  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
531  self, values, isnull);
532 }

◆ _bt_spooldestroy()

static void _bt_spooldestroy ( BTSpool *  btspool)
static

Definition at line 518 of file nbtsort.c.

References pfree(), BTSpool::sortstate, and tuplesort_end().

Referenced by _bt_spools_heapscan(), and btbuild().

519 {
520  tuplesort_end(btspool->sortstate);
521  pfree(btspool);
522 }

◆ _bt_spools_heapscan()

static double _bt_spools_heapscan ( Relation  heap,
Relation  index,
BTBuildState *  buildstate,
IndexInfo *  indexInfo 
)
static

Definition at line 368 of file nbtsort.c.

References _bt_begin_parallel(), _bt_build_callback(), _bt_parallel_heapscan(), _bt_spooldestroy(), BTBuildState::btleader, BTSpool::heap, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, IndexInfo::ii_ParallelWorkers, IndexInfo::ii_Unique, BTSpool::index, BTSpool::isunique, BTBuildState::isunique, SortCoordinateData::isWorker, maintenance_work_mem, SortCoordinateData::nParticipants, BTLeader::nparticipanttuplesorts, palloc0(), pgstat_progress_update_multi_param(), pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_CREATEIDX_TUPLES_TOTAL, PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, SortCoordinateData::sharedsort, BTLeader::sharedsort, BTLeader::sharedsort2, BTSpool::sortstate, BTBuildState::spool, BTBuildState::spool2, table_index_build_scan(), tuplesort_begin_index_btree(), val, and work_mem.

Referenced by btbuild().

370 {
371  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
372  SortCoordinate coordinate = NULL;
373  double reltuples = 0;
374 
375  /*
376  * We size the sort area as maintenance_work_mem rather than work_mem to
377  * speed index creation. This should be OK since a single backend can't
378  * run multiple index creations in parallel (see also: notes on
379  * parallelism and maintenance_work_mem below).
380  */
381  btspool->heap = heap;
382  btspool->index = index;
383  btspool->isunique = indexInfo->ii_Unique;
384 
385  /* Save as primary spool */
386  buildstate->spool = btspool;
387 
388  /* Report table scan phase started */
389  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
390  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
391 
392  /* Attempt to launch parallel worker scan when required */
393  if (indexInfo->ii_ParallelWorkers > 0)
394  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
395  indexInfo->ii_ParallelWorkers);
396 
397  /*
398  * If parallel build requested and at least one worker process was
399  * successfully launched, set up coordination state
400  */
401  if (buildstate->btleader)
402  {
403  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
404  coordinate->isWorker = false;
405  coordinate->nParticipants =
406  buildstate->btleader->nparticipanttuplesorts;
407  coordinate->sharedsort = buildstate->btleader->sharedsort;
408  }
409 
410  /*
411  * Begin serial/leader tuplesort.
412  *
413  * In cases where parallelism is involved, the leader receives the same
414  * share of maintenance_work_mem as a serial sort (it is generally treated
415  * in the same way as a serial sort once we return). Parallel worker
416  * Tuplesortstates will have received only a fraction of
417  * maintenance_work_mem, though.
418  *
419  * We rely on the lifetime of the Leader Tuplesortstate almost not
420  * overlapping with any worker Tuplesortstate's lifetime. There may be
421  * some small overlap, but that's okay because we rely on leader
422  * Tuplesortstate only allocating a small, fixed amount of memory here.
423  * When its tuplesort_performsort() is called (by our caller), and
424  * significant amounts of memory are likely to be used, all workers must
425  * have already freed almost all memory held by their Tuplesortstates
426  * (they are about to go away completely, too). The overall effect is
427  * that maintenance_work_mem always represents an absolute high watermark
428  * on the amount of memory used by a CREATE INDEX operation, regardless of
429  * the use of parallelism or any other factor.
430  */
431  buildstate->spool->sortstate =
432  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
433  maintenance_work_mem, coordinate,
434  false);
435 
436  /*
437  * If building a unique index, put dead tuples in a second spool to keep
438  * them out of the uniqueness check. We expect that the second spool (for
439  * dead tuples) won't get very full, so we give it only work_mem.
440  */
441  if (indexInfo->ii_Unique)
442  {
443  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
444  SortCoordinate coordinate2 = NULL;
445 
446  /* Initialize secondary spool */
447  btspool2->heap = heap;
448  btspool2->index = index;
449  btspool2->isunique = false;
450  /* Save as secondary spool */
451  buildstate->spool2 = btspool2;
452 
453  if (buildstate->btleader)
454  {
455  /*
456  * Set up non-private state that is passed to
457  * tuplesort_begin_index_btree() about the basic high level
458  * coordination of a parallel sort.
459  */
460  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
461  coordinate2->isWorker = false;
462  coordinate2->nParticipants =
463  buildstate->btleader->nparticipanttuplesorts;
464  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
465  }
466 
467  /*
468  * We expect that the second one (for dead tuples) won't get very
469  * full, so we give it only work_mem
470  */
471  buildstate->spool2->sortstate =
472  tuplesort_begin_index_btree(heap, index, false, work_mem,
473  coordinate2, false);
474  }
475 
476  /* Fill spool using either serial or parallel heap scan */
477  if (!buildstate->btleader)
478  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
479  _bt_build_callback, (void *) buildstate,
480  NULL);
481  else
482  reltuples = _bt_parallel_heapscan(buildstate,
483  &indexInfo->ii_BrokenHotChain);
484 
485  /*
486  * Set the progress target for the next phase. Reset the block number
487  * values set by table_index_build_scan
488  */
489  {
490  const int index[] = {
491  PROGRESS_CREATEIDX_TUPLES_TOTAL,
492  PROGRESS_SCAN_BLOCKS_TOTAL,
493  PROGRESS_SCAN_BLOCKS_DONE
494  };
495  const int64 val[] = {
496  buildstate->indtuples,
497  0, 0
498  };
499 
500  pgstat_progress_update_multi_param(3, index, val);
501  }
502 
503  /* okay, all heap tuples are spooled */
504  if (buildstate->spool2 && !buildstate->havedead)
505  {
506  /* spool2 turns out to be unnecessary */
507  _bt_spooldestroy(buildstate->spool2);
508  buildstate->spool2 = NULL;
509  }
510 
511  return reltuples;
512 }

◆ _bt_uppershutdown()

static void _bt_uppershutdown ( BTWriteState *  wstate,
BTPageState *  state 
)
static

Definition at line 1109 of file nbtsort.c.

References _bt_blwritepage(), _bt_buildadd(), _bt_initmetapage(), _bt_slideleft(), BTScanInsertData::allequalimage, Assert, BTP_ROOT, BTPageOpaqueData::btpo_flags, BTPageState::btps_blkno, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTPageState::btps_page, BTREE_METAPAGE, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTWriteState::index, IndexRelationGetNumberOfKeyAttributes, BTWriteState::inskey, P_LEFTMOST, P_NONE, PageGetSpecialPointer, palloc(), and pfree().

Referenced by _bt_load().

1110 {
1111  BTPageState *s;
1112  BlockNumber rootblkno = P_NONE;
1113  uint32 rootlevel = 0;
1114  Page metapage;
1115 
1116  /*
1117  * Each iteration of this loop completes one more level of the tree.
1118  */
1119  for (s = state; s != NULL; s = s->btps_next)
1120  {
1121  BlockNumber blkno;
1122  BTPageOpaque opaque;
1123 
1124  blkno = s->btps_blkno;
1125  opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1126 
1127  /*
1128  * We have to link the last page on this level to somewhere.
1129  *
1130  * If we're at the top, it's the root, so attach it to the metapage.
1131  * Otherwise, add an entry for it to its parent using its low key.
1132  * This may cause the last page of the parent level to split, but
1133  * that's not a problem -- we haven't gotten to it yet.
1134  */
1135  if (s->btps_next == NULL)
1136  {
1137  opaque->btpo_flags |= BTP_ROOT;
1138  rootblkno = blkno;
1139  rootlevel = s->btps_level;
1140  }
1141  else
1142  {
1143  Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1144  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1145  BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1146  P_LEFTMOST(opaque));
1147  Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1148  !P_LEFTMOST(opaque));
1149  BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1150  _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1151  pfree(s->btps_lowkey);
1152  s->btps_lowkey = NULL;
1153  }
1154 
1155  /*
1156  * This is the rightmost page, so the ItemId array needs to be slid
1157  * back one slot. Then we can dump out the page.
1158  */
1159  _bt_slideleft(s->btps_page);
1160  _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1161  s->btps_page = NULL; /* writepage freed the workspace */
1162  }
1163 
1164  /*
1165  * As the last step in the process, construct the metapage and make it
1166  * point to the new root (unless we had no data at all, in which case it's
1167  * set to point to "P_NONE"). This changes the index to the "valid" state
1168  * by filling in a valid magic number in the metapage.
1169  */
1170  metapage = (Page) palloc(BLCKSZ);
1171  _bt_initmetapage(metapage, rootblkno, rootlevel,
1172  wstate->inskey->allequalimage);
1173  _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1174 }

◆ btbuild()

IndexBuildResult* btbuild ( Relation  heap,
Relation  index,
IndexInfo *  indexInfo 
)

Definition at line 299 of file nbtsort.c.

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), BTBuildState::btleader, elog, ERROR, BTBuildState::havedead, BTSpool::heap, BTBuildState::heap, IndexBuildResult::heap_tuples, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, BTBuildState::indtuples, BTBuildState::isunique, log_btree_build_stats, palloc(), RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

Referenced by bthandler().

300 {
301  IndexBuildResult *result;
302  BTBuildState buildstate;
303  double reltuples;
304 
305 #ifdef BTREE_BUILD_STATS
306  if (log_btree_build_stats)
307  ResetUsage();
308 #endif /* BTREE_BUILD_STATS */
309 
310  buildstate.isunique = indexInfo->ii_Unique;
311  buildstate.havedead = false;
312  buildstate.heap = heap;
313  buildstate.spool = NULL;
314  buildstate.spool2 = NULL;
315  buildstate.indtuples = 0;
316  buildstate.btleader = NULL;
317 
318  /*
319  * We expect to be called exactly once for any index relation. If that's
320  * not the case, big trouble's what we have.
321  */
322  if (RelationGetNumberOfBlocks(index) != 0)
323  elog(ERROR, "index \"%s\" already contains data",
324  RelationGetRelationName(index));
325 
326  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
327 
328  /*
329  * Finish the build by (1) completing the sort of the spool file, (2)
330  * inserting the sorted tuples into btree pages and (3) building the upper
331  * levels. Finally, it may also be necessary to end use of parallelism.
332  */
333  _bt_leafbuild(buildstate.spool, buildstate.spool2);
334  _bt_spooldestroy(buildstate.spool);
335  if (buildstate.spool2)
336  _bt_spooldestroy(buildstate.spool2);
337  if (buildstate.btleader)
338  _bt_end_parallel(buildstate.btleader);
339 
340  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
341 
342  result->heap_tuples = reltuples;
343  result->index_tuples = buildstate.indtuples;
344 
345 #ifdef BTREE_BUILD_STATS
346  if (log_btree_build_stats)
347  {
348  ShowUsage("BTREE BUILD STATS");
349  ResetUsage();
350  }
351 #endif /* BTREE_BUILD_STATS */
352 
353  return result;
354 }