PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtsort.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/proc.h"
#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/sortsupport.h"
#include "utils/tuplesort.h"
#include "utils/wait_event.h"
Include dependency graph for nbtsort.c:

Go to the source code of this file.

Data Structures

struct  BTSpool
 
struct  BTShared
 
struct  BTLeader
 
struct  BTBuildState
 
struct  BTPageState
 
struct  BTWriteState
 

Macros

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)
 
#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)
 
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
 
#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)
 
#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)
 
#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)
 
#define ParallelTableScanFromBTShared(shared)    (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 

Typedefs

typedef struct BTSpool BTSpool
 
typedef struct BTShared BTShared
 
typedef struct BTLeader BTLeader
 
typedef struct BTBuildState BTBuildState
 
typedef struct BTPageState BTPageState
 
typedef struct BTWriteState BTWriteState
 

Functions

static double _bt_spools_heapscan (Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
 
static void _bt_spooldestroy (BTSpool *btspool)
 
static void _bt_spool (BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
 
static void _bt_leafbuild (BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_build_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 
static BulkWriteBuffer _bt_blnewpage (BTWriteState *wstate, uint32 level)
 
static BTPageState_bt_pagestate (BTWriteState *wstate, uint32 level)
 
static void _bt_slideleft (Page rightmostpage)
 
static void _bt_sortaddtup (Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_buildadd (BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
 
static void _bt_sort_dedup_finish_pending (BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
 
static void _bt_uppershutdown (BTWriteState *wstate, BTPageState *state)
 
static void _bt_load (BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_begin_parallel (BTBuildState *buildstate, bool isconcurrent, int request)
 
static void _bt_end_parallel (BTLeader *btleader)
 
static Size _bt_parallel_estimate_shared (Relation heap, Snapshot snapshot)
 
static double _bt_parallel_heapscan (BTBuildState *buildstate, bool *brokenhotchain)
 
static void _bt_leader_participate_as_worker (BTBuildState *buildstate)
 
static void _bt_parallel_scan_and_sort (BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
 
IndexBuildResultbtbuild (Relation heap, Relation index, IndexInfo *indexInfo)
 
static void _bt_blwritepage (BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ PARALLEL_KEY_BTREE_SHARED

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)

Definition at line 65 of file nbtsort.c.

◆ PARALLEL_KEY_BUFFER_USAGE

#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)

Definition at line 70 of file nbtsort.c.

◆ PARALLEL_KEY_QUERY_TEXT

#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)

Definition at line 68 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT

#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)

Definition at line 66 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT_SPOOL2

#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)

Definition at line 67 of file nbtsort.c.

◆ PARALLEL_KEY_WAL_USAGE

#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)

Definition at line 69 of file nbtsort.c.

◆ ParallelTableScanFromBTShared

#define ParallelTableScanFromBTShared (   shared)     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))

Definition at line 165 of file nbtsort.c.

171{
172 /* parallel context itself */
173 ParallelContext *pcxt;
174
175 /*
176 * nparticipanttuplesorts is the exact number of worker processes
177 * successfully launched, plus one leader process if it participates as a
178 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
179 * participating as a worker).
180 */
181 int nparticipanttuplesorts;
182
183 /*
184 * Leader process convenience pointers to shared state (leader avoids TOC
185 * lookups).
186 *
187 * btshared is the shared state for entire build. sharedsort is the
188 * shared, tuplesort-managed state passed to each process tuplesort.
189 * sharedsort2 is the corresponding btspool2 shared state, used only when
190 * building unique indexes. snapshot is the snapshot used by the scan iff
191 * an MVCC snapshot is required.
192 */
193 BTShared *btshared;
194 Sharedsort *sharedsort;
195 Sharedsort *sharedsort2;
196 Snapshot snapshot;
197 WalUsage *walusage;
198 BufferUsage *bufferusage;
199} BTLeader;
200
201/*
202 * Working state for btbuild and its callback.
203 *
204 * When parallel CREATE INDEX is used, there is a BTBuildState for each
205 * participant.
206 */
207typedef struct BTBuildState
208{
209 bool isunique;
211 bool havedead;
213 BTSpool *spool;
214
215 /*
216 * spool2 is needed only when the index is a unique index. Dead tuples are
217 * put into spool2 instead of spool in order to avoid uniqueness check.
218 */
220 double indtuples;
221
222 /*
223 * btleader is only present when a parallel index build is performed, and
224 * only in the leader process. (Actually, only the leader has a
225 * BTBuildState. Workers have their own spool and spool2, though.)
226 */
229
230/*
231 * Status record for a btree page being built. We have one of these
232 * for each active tree level.
233 */
234typedef struct BTPageState
235{
236 BulkWriteBuffer btps_buf; /* workspace for page building */
237 BlockNumber btps_blkno; /* block # to write this page at */
238 IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
239 OffsetNumber btps_lastoff; /* last item offset loaded */
240 Size btps_lastextra; /* last item's extra posting list space */
241 uint32 btps_level; /* tree level (0 = leaf) */
242 Size btps_full; /* "full" if less than this much free space */
243 struct BTPageState *btps_next; /* link to parent level, if any */
245
246/*
247 * Overall status record for index writing phase.
248 */
249typedef struct BTWriteState
250{
254 BTScanInsert inskey; /* generic insertion scankey */
255 BlockNumber btws_pages_alloced; /* # pages allocated */
257
258
259static double _bt_spools_heapscan(Relation heap, Relation index,
260 BTBuildState *buildstate, IndexInfo *indexInfo);
261static void _bt_spooldestroy(BTSpool *btspool);
262static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
263 const Datum *values, const bool *isnull);
266 bool *isnull, bool tupleIsAlive, void *state);
269static void _bt_slideleft(Page rightmostpage);
270static void _bt_sortaddtup(Page page, Size itemsize,
272 bool newfirstdataitem);
279static void _bt_load(BTWriteState *wstate,
281static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
282 int request);
283static void _bt_end_parallel(BTLeader *btleader);
286 bool *brokenhotchain);
289 BTShared *btshared, Sharedsort *sharedsort,
290 Sharedsort *sharedsort2, int sortmem,
291 bool progress);
292
293
294/*
295 * btbuild() -- build a new btree index.
296 */
298btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
299{
300 IndexBuildResult *result;
302 double reltuples;
303
304#ifdef BTREE_BUILD_STATS
306 ResetUsage();
307#endif /* BTREE_BUILD_STATS */
308
309 buildstate.isunique = indexInfo->ii_Unique;
310 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
311 buildstate.havedead = false;
312 buildstate.heap = heap;
313 buildstate.spool = NULL;
314 buildstate.spool2 = NULL;
315 buildstate.indtuples = 0;
316 buildstate.btleader = NULL;
317
318 /*
319 * We expect to be called exactly once for any index relation. If that's
320 * not the case, big trouble's what we have.
321 */
323 elog(ERROR, "index \"%s\" already contains data",
325
326 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
327
328 /*
329 * Finish the build by (1) completing the sort of the spool file, (2)
330 * inserting the sorted tuples into btree pages and (3) building the upper
331 * levels. Finally, it may also be necessary to end use of parallelism.
332 */
333 _bt_leafbuild(buildstate.spool, buildstate.spool2);
335 if (buildstate.spool2)
337 if (buildstate.btleader)
339
341
342 result->heap_tuples = reltuples;
343 result->index_tuples = buildstate.indtuples;
344
345#ifdef BTREE_BUILD_STATS
347 {
348 ShowUsage("BTREE BUILD STATS");
349 ResetUsage();
350 }
351#endif /* BTREE_BUILD_STATS */
352
353 return result;
354}
355
356/*
357 * Create and initialize one or two spool structures, and save them in caller's
358 * buildstate argument. May also fill-in fields within indexInfo used by index
359 * builds.
360 *
361 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
362 * routine encapsulates all aspects of managing parallelism. Caller need only
363 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
364 *
365 * Returns the total number of heap tuples scanned.
366 */
367static double
369 IndexInfo *indexInfo)
370{
373 double reltuples = 0;
374
375 /*
376 * We size the sort area as maintenance_work_mem rather than work_mem to
377 * speed index creation. This should be OK since a single backend can't
378 * run multiple index creations in parallel (see also: notes on
379 * parallelism and maintenance_work_mem below).
380 */
381 btspool->heap = heap;
382 btspool->index = index;
383 btspool->isunique = indexInfo->ii_Unique;
384 btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
385
386 /* Save as primary spool */
387 buildstate->spool = btspool;
388
389 /* Report table scan phase started */
392
393 /* Attempt to launch parallel worker scan when required */
394 if (indexInfo->ii_ParallelWorkers > 0)
396 indexInfo->ii_ParallelWorkers);
397
398 /*
399 * If parallel build requested and at least one worker process was
400 * successfully launched, set up coordination state
401 */
402 if (buildstate->btleader)
403 {
405 coordinate->isWorker = false;
406 coordinate->nParticipants =
407 buildstate->btleader->nparticipanttuplesorts;
408 coordinate->sharedsort = buildstate->btleader->sharedsort;
409 }
410
411 /*
412 * Begin serial/leader tuplesort.
413 *
414 * In cases where parallelism is involved, the leader receives the same
415 * share of maintenance_work_mem as a serial sort (it is generally treated
416 * in the same way as a serial sort once we return). Parallel worker
417 * Tuplesortstates will have received only a fraction of
418 * maintenance_work_mem, though.
419 *
420 * We rely on the lifetime of the Leader Tuplesortstate almost not
421 * overlapping with any worker Tuplesortstate's lifetime. There may be
422 * some small overlap, but that's okay because we rely on leader
423 * Tuplesortstate only allocating a small, fixed amount of memory here.
424 * When its tuplesort_performsort() is called (by our caller), and
425 * significant amounts of memory are likely to be used, all workers must
426 * have already freed almost all memory held by their Tuplesortstates
427 * (they are about to go away completely, too). The overall effect is
428 * that maintenance_work_mem always represents an absolute high watermark
429 * on the amount of memory used by a CREATE INDEX operation, regardless of
430 * the use of parallelism or any other factor.
431 */
432 buildstate->spool->sortstate =
434 buildstate->nulls_not_distinct,
437
438 /*
439 * If building a unique index, put dead tuples in a second spool to keep
440 * them out of the uniqueness check. We expect that the second spool (for
441 * dead tuples) won't get very full, so we give it only work_mem.
442 */
443 if (indexInfo->ii_Unique)
444 {
447
448 /* Initialize secondary spool */
449 btspool2->heap = heap;
450 btspool2->index = index;
451 btspool2->isunique = false;
452 /* Save as secondary spool */
453 buildstate->spool2 = btspool2;
454
455 if (buildstate->btleader)
456 {
457 /*
458 * Set up non-private state that is passed to
459 * tuplesort_begin_index_btree() about the basic high level
460 * coordination of a parallel sort.
461 */
463 coordinate2->isWorker = false;
464 coordinate2->nParticipants =
465 buildstate->btleader->nparticipanttuplesorts;
466 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
467 }
468
469 /*
470 * We expect that the second one (for dead tuples) won't get very
471 * full, so we give it only work_mem
472 */
473 buildstate->spool2->sortstate =
474 tuplesort_begin_index_btree(heap, index, false, false, work_mem,
476 }
477
478 /* Fill spool using either serial or parallel heap scan */
479 if (!buildstate->btleader)
480 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
482 NULL);
483 else
485 &indexInfo->ii_BrokenHotChain);
486
487 /*
488 * Set the progress target for the next phase. Reset the block number
489 * values set by table_index_build_scan
490 */
491 {
492 const int progress_index[] = {
496 };
497 const int64 progress_vals[] = {
498 buildstate->indtuples,
499 0, 0
500 };
501
503 }
504
505 /* okay, all heap tuples are spooled */
506 if (buildstate->spool2 && !buildstate->havedead)
507 {
508 /* spool2 turns out to be unnecessary */
510 buildstate->spool2 = NULL;
511 }
512
513 return reltuples;
514}
515
516/*
517 * clean up a spool structure and its substructures.
518 */
519static void
521{
522 tuplesort_end(btspool->sortstate);
523 pfree(btspool);
524}
525
526/*
527 * spool an index entry into the sort file.
528 */
529static void
530_bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
531{
533 self, values, isnull);
534}
535
536/*
537 * given a spool loaded by successive calls to _bt_spool,
538 * create an entire btree.
539 */
540static void
542{
544
545#ifdef BTREE_BUILD_STATS
547 {
548 ShowUsage("BTREE BUILD (Spool) STATISTICS");
549 ResetUsage();
550 }
551#endif /* BTREE_BUILD_STATS */
552
553 /* Execute the sort */
556 tuplesort_performsort(btspool->sortstate);
557 if (btspool2)
558 {
562 }
563
564 wstate.heap = btspool->heap;
565 wstate.index = btspool->index;
566 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
567 /* _bt_mkscankey() won't set allequalimage without metapage */
568 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
569
570 /* reserve the metapage */
571 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
572
576}
577
578/*
579 * Per-tuple callback for table_index_build_scan
580 */
581static void
583 ItemPointer tid,
584 Datum *values,
585 bool *isnull,
586 bool tupleIsAlive,
587 void *state)
588{
590
591 /*
592 * insert the index tuple into the appropriate spool file for subsequent
593 * processing
594 */
595 if (tupleIsAlive || buildstate->spool2 == NULL)
596 _bt_spool(buildstate->spool, tid, values, isnull);
597 else
598 {
599 /* dead tuples are put into spool2 */
600 buildstate->havedead = true;
601 _bt_spool(buildstate->spool2, tid, values, isnull);
602 }
603
604 buildstate->indtuples += 1;
605}
606
607/*
608 * allocate workspace for a new, clean btree page, not linked to any siblings.
609 */
610static BulkWriteBuffer
612{
614 Page page;
615 BTPageOpaque opaque;
616
617 buf = smgr_bulk_get_buf(wstate->bulkstate);
618 page = (Page) buf;
619
620 /* Zero the page and set up standard page header info */
621 _bt_pageinit(page, BLCKSZ);
622
623 /* Initialize BT opaque state */
624 opaque = BTPageGetOpaque(page);
625 opaque->btpo_prev = opaque->btpo_next = P_NONE;
626 opaque->btpo_level = level;
627 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
628 opaque->btpo_cycleid = 0;
629
630 /* Make the P_HIKEY line pointer appear allocated */
631 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
632
633 return buf;
634}
635
636/*
637 * emit a completed btree page, and release the working storage.
638 */
639static void
641{
642 smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
643 /* smgr_bulk_write took ownership of 'buf' */
644}
645
646/*
647 * allocate and initialize a new BTPageState. the returned structure
648 * is suitable for immediate use by _bt_buildadd.
649 */
650static BTPageState *
652{
654
655 /* create initial page for level */
656 state->btps_buf = _bt_blnewpage(wstate, level);
657
658 /* and assign it a page position */
659 state->btps_blkno = wstate->btws_pages_alloced++;
660
661 state->btps_lowkey = NULL;
662 /* initialize lastoff so first item goes into P_FIRSTKEY */
663 state->btps_lastoff = P_HIKEY;
664 state->btps_lastextra = 0;
665 state->btps_level = level;
666 /* set "full" threshold based on level. See notes at head of file. */
667 if (level > 0)
668 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
669 else
670 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
671
672 /* no parent level, yet */
673 state->btps_next = NULL;
674
675 return state;
676}
677
678/*
679 * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
680 * P_HIKEY, overwriting P_HIKEY).
681 *
682 * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
683 * rightmost page on its level is not supposed to get a high key. Now that
684 * it's clear that this page is a rightmost page, remove the unneeded empty
685 * P_HIKEY line pointer space.
686 */
687static void
689{
690 OffsetNumber off;
691 OffsetNumber maxoff;
693
695 Assert(maxoff >= P_FIRSTKEY);
697 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
698 {
700
701 *previi = *thisii;
702 previi = thisii;
703 }
704 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
705}
706
707/*
708 * Add an item to a page being built.
709 *
710 * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
711 * raises an error directly.
712 *
713 * Note that our nbtsort.c caller does not know yet if the page will be
714 * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
715 * caller. Page that turns out to be the rightmost on its level is fixed by
716 * calling _bt_slideleft().
717 */
718static void
721 const IndexTupleData *itup,
723 bool newfirstdataitem)
724{
726
728 {
729 trunctuple = *itup;
731 BTreeTupleSetNAtts(&trunctuple, 0, false);
732 itup = &trunctuple;
733 itemsize = sizeof(IndexTupleData);
734 }
735
736 if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
737 elog(ERROR, "failed to add item to the index page");
738}
739
740/*----------
741 * Add an item to a disk page from the sort output (or add a posting list
742 * item formed from the sort output).
743 *
744 * We must be careful to observe the page layout conventions of nbtsearch.c:
745 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
746 * - on non-leaf pages, the key portion of the first item need not be
747 * stored, we should store only the link.
748 *
749 * A leaf page being built looks like:
750 *
751 * +----------------+---------------------------------+
752 * | PageHeaderData | linp0 linp1 linp2 ... |
753 * +-----------+----+---------------------------------+
754 * | ... linpN | |
755 * +-----------+--------------------------------------+
756 * | ^ last |
757 * | |
758 * +-------------+------------------------------------+
759 * | | itemN ... |
760 * +-------------+------------------+-----------------+
761 * | ... item3 item2 item1 | "special space" |
762 * +--------------------------------+-----------------+
763 *
764 * Contrast this with the diagram in bufpage.h; note the mismatch
765 * between linps and items. This is because we reserve linp0 as a
766 * placeholder for the pointer to the "high key" item; when we have
767 * filled up the page, we will set linp0 to point to itemN and clear
768 * linpN. On the other hand, if we find this is the last (rightmost)
769 * page, we leave the items alone and slide the linp array over. If
770 * the high key is to be truncated, offset 1 is deleted, and we insert
771 * the truncated high key at offset 1.
772 *
773 * 'last' pointer indicates the last offset added to the page.
774 *
775 * 'truncextra' is the size of the posting list in itup, if any. This
776 * information is stashed for the next call here, when we may benefit
777 * from considering the impact of truncating away the posting list on
778 * the page before deciding to finish the page off. Posting lists are
779 * often relatively large, so it is worth going to the trouble of
780 * accounting for the saving from truncating away the posting list of
781 * the tuple that becomes the high key (that may be the only way to
782 * get close to target free space on the page). Note that this is
783 * only used for the soft fillfactor-wise limit, not the critical hard
784 * limit.
785 *----------
786 */
787static void
790{
792 Page npage;
796 Size pgspc;
797 Size itupsz;
798 bool isleaf;
799
800 /*
801 * This is a handy place to check for cancel interrupts during the btree
802 * load phase of index creation.
803 */
805
806 nbuf = state->btps_buf;
807 npage = (Page) nbuf;
808 nblkno = state->btps_blkno;
809 last_off = state->btps_lastoff;
810 last_truncextra = state->btps_lastextra;
811 state->btps_lastextra = truncextra;
812
813 pgspc = PageGetFreeSpace(npage);
814 itupsz = IndexTupleSize(itup);
816 /* Leaf case has slightly different rules due to suffix truncation */
817 isleaf = (state->btps_level == 0);
818
819 /*
820 * Check whether the new item can fit on a btree page on current level at
821 * all.
822 *
823 * Every newly built index will treat heap TID as part of the keyspace,
824 * which imposes the requirement that new high keys must occasionally have
825 * a heap TID appended within _bt_truncate(). That may leave a new pivot
826 * tuple one or two MAXALIGN() quantums larger than the original
827 * firstright tuple it's derived from. v4 deals with the problem by
828 * decreasing the limit on the size of tuples inserted on the leaf level
829 * by the same small amount. Enforce the new v4+ limit on the leaf level,
830 * and the old limit on internal levels, since pivot tuples may need to
831 * make use of the reserved space. This should never fail on internal
832 * pages.
833 */
835 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
836 itup);
837
838 /*
839 * Check to see if current page will fit new item, with space left over to
840 * append a heap TID during suffix truncation when page is a leaf page.
841 *
842 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
843 * high key with heap TID when finishing off a leaf page, since we rely on
844 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
845 * internal pages we can always fit 3 pivot tuples with larger internal
846 * page tuple limit (includes page high key).
847 *
848 * Most of the time, a page is only "full" in the sense that the soft
849 * fillfactor-wise limit has been exceeded. However, we must always leave
850 * at least two items plus a high key on each page before starting a new
851 * page. Disregard fillfactor and insert on "full" current page if we
852 * don't have the minimum number of items yet. (Note that we deliberately
853 * assume that suffix truncation neither enlarges nor shrinks new high key
854 * when applying soft limit, except when last tuple has a posting list.)
855 */
857 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
858 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
859 {
860 /*
861 * Finish off the page and write it out.
862 */
864 Page opage = npage;
866 ItemId ii;
867 ItemId hii;
869
870 /* Create new page of same level */
871 nbuf = _bt_blnewpage(wstate, state->btps_level);
872 npage = (Page) nbuf;
873
874 /* and assign it a page position */
875 nblkno = wstate->btws_pages_alloced++;
876
877 /*
878 * We copy the last item on the page into the new page, and then
879 * rearrange the old page so that the 'last item' becomes its high key
880 * rather than a true data item. There had better be at least two
881 * items on the page already, else the page would be empty of useful
882 * data.
883 */
888 !isleaf);
889
890 /*
891 * Move 'last' into the high key position on opage. _bt_blnewpage()
892 * allocated empty space for a line pointer when opage was first
893 * created, so this is a matter of rearranging already-allocated space
894 * on page, and initializing high key line pointer. (Actually, leaf
895 * pages must also swap oitup with a truncated version of oitup, which
896 * is sometimes larger than oitup, though never by more than the space
897 * needed to append a heap TID.)
898 */
900 *hii = *ii;
901 ItemIdSetUnused(ii); /* redundant */
902 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
903
904 if (isleaf)
905 {
906 IndexTuple lastleft;
907 IndexTuple truncated;
908
909 /*
910 * Truncate away any unneeded attributes from high key on leaf
911 * level. This is only done at the leaf level because downlinks
912 * in internal pages are either negative infinity items, or get
913 * their contents from copying from one level down. See also:
914 * _bt_split().
915 *
916 * We don't try to bias our choice of split point to make it more
917 * likely that _bt_truncate() can truncate away more attributes,
918 * whereas the split point used within _bt_split() is chosen much
919 * more delicately. Even still, the lastleft and firstright
920 * tuples passed to _bt_truncate() here are at least not fully
921 * equal to each other when deduplication is used, unless there is
922 * a large group of duplicates (also, unique index builds usually
923 * have few or no spool2 duplicates). When the split point is
924 * between two unequal tuples, _bt_truncate() will avoid including
925 * a heap TID in the new high key, which is the most important
926 * benefit of suffix truncation.
927 *
928 * Overwrite the old item with new truncated high key directly.
929 * oitup is already located at the physical beginning of tuple
930 * space, so this should directly reuse the existing tuple space.
931 */
933 lastleft = (IndexTuple) PageGetItem(opage, ii);
934
936 truncated = _bt_truncate(wstate->index, lastleft, oitup,
937 wstate->inskey);
938 if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
939 elog(ERROR, "failed to add high key to the index page");
940 pfree(truncated);
941
942 /* oitup should continue to point to the page's high key */
945 }
946
947 /*
948 * Link the old page into its parent, using its low key. If we don't
949 * have a parent, we have to create one; this adds a new btree level.
950 */
951 if (state->btps_next == NULL)
952 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
953
954 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
956 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
958 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
960 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
961 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
962 pfree(state->btps_lowkey);
963
964 /*
965 * Save a copy of the high key from the old page. It is also the low
966 * key for the new page.
967 */
968 state->btps_lowkey = CopyIndexTuple(oitup);
969
970 /*
971 * Set the sibling links for both pages.
972 */
973 {
976
977 oopaque->btpo_next = nblkno;
978 nopaque->btpo_prev = oblkno;
979 nopaque->btpo_next = P_NONE; /* redundant */
980 }
981
982 /*
983 * Write out the old page. _bt_blwritepage takes ownership of the
984 * 'opage' buffer.
985 */
987
988 /*
989 * Reset last_off to point to new page
990 */
992 }
993
994 /*
995 * By here, either original page is still the current page, or a new page
996 * was created that became the current page. Either way, the current page
997 * definitely has space for new item.
998 *
999 * If the new item is the first for its page, it must also be the first
1000 * item on its entire level. On later same-level pages, a low key for a
1001 * page will be copied from the prior page in the code above. Generate a
1002 * minus infinity low key here instead.
1003 */
1004 if (last_off == P_HIKEY)
1005 {
1006 Assert(state->btps_lowkey == NULL);
1007 state->btps_lowkey = palloc0_object(IndexTupleData);
1008 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1009 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1010 }
1011
1012 /*
1013 * Add the new item into the current page.
1014 */
1016 _bt_sortaddtup(npage, itupsz, itup, last_off,
1017 !isleaf && last_off == P_FIRSTKEY);
1018
1019 state->btps_buf = nbuf;
1020 state->btps_blkno = nblkno;
1021 state->btps_lastoff = last_off;
1022}
1023
1024/*
1025 * Finalize pending posting list tuple, and add it to the index. Final tuple
1026 * is based on saved base tuple, and saved list of heap TIDs.
1027 *
1028 * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1029 * using _bt_buildadd().
1030 */
1031static void
1034{
1035 Assert(dstate->nitems > 0);
1036
1037 if (dstate->nitems == 1)
1038 _bt_buildadd(wstate, state, dstate->base, 0);
1039 else
1040 {
1043
1044 /* form a tuple with a posting list */
1046 dstate->htids,
1047 dstate->nhtids);
1048 /* Calculate posting list overhead */
1051
1054 }
1055
1056 dstate->nmaxitems = 0;
1057 dstate->nhtids = 0;
1058 dstate->nitems = 0;
1059 dstate->phystupsize = 0;
1060}
1061
1062/*
 1063 * Finish writing out the completed btree.
 1064 */
/*
 * NOTE(review): this listing is a doxygen-rendered extraction; gaps in the
 * embedded line numbers mark elided source lines.  Per the symbol index in
 * this same rendering, the elided signature (line 1066) is:
 *     _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 * The function walks the linked list of per-level page states from the leaf
 * level upward (btps_next points to the parent level), finishing the
 * rightmost page of each level, then constructs the metapage.
 */
 1065static void
 1067{
 1068 BTPageState *s;
/*
 * NOTE(review): elided lines 1069 and 1071 presumably declared the metapage
 * buffer (metabuf, used at 1127) and rootblkno initialized to P_NONE (used
 * at 1095/1128) -- confirm against the original nbtsort.c.
 */
 1070 uint32 rootlevel = 0;
 1072
 1073 /*
 1074 * Each iteration of this loop completes one more level of the tree.
 1075 */
 1076 for (s = state; s != NULL; s = s->btps_next)
 1077 {
 1078 BlockNumber blkno;
 1079 BTPageOpaque opaque;
 1080
 1081 blkno = s->btps_blkno;
 1082 opaque = BTPageGetOpaque((Page) s->btps_buf);
 1083
 1084 /*
 1085 * We have to link the last page on this level to somewhere.
 1086 *
 1087 * If we're at the top, it's the root, so attach it to the metapage.
 1088 * Otherwise, add an entry for it to its parent using its low key.
 1089 * This may cause the last page of the parent level to split, but
 1090 * that's not a problem -- we haven't gotten to it yet.
 1091 */
 1092 if (s->btps_next == NULL)
 1093 {
 1094 opaque->btpo_flags |= BTP_ROOT;
 1095 rootblkno = blkno;
 1096 rootlevel = s->btps_level;
 1097 }
 1098 else
 1099 {
/*
 * NOTE(review): elided lines 1100-1101 opened the Assert whose tail is
 * visible below, and elided lines 1106-1107 presumably set the low key's
 * downlink (BTreeTupleSetDownLink) and inserted it into the parent level
 * via _bt_buildadd() before the pfree below -- confirm against nbtsort.c.
 */
 1102 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
 1103 P_LEFTMOST(opaque));
 1104 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
 1105 !P_LEFTMOST(opaque));
 1108 pfree(s->btps_lowkey);
 1109 s->btps_lowkey = NULL;
 1110 }
 1111
 1112 /*
 1113 * This is the rightmost page, so the ItemId array needs to be slid
 1114 * back one slot. Then we can dump out the page.
 1115 */
/*
 * NOTE(review): elided lines 1116-1117 presumably called _bt_slideleft()
 * and _bt_blwritepage() (both declared in this file) to emit the page,
 * matching the ownership comment on the next line -- confirm.
 */
 1118 s->btps_buf = NULL; /* writepage took ownership of the buffer */
 1119 }
 1120
 1121 /*
 1122 * As the last step in the process, construct the metapage and make it
 1123 * point to the new root (unless we had no data at all, in which case it's
 1124 * set to point to "P_NONE"). This changes the index to the "valid" state
 1125 * by filling in a valid magic number in the metapage.
 1126 */
 1127 metabuf = smgr_bulk_get_buf(wstate->bulkstate);
 1128 _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
 1129 wstate->inskey->allequalimage);
/* NOTE(review): elided line 1130 presumably wrote metabuf to BTREE_METAPAGE
 * via _bt_blwritepage()/smgr_bulk_write() -- confirm against nbtsort.c. */
 1131}
1132
1133/*
 1134 * Read tuples in correct sort order from tuplesort, and load them into
 1135 * btree leaves.
 1136 */
/*
 * NOTE(review): doxygen extraction with elided lines.  Per the symbol index
 * in this rendering, the elided signature (line 1138) is:
 *     _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 * Three mutually exclusive paths below: (1) merge of two sorted spools
 * (unique-index case, btspool2 holds possibly-dead tuples), (2) single
 * spool with deduplication into posting lists, (3) plain single-spool load.
 * All paths end by writing upper levels and the metapage, then finishing
 * the bulk-write state.
 */
1137static void
1139{
/* NOTE(review): elided line 1140 presumably declared the leaf-level
 * BTPageState pointer 'state' initialized to NULL (tested below) and the
 * elided line 1145 a BTDedupState; confirm against nbtsort.c. */
 1141 bool merge = (btspool2 != NULL);
 1142 IndexTuple itup,
 1143 itup2 = NULL;
 1144 bool load1;
 1146 int i,
/* NOTE(review): elided line 1147 presumably declared keysz =
 * IndexRelationGetNumberOfKeyAttributes(wstate->index) -- keysz is used
 * throughout below; confirm. */
 1148 SortSupport sortKeys;
 1149 int64 tuples_done = 0;
 1150 bool deduplicate;
 1151
 1152 wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
 1153
 /* Dedup only for allequalimage, non-unique indexes; elided line 1155
  * presumably added the BTGetDeduplicateItems(wstate->index) conjunct. */
 1154 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 1156
 1157 if (merge)
 1158 {
 1159 /*
 1160 * Another BTSpool for dead tuples exists. Now we have to merge
 1161 * btspool and btspool2.
 1162 */
 1163
 1164 /* the preparation of merge */
 1165 itup = tuplesort_getindextuple(btspool->sortstate, true);
 1166 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
 1167
 1168 /* Prepare SortSupport data for each column */
 1169 sortKeys = palloc0_array(SortSupportData, keysz);
 1170
 1171 for (i = 0; i < keysz; i++)
 1172 {
 1173 SortSupport sortKey = sortKeys + i;
 1174 ScanKey scanKey = wstate->inskey->scankeys + i;
 1175 bool reverse;
 1176
 1177 sortKey->ssup_cxt = CurrentMemoryContext;
 1178 sortKey->ssup_collation = scanKey->sk_collation;
 1179 sortKey->ssup_nulls_first =
 1180 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 1181 sortKey->ssup_attno = scanKey->sk_attno;
 1182 /* Abbreviation is not supported here */
 1183 sortKey->abbreviate = false;
 1184
 1185 Assert(sortKey->ssup_attno != 0);
 1186
 1187 reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
 1188
/* NOTE(review): elided line 1189 presumably filled in the comparator,
 * e.g. PrepareSortSupportFromIndexRel(..., reverse, sortKey) -- confirm. */
 1190 }
 1191
 /* Merge loop: repeatedly pick the smaller head tuple of the two
  * spools, writing it to the leaf level, until both are exhausted. */
 1192 for (;;)
 1193 {
 1194 load1 = true; /* load BTSpool next ? */
 1195 if (itup2 == NULL)
 1196 {
 1197 if (itup == NULL)
 1198 break;
 1199 }
 1200 else if (itup != NULL)
 1201 {
 1202 int32 compare = 0;
 1203
 1204 for (i = 1; i <= keysz; i++)
 1205 {
 1206 SortSupport entry;
/* NOTE(review): elided line 1207 presumably declared "Datum attrDatum1,"
 * paired with the continuation below; elided lines 1213-1217 presumably
 * fetched both attributes via index_getattr() and set compare via
 * ApplySortComparator() (tail visible as "entry);") -- confirm. */
 1208 attrDatum2;
 1209 bool isNull1,
 1210 isNull2;
 1211
 1212 entry = sortKeys + i - 1;
 1215
 1218 entry);
 1219 if (compare > 0)
 1220 {
 1221 load1 = false;
 1222 break;
 1223 }
 1224 else if (compare < 0)
 1225 break;
 1226 }
 1227
 1228 /*
 1229 * If key values are equal, we sort on ItemPointer. This is
 1230 * required for btree indexes, since heap TID is treated as an
 1231 * implicit last key attribute in order to ensure that all
 1232 * keys in the index are physically unique.
 1233 */
 1234 if (compare == 0)
 1235 {
 1236 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
 1237 Assert(compare != 0);
 1238 if (compare > 0)
 1239 load1 = false;
 1240 }
 1241 }
 1242 else
 1243 load1 = false;
 1244
 1245 /* When we see first tuple, create first index page */
 1246 if (state == NULL)
/* NOTE(review): elided line 1247 presumably: state = _bt_pagestate(wstate, 0);
 * (_bt_pagestate is declared in this file) -- confirm. */
 1248
 1249 if (load1)
 1250 {
 1251 _bt_buildadd(wstate, state, itup, 0);
 1252 itup = tuplesort_getindextuple(btspool->sortstate, true);
 1253 }
 1254 else
 1255 {
/* NOTE(review): elided line 1256 presumably:
 * _bt_buildadd(wstate, state, itup2, 0); -- confirm. */
 1257 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
 1258 }
 1259
 1260 /* Report progress */
/* NOTE(review): elided line 1261 presumably called
 * pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, ...) whose
 * argument tail is visible on the next line -- confirm. */
 1262 ++tuples_done);
 1263 }
 1264 pfree(sortKeys);
 1265 }
 1266 else if (deduplicate)
 1267 {
 1268 /* merge is unnecessary, deduplicate into posting lists */
/* NOTE(review): elided lines 1269/1271 presumably declared and palloc'd the
 * BTDedupState 'dstate' initialized below -- confirm. */
 1270
 1272 dstate->deduplicate = true; /* unused */
 1273 dstate->nmaxitems = 0; /* unused */
 1274 dstate->maxpostingsize = 0; /* set later */
 1275 /* Metadata about base tuple of current pending posting list */
 1276 dstate->base = NULL;
 1277 dstate->baseoff = InvalidOffsetNumber; /* unused */
 1278 dstate->basetupsize = 0;
 1279 /* Metadata about current pending posting list TIDs */
 1280 dstate->htids = NULL;
 1281 dstate->nhtids = 0;
 1282 dstate->nitems = 0;
 1283 dstate->phystupsize = 0; /* unused */
 1284 dstate->nintervals = 0; /* unused */
 1285
 1286 while ((itup = tuplesort_getindextuple(btspool->sortstate,
 1287 true)) != NULL)
 1288 {
 1289 /* When we see first tuple, create first index page */
 1290 if (state == NULL)
 1291 {
/* NOTE(review): elided line 1292 presumably: state = _bt_pagestate(wstate, 0);
 * -- confirm. */
 1293
 1294 /*
 1295 * Limit size of posting list tuples to 1/10 space we want to
 1296 * leave behind on the page, plus space for final item's line
 1297 * pointer. This is equal to the space that we'd like to
 1298 * leave behind on each leaf page when fillfactor is 90,
 1299 * allowing us to get close to fillfactor% space utilization
 1300 * when there happen to be a great many duplicates. (This
 1301 * makes higher leaf fillfactor settings ineffective when
 1302 * building indexes that have many duplicates, but packing
 1303 * leaf pages full with few very large tuples doesn't seem
 1304 * like a useful goal.)
 1305 */
 1306 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
 1307 sizeof(ItemIdData);
 1308 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
 1309 dstate->maxpostingsize <= INDEX_SIZE_MASK);
 1310 dstate->htids = palloc(dstate->maxpostingsize);
 1311
 1312 /* start new pending posting list with itup copy */
/* NOTE(review): elided lines 1313-1314 presumably called
 * _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), ...) -- confirm. */
 1315 }
 1316 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
 1317 itup) > keysz &&
/* NOTE(review): elided line 1318 presumably the second conjunct:
 * _bt_dedup_save_htid(dstate, itup)) -- confirm. */
 1319 {
 1320 /*
 1321 * Tuple is equal to base tuple of pending posting list. Heap
 1322 * TID from itup has been saved in state.
 1323 */
 1324 }
 1325 else
 1326 {
 1327 /*
 1328 * Tuple is not equal to pending posting list tuple, or
 1329 * _bt_dedup_save_htid() opted to not merge current item into
 1330 * pending posting list.
 1331 */
/* NOTE(review): elided line 1332 presumably:
 * _bt_sort_dedup_finish_pending(wstate, state, dstate); and elided lines
 * 1336-1337 a fresh _bt_dedup_start_pending(...) call -- confirm. */
 1333 pfree(dstate->base);
 1334
 1335 /* start new pending posting list with itup copy */
 1338 }
 1339
 1340 /* Report progress */
 1342 ++tuples_done);
 1343 }
 1344
 1345 if (state)
 1346 {
 1347 /*
 1348 * Handle the last item (there must be a last item when the
 1349 * tuplesort returned one or more tuples)
 1350 */
/* NOTE(review): elided line 1351 presumably flushed the final pending
 * posting list via _bt_sort_dedup_finish_pending() -- confirm. */
 1352 pfree(dstate->base);
 1353 pfree(dstate->htids);
 1354 }
 1355
 1356 pfree(dstate);
 1357 }
 1358 else
 1359 {
 1360 /* merging and deduplication are both unnecessary */
 1361 while ((itup = tuplesort_getindextuple(btspool->sortstate,
 1362 true)) != NULL)
 1363 {
 1364 /* When we see first tuple, create first index page */
 1365 if (state == NULL)
/* NOTE(review): elided line 1366 presumably: state = _bt_pagestate(wstate, 0);
 * -- confirm. */
 1367
 1368 _bt_buildadd(wstate, state, itup, 0);
 1369
 1370 /* Report progress */
 1372 ++tuples_done);
 1373 }
 1374 }
 1375
 1376 /* Close down final pages and write the metapage */
/* NOTE(review): elided line 1377 presumably: _bt_uppershutdown(wstate, state);
 * -- confirm. */
 1378 smgr_bulk_finish(wstate->bulkstate);
 1379}
1380
1381/*
 1382 * Create parallel context, and launch workers for leader.
 1383 *
 1384 * buildstate argument should be initialized (with the exception of the
 1385 * tuplesort state in spools, which may later be created based on shared
 1386 * state initially set up here).
 1387 *
 1388 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 1389 *
 1390 * request is the target number of parallel worker processes to launch.
 1391 *
 1392 * Sets buildstate's BTLeader, which caller must use to shut down parallel
 1393 * mode by passing it to _bt_end_parallel() at the very end of its index
 1394 * build. If not even a single worker process can be launched, this is
 1395 * never set, and caller should proceed with a serial index build.
 1396 */
/*
 * NOTE(review): doxygen extraction with elided lines (gaps in the embedded
 * numbering); elided spans are flagged inline below.
 */
1397static void
1398_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1399{
 1400 ParallelContext *pcxt;
 1401 int scantuplesortstates;
 1402 Snapshot snapshot;
/* NOTE(review): elided line 1403 presumably declared "Size estbtshared;"
 * (used at 1502 below) -- confirm against nbtsort.c. */
 1404 Size estsort;
 1405 BTShared *btshared;
 1406 Sharedsort *sharedsort;
 1407 Sharedsort *sharedsort2;
 1408 BTSpool *btspool = buildstate->spool;
 1409 BTLeader *btleader = palloc0_object(BTLeader);
 1410 WalUsage *walusage;
 1411 BufferUsage *bufferusage;
 1412 bool leaderparticipates = true;
 1413 int querylen;
 1414
 1415#ifdef DISABLE_LEADER_PARTICIPATION
 1416 leaderparticipates = false;
 1417#endif
 1418
 1419 /*
 1420 * Enter parallel mode, and create context for parallel build of btree
 1421 * index
 1422 */
/* NOTE(review): elided line 1423 presumably: EnterParallelMode(); -- the
 * matching exit is expected in _bt_end_parallel(); confirm. */
 1424 Assert(request > 0);
 1425 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
 1426 request);
 1427
 /* Leader may double as a worker, adding one participant tuplesort. */
 1428 scantuplesortstates = leaderparticipates ? request + 1 : request;
 1429
 1430 /*
 1431 * Prepare for scan of the base relation. In a normal index build, we use
 1432 * SnapshotAny because we must retrieve all tuples and do our own time
 1433 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
 1434 * concurrent build, we take a regular MVCC snapshot and index whatever's
 1435 * live according to that.
 1436 */
 1437 if (!isconcurrent)
 1438 snapshot = SnapshotAny;
 1439 else
/* NOTE(review): elided line 1440 presumably registered an MVCC snapshot,
 * e.g. snapshot = RegisterSnapshot(GetTransactionSnapshot()); -- confirm. */
 1441
 1442 /*
 1443 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
 1444 * PARALLEL_KEY_TUPLESORT tuplesort workspace
 1445 */
/* NOTE(review): elided lines 1446-1447/1449 presumably computed estbtshared
 * via _bt_parallel_estimate_shared() and issued shm_toc_estimate_chunk/
 * estimate_keys calls on pcxt; confirm. */
 1448 estsort = tuplesort_estimate_shared(scantuplesortstates);
 1450
 1451 /*
 1452 * Unique case requires a second spool, and so we may have to account for
 1453 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
 1454 */
 1455 if (!btspool->isunique)
/* NOTE(review): elided lines 1456/1459-1460 presumably adjusted the
 * shm_toc key estimate for the optional second tuplesort -- confirm. */
 1457 else
 1458 {
 1461 }
 1462
 1463 /*
 1464 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
 1465 * and PARALLEL_KEY_BUFFER_USAGE.
 1466 *
 1467 * If there are no extensions loaded that care, we could skip this. We
 1468 * have no way of knowing whether anyone's looking at pgWalUsage or
 1469 * pgBufferUsage, so do it unconditionally.
 1470 */
/* NOTE(review): elided lines 1471/1473-1474/1476 presumably wrapped these
 * size expressions in shm_toc_estimate_chunk()/estimate_keys() calls. */
 1472 mul_size(sizeof(WalUsage), pcxt->nworkers));
 1475 mul_size(sizeof(BufferUsage), pcxt->nworkers));
 1477
 1478 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
/* NOTE(review): elided lines 1479/1481-1483 presumably tested
 * debug_query_string and set querylen = strlen(debug_query_string) plus a
 * shm_toc estimate -- confirm. */
 1480 {
 1484 }
 1485 else
 1486 querylen = 0; /* keep compiler quiet */
 1487
 1488 /* Everyone's had a chance to ask for space, so now create the DSM */
/* NOTE(review): elided line 1489 presumably: InitializeParallelDSM(pcxt);
 * (declared in parallel.c per the index) -- confirm. */
 1490
 1491 /* If no DSM segment was available, back out (do serial build) */
 1492 if (pcxt->seg == NULL)
 1493 {
 1494 if (IsMVCCSnapshot(snapshot))
 1495 UnregisterSnapshot(snapshot);
/* NOTE(review): elided lines 1496-1497 presumably destroyed the parallel
 * context and exited parallel mode before the serial-build fallback. */
 1498 return;
 1499 }
 1500
 1501 /* Store shared build state, for which we reserved space */
 1502 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
 1503 /* Initialize immutable state */
 1504 btshared->heaprelid = RelationGetRelid(btspool->heap);
 1505 btshared->indexrelid = RelationGetRelid(btspool->index);
 1506 btshared->isunique = btspool->isunique;
 1507 btshared->nulls_not_distinct = btspool->nulls_not_distinct;
 1508 btshared->isconcurrent = isconcurrent;
 1509 btshared->scantuplesortstates = scantuplesortstates;
 1510 btshared->queryid = pgstat_get_my_query_id();
/* NOTE(review): elided line 1511 presumably initialized the workers-done
 * condition variable (ConditionVariableInit) -- confirm. */
 1512 SpinLockInit(&btshared->mutex);
 1513 /* Initialize mutable state */
 1514 btshared->nparticipantsdone = 0;
 1515 btshared->reltuples = 0.0;
 1516 btshared->havedead = false;
 1517 btshared->indtuples = 0.0;
 1518 btshared->brokenhotchain = false;
/* NOTE(review): elided lines 1519-1520 presumably called
 * table_parallelscan_initialize(btspool->heap,
 * ParallelTableScanFromBTShared(btshared), snapshot); tail visible below. */
 1521 snapshot);
 1522
 1523 /*
 1524 * Store shared tuplesort-private state, for which we reserved space.
 1525 * Then, initialize opaque state using tuplesort routine.
 1526 */
 1527 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
 1528 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
 1529 pcxt->seg);
 1530
/* NOTE(review): elided line 1531 presumably inserted btshared under
 * PARALLEL_KEY_BTREE_SHARED, mirroring the insert below -- confirm. */
 1532 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
 1533
 1534 /* Unique case requires a second spool, and associated shared state */
 1535 if (!btspool->isunique)
 1536 sharedsort2 = NULL;
 1537 else
 1538 {
 1539 /*
 1540 * Store additional shared tuplesort-private state, for which we
 1541 * reserved space. Then, initialize opaque state using tuplesort
 1542 * routine.
 1543 */
 1544 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
 1545 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
 1546 pcxt->seg);
 1547
/* NOTE(review): elided line 1548 presumably inserted sharedsort2 under
 * PARALLEL_KEY_TUPLESORT_SPOOL2 -- confirm. */
 1549 }
 1550
 1551 /* Store query string for workers */
/* NOTE(review): elided line 1552 presumably guarded on debug_query_string,
 * and elided lines 1557-1558 copied it into sharedquery and inserted it
 * under PARALLEL_KEY_QUERY_TEXT -- confirm. */
 1553 {
 1554 char *sharedquery;
 1555
 1556 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
 1559 }
 1560
 1561 /*
 1562 * Allocate space for each worker's WalUsage and BufferUsage; no need to
 1563 * initialize.
 1564 */
 1565 walusage = shm_toc_allocate(pcxt->toc,
 1566 mul_size(sizeof(WalUsage), pcxt->nworkers));
 1567 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
 1568 bufferusage = shm_toc_allocate(pcxt->toc,
 1569 mul_size(sizeof(BufferUsage), pcxt->nworkers));
 1570 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
 1571
 1572 /* Launch workers, saving status for leader/caller */
/* NOTE(review): elided line 1573 presumably: LaunchParallelWorkers(pcxt);
 * and elided line 1576 the "if (leaderparticipates)" guard for the
 * increment below -- confirm. */
 1574 btleader->pcxt = pcxt;
 1575 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
 1577 btleader->nparticipanttuplesorts++;
 1578 btleader->btshared = btshared;
 1579 btleader->sharedsort = sharedsort;
 1580 btleader->sharedsort2 = sharedsort2;
 1581 btleader->snapshot = snapshot;
 1582 btleader->walusage = walusage;
 1583 btleader->bufferusage = bufferusage;
 1584
 1585 /* If no workers were successfully launched, back out (do serial build) */
 1586 if (pcxt->nworkers_launched == 0)
 1587 {
 1588 _bt_end_parallel(btleader);
 1589 return;
 1590 }
 1591
 1592 /* Save leader state now that it's clear build will be parallel */
 1593 buildstate->btleader = btleader;
 1594
 1595 /* Join heap scan ourselves */
/* NOTE(review): elided lines 1596-1597 presumably called
 * _bt_leader_participate_as_worker(buildstate) when leaderparticipates. */
 1598
 1599 /*
 1600 * Caller needs to wait for all launched workers when we return. Make
 1601 * sure that the failure-to-start case will not hang forever.
 1602 */
/* NOTE(review): elided line 1603 presumably:
 * WaitForParallelWorkersToAttach(pcxt); (declared in parallel.c) -- confirm. */
 1604}
1605
1606/*
 1607 * Shut down workers, destroy parallel context, and end parallel mode.
 1608 */
/*
 * NOTE(review): doxygen extraction -- two lines are elided (1615, 1628);
 * flagged inline below.
 */
1609static void
1610_bt_end_parallel(BTLeader *btleader)
1611{
 1612 int i;
 1613
 1614 /* Shutdown worker processes */
/* NOTE(review): elided line 1615 presumably:
 * WaitForParallelWorkersToFinish(btleader->pcxt); -- required before the
 * usage accumulation below, per its own comment; confirm. */
 1616
 1617 /*
 1618 * Next, accumulate WAL usage. (This must wait for the workers to finish,
 1619 * or we might get incomplete data.)
 1620 */
 1621 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
 1622 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
 1623
 1624 /* Free last reference to MVCC snapshot, if one was used */
 1625 if (IsMVCCSnapshot(btleader->snapshot))
 1626 UnregisterSnapshot(btleader->snapshot);
 1627 DestroyParallelContext(btleader->pcxt);
/* NOTE(review): elided line 1628 presumably: ExitParallelMode(); matching
 * the parallel-mode entry in _bt_begin_parallel() -- confirm. */
 1629}
1630
1631/*
 1632 * Returns size of shared memory required to store state for a parallel
 1633 * btree index build based on the snapshot its parallel scan will use.
 1634 */
/*
 * NOTE(review): doxygen extraction -- the signature line (1636) is elided.
 * Per the symbol index in this rendering it is:
 *     _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
 * The BTShared struct is BUFFERALIGN'd so the ParallelTableScanDesc that
 * follows it (see ParallelTableScanFromBTShared) is properly aligned.
 */
1635static Size
1637{
 1638 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
 1639 return add_size(BUFFERALIGN(sizeof(BTShared)),
 1640 table_parallelscan_estimate(heap, snapshot));
 1641}
1642
1643/*
 1644 * Within leader, wait for end of heap scan.
 1645 *
 1646 * When called, parallel heap scan started by _bt_begin_parallel() will
 1647 * already be underway within worker processes (when leader participates
 1648 * as a worker, we should end up here just as workers are finishing).
 1649 *
 1650 * Fills in fields needed for ambuild statistics, and lets caller set
 1651 * field indicating that some worker encountered a broken HOT chain.
 1652 *
 1653 * Returns the total number of heap tuples scanned.
 1654 */
/*
 * NOTE(review): doxygen extraction with a few elided lines (1677-1678,
 * 1681); flagged inline.  Loop pattern: poll nparticipantsdone under the
 * spinlock, sleeping between checks until all participants report done.
 */
1655static double
1656_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1657{
 1658 BTShared *btshared = buildstate->btleader->btshared;
 1659 int nparticipanttuplesorts;
 1660 double reltuples;
 1661
 1662 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
 1663 for (;;)
 1664 {
 1665 SpinLockAcquire(&btshared->mutex);
 1666 if (btshared->nparticipantsdone == nparticipanttuplesorts)
 1667 {
 /* All participants done: copy out accumulated stats atomically. */
 1668 buildstate->havedead = btshared->havedead;
 1669 buildstate->indtuples = btshared->indtuples;
 1670 *brokenhotchain = btshared->brokenhotchain;
 1671 reltuples = btshared->reltuples;
 1672 SpinLockRelease(&btshared->mutex);
 1673 break;
 1674 }
 1675 SpinLockRelease(&btshared->mutex);
 1676
/* NOTE(review): elided lines 1677-1678 presumably slept on the shared
 * condition variable via ConditionVariableSleep(...) with an index-build
 * wait event -- confirm against nbtsort.c. */
 1679 }
 1680
/* NOTE(review): elided line 1681 presumably: ConditionVariableCancelSleep();
 * -- the standard pairing for a ConditionVariableSleep loop; confirm. */
 1682
 1683 return reltuples;
 1684}
1685
1686/*
 1687 * Within leader, participate as a parallel worker.
 1688 */
/*
 * NOTE(review): doxygen extraction -- the signature line (1690) is elided.
 * Per the symbol index in this rendering it is:
 *     _bt_leader_participate_as_worker(BTBuildState *buildstate)
 * The leader runs the same scan-and-sort path as launched workers, using
 * its own private BTSpool(s).
 */
1689static void
1691{
 1692 BTLeader *btleader = buildstate->btleader;
/* NOTE(review): elided lines 1693-1694 presumably declared the local
 * BTSpool pointers leaderworker/leaderworker2 assigned below, and elided
 * lines 1698/1710 palloc'd them -- confirm. */
 1695 int sortmem;
 1696
 1697 /* Allocate memory and initialize private spool */
 1699 leaderworker->heap = buildstate->spool->heap;
 1700 leaderworker->index = buildstate->spool->index;
 1701 leaderworker->isunique = buildstate->spool->isunique;
 1702 leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
 1703
 1704 /* Initialize second spool, if required */
 1705 if (!btleader->btshared->isunique)
/* NOTE(review): elided line 1706 presumably: leaderworker2 = NULL; */
 1707 else
 1708 {
 1709 /* Allocate memory for worker's own private secondary spool */
 1711
 1712 /* Initialize worker's own secondary spool */
 1713 leaderworker2->heap = leaderworker->heap;
 1714 leaderworker2->index = leaderworker->index;
 1715 leaderworker2->isunique = false;
 1716 }
 1717
 1718 /*
 1719 * Might as well use reliable figure when doling out maintenance_work_mem
 1720 * (when requested number of workers were not launched, this will be
 1721 * somewhat higher than it is for other workers).
 1722 */
/* NOTE(review): elided line 1723 presumably:
 * sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; */
 1724
 1725 /* Perform work common to all participants */
/* NOTE(review): elided line 1726 presumably opened the call
 * _bt_parallel_scan_and_sort(leaderworker, leaderworker2,
 * btleader->btshared, ...) whose tail is visible below -- confirm. */
 1727 btleader->sharedsort, btleader->sharedsort2,
 1728 sortmem, true);
 1729
 1730#ifdef BTREE_BUILD_STATS
/* NOTE(review): elided line 1731 presumably: if (log_btree_build_stats) */
 1732 {
 1733 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
 1734 ResetUsage();
 1735 }
 1736#endif /* BTREE_BUILD_STATS */
 1737}
1738
1739/*
 1740 * Perform work within a launched parallel process.
 1741 */
/*
 * NOTE(review): doxygen extraction -- the signature line (1743) is elided.
 * Per the symbol index in this rendering it is:
 *     _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 * This is the worker entry point named in CreateParallelContext() above:
 * attach to shared state from the TOC, open the relations, run the
 * scan-and-sort, then report instrumentation back to the leader.
 */
1742void
1744{
 1745 char *sharedquery;
/* NOTE(review): elided lines 1746-1747 presumably declared the BTSpool
 * pointers btspool/btspool2 used below, and elided lines 1753-1754 the
 * LOCKMODE variables heapLockmode/indexLockmode -- confirm. */
 1748 BTShared *btshared;
 1749 Sharedsort *sharedsort;
 1750 Sharedsort *sharedsort2;
 1751 Relation heapRel;
 1752 Relation indexRel;
 1755 WalUsage *walusage;
 1756 BufferUsage *bufferusage;
 1757 int sortmem;
 1758
 1759#ifdef BTREE_BUILD_STATS
/* NOTE(review): elided line 1760 presumably: if (log_btree_build_stats) */
 1761 ResetUsage();
 1762#endif /* BTREE_BUILD_STATS */
 1763
 1764 /*
 1765 * The only possible status flag that can be set to the parallel worker is
 1766 * PROC_IN_SAFE_IC.
 1767 */
 1768 Assert((MyProc->statusFlags == 0) ||
/* NOTE(review): elided line 1769 presumably completed the Assert with
 * (MyProc->statusFlags == PROC_IN_SAFE_IC)); -- confirm. */
 1770
 1771 /* Set debug_query_string for individual workers first */
/* NOTE(review): elided lines 1772-1773 presumably looked up
 * PARALLEL_KEY_QUERY_TEXT in the TOC and assigned debug_query_string. */
 1774
 1775 /* Report the query string from leader */
/* NOTE(review): elided line 1776 presumably:
 * pgstat_report_activity(STATE_RUNNING, debug_query_string); */
 1777
 1778 /* Look up nbtree shared state */
 1779 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
 1780
 1781 /* Open relations using lock modes known to be obtained by index.c */
 1782 if (!btshared->isconcurrent)
 1783 {
/* NOTE(review): elided lines 1784-1785 presumably set heapLockmode =
 * ShareLock and indexLockmode = AccessExclusiveLock (non-concurrent
 * build), and elided lines 1789-1790 ShareUpdateExclusiveLock /
 * RowExclusiveLock for the concurrent case -- confirm. */
 1786 }
 1787 else
 1788 {
 1791 }
 1792
 1793 /* Track query ID */
 1794 pgstat_report_query_id(btshared->queryid, false);
 1795
 1796 /* Open relations within worker */
 1797 heapRel = table_open(btshared->heaprelid, heapLockmode);
 1798 indexRel = index_open(btshared->indexrelid, indexLockmode);
 1799
 1800 /* Initialize worker's own spool */
/* NOTE(review): elided line 1801 presumably: btspool = palloc0(sizeof(BTSpool));
 * (and similarly line 1818 for btspool2 below) -- confirm. */
 1802 btspool->heap = heapRel;
 1803 btspool->index = indexRel;
 1804 btspool->isunique = btshared->isunique;
 1805 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
 1806
 1807 /* Look up shared state private to tuplesort.c */
 1808 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
 1809 tuplesort_attach_shared(sharedsort, seg);
 1810 if (!btshared->isunique)
 1811 {
 1812 btspool2 = NULL;
 1813 sharedsort2 = NULL;
 1814 }
 1815 else
 1816 {
 1817 /* Allocate memory for worker's own private secondary spool */
 1819
 1820 /* Initialize worker's own secondary spool */
 1821 btspool2->heap = btspool->heap;
 1822 btspool2->index = btspool->index;
 1823 btspool2->isunique = false;
 1824 /* Look up shared state private to tuplesort.c */
 1825 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
 1826 tuplesort_attach_shared(sharedsort2, seg);
 1827 }
 1828
 1829 /* Prepare to track buffer usage during parallel execution */
/* NOTE(review): elided line 1830 presumably: InstrStartParallelQuery();
 * pairing with InstrEndParallelQuery below -- confirm. */
 1831
 1832 /* Perform sorting of spool, and possibly a spool2 */
/* NOTE(review): elided line 1833 presumably computed sortmem =
 * maintenance_work_mem / btshared->scantuplesortstates -- confirm. */
 1834 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
 1835 sharedsort2, sortmem, false);
 1836
 1837 /* Report WAL/buffer usage during parallel execution */
 1838 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
 1839 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
/* NOTE(review): elided line 1840 presumably opened the call
 * InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], ...) whose
 * second argument is visible below -- confirm. */
 1841 &walusage[ParallelWorkerNumber]);
 1842
 1843#ifdef BTREE_BUILD_STATS
/* NOTE(review): elided line 1844 presumably: if (log_btree_build_stats) */
 1845 {
 1846 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
 1847 ResetUsage();
 1848 }
 1849#endif /* BTREE_BUILD_STATS */
 1850
 1851 index_close(indexRel, indexLockmode);
 1852 table_close(heapRel, heapLockmode);
 1853}
1854
1855/*
 1856 * Perform a worker's portion of a parallel sort.
 1857 *
 1858 * This generates a tuplesort for passed btspool, and a second tuplesort
 1859 * state if a second btspool is need (i.e. for unique index builds). All
 1860 * other spool fields should already be set when this is called.
 1861 *
 1862 * sortmem is the amount of working memory to use within each worker,
 1863 * expressed in KBs.
 1864 *
 1865 * When this returns, workers are done, and need only release resources.
 1866 */
/*
 * NOTE(review): doxygen extraction -- the first signature line (1868) is
 * elided.  Per the symbol index in this rendering the full signature is:
 *     _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
 *                                BTShared *btshared, Sharedsort *sharedsort,
 *                                Sharedsort *sharedsort2, int sortmem,
 *                                bool progress)
 * Runs by leader (progress=true) and workers (progress=false) alike.
 */
1867static void
 1869 BTShared *btshared, Sharedsort *sharedsort,
 1870 Sharedsort *sharedsort2, int sortmem, bool progress)
1871{
/* NOTE(review): elided lines 1872-1873 presumably declared the local
 * SortCoordinate 'coordinate' and BTBuildState 'buildstate' used below,
 * with elided line 1879 palloc0'ing coordinate -- confirm. */
 1874 TableScanDesc scan;
 1875 double reltuples;
 1876 IndexInfo *indexInfo;
 1877
 1878 /* Initialize local tuplesort coordination state */
 1880 coordinate->isWorker = true;
 1881 coordinate->nParticipants = -1;
 1882 coordinate->sharedsort = sharedsort;
 1883
 1884 /* Begin "partial" tuplesort */
 1885 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
 1886 btspool->index,
 1887 btspool->isunique,
 1888 btspool->nulls_not_distinct,
/* NOTE(review): elided lines 1889-1890 presumably passed sortmem,
 * coordinate, and a final flag argument -- confirm against nbtsort.c. */
 1891
 1892 /*
 1893 * Just as with serial case, there may be a second spool. If so, a
 1894 * second, dedicated spool2 partial tuplesort is required.
 1895 */
 1896 if (btspool2)
 1897 {
/* NOTE(review): elided line 1898 presumably declared SortCoordinate
 * coordinate2, palloc0'd at elided line 1906 -- confirm. */
 1899
 1900 /*
 1901 * We expect that the second one (for dead tuples) won't get very
 1902 * full, so we give it only work_mem (unless sortmem is less for
 1903 * worker). Worker processes are generally permitted to allocate
 1904 * work_mem independently.
 1905 */
 1907 coordinate2->isWorker = true;
 1908 coordinate2->nParticipants = -1;
 1909 coordinate2->sharedsort = sharedsort2;
 1910 btspool2->sortstate =
 1911 tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
/* NOTE(review): elided line 1912 presumably passed Min(sortmem, work_mem)
 * and coordinate2, per the comment above -- confirm. */
 1913 false);
 1914 }
 1915
 1916 /* Fill in buildstate for _bt_build_callback() */
 1917 buildstate.isunique = btshared->isunique;
 1918 buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
 1919 buildstate.havedead = false;
 1920 buildstate.heap = btspool->heap;
 1921 buildstate.spool = btspool;
 1922 buildstate.spool2 = btspool2;
 1923 buildstate.indtuples = 0;
 1924 buildstate.btleader = NULL;
 1925
 1926 /* Join parallel scan */
 1927 indexInfo = BuildIndexInfo(btspool->index);
 1928 indexInfo->ii_Concurrent = btshared->isconcurrent;
 1929 scan = table_beginscan_parallel(btspool->heap,
/* NOTE(review): elided line 1930 presumably passed
 * ParallelTableScanFromBTShared(btshared), and elided line 1932 the
 * true/progress and _bt_build_callback arguments of
 * table_index_build_scan -- confirm. */
 1931 reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
 1933 &buildstate, scan);
 1934
 1935 /* Execute this worker's part of the sort */
 1936 if (progress)
/* NOTE(review): elided lines 1937-1938 presumably updated the create-index
 * progress subphase to PROGRESS_BTREE_PHASE_PERFORMSORT_1 (and elided
 * lines 1943-1944 to PERFORMSORT_2 for spool2) -- confirm. */
 1939 tuplesort_performsort(btspool->sortstate);
 1940 if (btspool2)
 1941 {
 1942 if (progress)
 1945 tuplesort_performsort(btspool2->sortstate);
 1946 }
 1947
 1948 /*
 1949 * Done. Record ambuild statistics, and whether we encountered a broken
 1950 * HOT chain.
 1951 */
 1952 SpinLockAcquire(&btshared->mutex);
 1953 btshared->nparticipantsdone++;
 1954 btshared->reltuples += reltuples;
 1955 if (buildstate.havedead)
 1956 btshared->havedead = true;
 1957 btshared->indtuples += buildstate.indtuples;
 1958 if (indexInfo->ii_BrokenHotChain)
 1959 btshared->brokenhotchain = true;
 1960 SpinLockRelease(&btshared->mutex);
 1961
 1962 /* Notify leader */
/* NOTE(review): elided line 1963 presumably signalled the leader's
 * condition variable via ConditionVariableSignal(...) -- confirm. */
 1964
 1965 /* We can end tuplesorts immediately */
 1966 tuplesort_end(btspool->sortstate);
 1967 if (btspool2)
 1968 tuplesort_end(btspool2->sortstate);
 1969}
int ParallelWorkerNumber
Definition parallel.c:117
void InitializeParallelDSM(ParallelContext *pcxt)
Definition parallel.c:213
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition parallel.c:805
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition parallel.c:583
void DestroyParallelContext(ParallelContext *pcxt)
Definition parallel.c:959
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition parallel.c:175
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition parallel.c:702
void pgstat_progress_update_param(int index, int64 val)
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
void pgstat_report_query_id(int64 query_id, bool force)
int64 pgstat_get_my_query_id(void)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition block.h:31
static Datum values[MAXATTR]
Definition bootstrap.c:188
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
Size PageGetFreeSpace(const PageData *page)
Definition bufpage.c:906
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, const void *newtup, Size newsize)
Definition bufpage.c:1404
PageHeaderData * PageHeader
Definition bufpage.h:199
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
PageData * Page
Definition bufpage.h:81
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition bufpage.h:504
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
#define MAXALIGN_DOWN(LEN)
Definition c.h:910
#define Min(x, y)
Definition c.h:1093
#define MAXALIGN(LEN)
Definition c.h:898
#define BUFFERALIGN(LEN)
Definition c.h:900
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
int32_t int32
Definition c.h:614
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
size_t Size
Definition c.h:691
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define palloc0_object(type)
Definition fe_memutils.h:75
static int compare(const void *arg1, const void *arg2)
Definition geqo_pool.c:144
int maintenance_work_mem
Definition globals.c:133
int work_mem
Definition globals.c:131
bool log_btree_build_stats
Definition guc_tables.c:535
IndexInfo * BuildIndexInfo(Relation index)
Definition index.c:2429
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
IndexTuple CopyIndexTuple(IndexTuple source)
Definition indextuple.c:479
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:219
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:209
void InstrStartParallelQuery(void)
Definition instrument.c:201
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdSetUnused(itemId)
Definition itemid.h:128
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
IndexTupleData * IndexTuple
Definition itup.h:53
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition itup.h:131
static Size IndexTupleSize(const IndexTupleData *itup)
Definition itup.h:71
#define INDEX_SIZE_MASK
Definition itup.h:65
int LOCKMODE
Definition lockdefs.h:26
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
#define ShareLock
Definition lockdefs.h:40
#define RowExclusiveLock
Definition lockdefs.h:38
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition nbtdedup.c:486
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition nbtdedup.c:435
IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
Definition nbtdedup.c:864
void _bt_pageinit(Page page, Size size)
Definition nbtpage.c:1134
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
#define BTGetDeduplicateItems(relation)
Definition nbtree.h:1135
#define BTGetTargetPageFreeSpace(relation)
Definition nbtree.h:1133
#define BTP_LEAF
Definition nbtree.h:77
#define P_HIKEY
Definition nbtree.h:368
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition nbtree.h:1148
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition nbtree.h:1149
#define P_LEFTMOST(opaque)
Definition nbtree.h:219
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define BTP_ROOT
Definition nbtree.h:78
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition nbtree.h:563
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition nbtree.h:1146
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition nbtree.h:1147
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition nbtree.h:530
#define P_NONE
Definition nbtree.h:213
#define SK_BT_NULLS_FIRST
Definition nbtree.h:1117
#define BTREE_METAPAGE
Definition nbtree.h:149
#define SK_BT_DESC
Definition nbtree.h:1116
#define P_FIRSTKEY
Definition nbtree.h:369
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition nbtree.h:596
#define BTMaxItemSize
Definition nbtree.h:165
#define BTreeTupleGetNAtts(itup, rel)
Definition nbtree.h:578
#define BTREE_NONLEAF_FILLFACTOR
Definition nbtree.h:202
#define PARALLEL_KEY_BUFFER_USAGE
Definition nbtsort.c:70
#define ParallelTableScanFromBTShared(shared)
Definition nbtsort.c:165
static void _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
Definition nbtsort.c:641
static void _bt_slideleft(Page rightmostpage)
Definition nbtsort.c:689
static BTPageState * _bt_pagestate(BTWriteState *wstate, uint32 level)
Definition nbtsort.c:652
static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
Definition nbtsort.c:1139
static void _bt_end_parallel(BTLeader *btleader)
Definition nbtsort.c:1611
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition nbtsort.c:67
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition nbtsort.c:1869
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition nbtsort.c:1637
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
Definition nbtsort.c:1033
static double _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
Definition nbtsort.c:1657
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition nbtsort.c:542
#define PARALLEL_KEY_BTREE_SHARED
Definition nbtsort.c:65
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition nbtsort.c:299
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Definition nbtsort.c:1399
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
Definition nbtsort.c:789
void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition nbtsort.c:1744
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition nbtsort.c:583
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition nbtsort.c:369
static void _bt_spooldestroy(BTSpool *btspool)
Definition nbtsort.c:521
static void _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
Definition nbtsort.c:531
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Definition nbtsort.c:1067
#define PARALLEL_KEY_TUPLESORT
Definition nbtsort.c:66
#define PARALLEL_KEY_QUERY_TEXT
Definition nbtsort.c:68
#define PARALLEL_KEY_WAL_USAGE
Definition nbtsort.c:69
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level)
Definition nbtsort.c:612
static void _bt_leader_participate_as_worker(BTBuildState *buildstate)
Definition nbtsort.c:1691
static void _bt_sortaddtup(Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition nbtsort.c:720
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition nbtutils.c:1119
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition nbtutils.c:59
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition nbtutils.c:693
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition nbtutils.c:912
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1176
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
Definition pairingheap.c:93
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int progress
Definition pgbench.c:262
const char * debug_query_string
Definition postgres.c:91
void ShowUsage(const char *title)
Definition postgres.c:5103
void ResetUsage(void)
Definition postgres.c:5096
uint64_t Datum
Definition postgres.h:70
static int fb(int x)
#define PROC_IN_SAFE_IC
Definition proc.h:60
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition progress.h:112
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:148
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition progress.h:113
#define PROGRESS_CREATEIDX_SUBPHASE
Definition progress.h:111
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:147
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
@ MAIN_FORKNUM
Definition relpath.h:58
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition shm_toc.c:88
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition shm_toc.c:171
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition shmem.c:485
Size mul_size(Size s1, Size s2)
Definition shmem.c:500
Snapshot GetTransactionSnapshot(void)
Definition snapmgr.c:272
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:824
#define SnapshotAny
Definition snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:59
void PrepareSortSupportFromIndexRel(Relation indexRel, bool reverse, SortSupport ssup)
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
PGPROC * MyProc
Definition proc.c:68
bool isunique
Definition nbtsort.c:210
BTSpool * spool
Definition nbtsort.c:214
BTLeader * btleader
Definition nbtsort.c:228
bool nulls_not_distinct
Definition nbtsort.c:211
bool havedead
Definition nbtsort.c:212
Relation heap
Definition nbtsort.c:213
BTSpool * spool2
Definition nbtsort.c:220
double indtuples
Definition nbtsort.c:221
ParallelContext * pcxt
Definition nbtsort.c:174
BTShared * btshared
Definition nbtsort.c:194
Sharedsort * sharedsort
Definition nbtsort.c:195
Sharedsort * sharedsort2
Definition nbtsort.c:196
int nparticipanttuplesorts
Definition nbtsort.c:182
BufferUsage * bufferusage
Definition nbtsort.c:199
Snapshot snapshot
Definition nbtsort.c:197
WalUsage * walusage
Definition nbtsort.c:198
BlockNumber btpo_next
Definition nbtree.h:66
BlockNumber btpo_prev
Definition nbtree.h:65
uint16 btpo_flags
Definition nbtree.h:68
uint32 btpo_level
Definition nbtree.h:67
BTCycleId btpo_cycleid
Definition nbtree.h:69
IndexTuple btps_lowkey
Definition nbtsort.c:239
Size btps_full
Definition nbtsort.c:243
BulkWriteBuffer btps_buf
Definition nbtsort.c:237
OffsetNumber btps_lastoff
Definition nbtsort.c:240
Size btps_lastextra
Definition nbtsort.c:241
BlockNumber btps_blkno
Definition nbtsort.c:238
struct BTPageState * btps_next
Definition nbtsort.c:244
uint32 btps_level
Definition nbtsort.c:242
slock_t mutex
Definition nbtsort.c:128
bool isconcurrent
Definition nbtsort.c:108
double indtuples
Definition nbtsort.c:149
double reltuples
Definition nbtsort.c:147
Oid heaprelid
Definition nbtsort.c:104
bool brokenhotchain
Definition nbtsort.c:150
int64 queryid
Definition nbtsort.c:112
bool isunique
Definition nbtsort.c:106
int nparticipantsdone
Definition nbtsort.c:146
ConditionVariable workersdonecv
Definition nbtsort.c:120
int scantuplesortstates
Definition nbtsort.c:109
Oid indexrelid
Definition nbtsort.c:105
bool havedead
Definition nbtsort.c:148
bool nulls_not_distinct
Definition nbtsort.c:107
BulkWriteState * bulkstate
Definition nbtsort.c:254
Relation heap
Definition nbtsort.c:252
Relation index
Definition nbtsort.c:253
BlockNumber btws_pages_alloced
Definition nbtsort.c:256
BTScanInsert inskey
Definition nbtsort.c:255
double heap_tuples
Definition genam.h:40
double index_tuples
Definition genam.h:41
bool ii_Unique
Definition execnodes.h:211
bool ii_BrokenHotChain
Definition execnodes.h:223
bool ii_NullsNotDistinct
Definition execnodes.h:213
int ii_ParallelWorkers
Definition execnodes.h:229
bool ii_Concurrent
Definition execnodes.h:221
ItemPointerData t_tid
Definition itup.h:37
unsigned short t_info
Definition itup.h:49
uint8 statusFlags
Definition proc.h:207
dsm_segment * seg
Definition parallel.h:44
shm_toc_estimator estimator
Definition parallel.h:43
shm_toc * toc
Definition parallel.h:46
int nworkers_launched
Definition parallel.h:39
Definition type.h:96
void table_close(Relation relation, LOCKMODE lockmode)
Definition table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition tableam.c:166
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition tableam.c:131
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition tableam.c:146
static double table_index_build_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition tableam.h:1765
void tuplesort_performsort(Tuplesortstate *state)
Definition tuplesort.c:1259
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition tuplesort.c:3210
Size tuplesort_estimate_shared(int nWorkers)
Definition tuplesort.c:3189
void tuplesort_end(Tuplesortstate *state)
Definition tuplesort.c:847
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition tuplesort.c:3233
#define TUPLESORT_NONE
Definition tuplesort.h:67
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, const ItemPointerData *self, const Datum *values, const bool *isnull)
void ExitParallelMode(void)
Definition xact.c:1066
void EnterParallelMode(void)
Definition xact.c:1053

Typedef Documentation

◆ BTBuildState

◆ BTLeader

◆ BTPageState

◆ BTShared

◆ BTSpool

◆ BTWriteState

Function Documentation

◆ _bt_begin_parallel()

static void _bt_begin_parallel ( BTBuildState buildstate,
bool  isconcurrent,
int  request 
)
static

Definition at line 1399 of file nbtsort.c.

1400{
1401 ParallelContext *pcxt;
1402 int scantuplesortstates;
1403 Snapshot snapshot;
1405 Size estsort;
1406 BTShared *btshared;
1407 Sharedsort *sharedsort;
1408 Sharedsort *sharedsort2;
1409 BTSpool *btspool = buildstate->spool;
1410 BTLeader *btleader = palloc0_object(BTLeader);
1411 WalUsage *walusage;
1412 BufferUsage *bufferusage;
1413 bool leaderparticipates = true;
1414 int querylen;
1415
1416#ifdef DISABLE_LEADER_PARTICIPATION
1417 leaderparticipates = false;
1418#endif
1419
1420 /*
1421 * Enter parallel mode, and create context for parallel build of btree
1422 * index
1423 */
1425 Assert(request > 0);
1426 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1427 request);
1428
1429 scantuplesortstates = leaderparticipates ? request + 1 : request;
1430
1431 /*
1432 * Prepare for scan of the base relation. In a normal index build, we use
1433 * SnapshotAny because we must retrieve all tuples and do our own time
1434 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1435 * concurrent build, we take a regular MVCC snapshot and index whatever's
1436 * live according to that.
1437 */
1438 if (!isconcurrent)
1439 snapshot = SnapshotAny;
1440 else
1442
1443 /*
1444 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1445 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1446 */
1449 estsort = tuplesort_estimate_shared(scantuplesortstates);
1451
1452 /*
1453 * Unique case requires a second spool, and so we may have to account for
1454 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1455 */
1456 if (!btspool->isunique)
1458 else
1459 {
1462 }
1463
1464 /*
1465 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1466 * and PARALLEL_KEY_BUFFER_USAGE.
1467 *
1468 * If there are no extensions loaded that care, we could skip this. We
1469 * have no way of knowing whether anyone's looking at pgWalUsage or
1470 * pgBufferUsage, so do it unconditionally.
1471 */
1473 mul_size(sizeof(WalUsage), pcxt->nworkers));
1476 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1478
1479 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1481 {
1485 }
1486 else
1487 querylen = 0; /* keep compiler quiet */
1488
1489 /* Everyone's had a chance to ask for space, so now create the DSM */
1491
1492 /* If no DSM segment was available, back out (do serial build) */
1493 if (pcxt->seg == NULL)
1494 {
1495 if (IsMVCCSnapshot(snapshot))
1496 UnregisterSnapshot(snapshot);
1499 return;
1500 }
1501
1502 /* Store shared build state, for which we reserved space */
1503 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1504 /* Initialize immutable state */
1505 btshared->heaprelid = RelationGetRelid(btspool->heap);
1506 btshared->indexrelid = RelationGetRelid(btspool->index);
1507 btshared->isunique = btspool->isunique;
1508 btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1509 btshared->isconcurrent = isconcurrent;
1510 btshared->scantuplesortstates = scantuplesortstates;
1511 btshared->queryid = pgstat_get_my_query_id();
1513 SpinLockInit(&btshared->mutex);
1514 /* Initialize mutable state */
1515 btshared->nparticipantsdone = 0;
1516 btshared->reltuples = 0.0;
1517 btshared->havedead = false;
1518 btshared->indtuples = 0.0;
1519 btshared->brokenhotchain = false;
1522 snapshot);
1523
1524 /*
1525 * Store shared tuplesort-private state, for which we reserved space.
1526 * Then, initialize opaque state using tuplesort routine.
1527 */
1528 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1529 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1530 pcxt->seg);
1531
1533 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1534
1535 /* Unique case requires a second spool, and associated shared state */
1536 if (!btspool->isunique)
1537 sharedsort2 = NULL;
1538 else
1539 {
1540 /*
1541 * Store additional shared tuplesort-private state, for which we
1542 * reserved space. Then, initialize opaque state using tuplesort
1543 * routine.
1544 */
1545 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1546 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1547 pcxt->seg);
1548
1550 }
1551
1552 /* Store query string for workers */
1554 {
1555 char *sharedquery;
1556
1557 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1560 }
1561
1562 /*
1563 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1564 * initialize.
1565 */
1566 walusage = shm_toc_allocate(pcxt->toc,
1567 mul_size(sizeof(WalUsage), pcxt->nworkers));
1568 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1569 bufferusage = shm_toc_allocate(pcxt->toc,
1570 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1571 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1572
1573 /* Launch workers, saving status for leader/caller */
1575 btleader->pcxt = pcxt;
1576 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1578 btleader->nparticipanttuplesorts++;
1579 btleader->btshared = btshared;
1580 btleader->sharedsort = sharedsort;
1581 btleader->sharedsort2 = sharedsort2;
1582 btleader->snapshot = snapshot;
1583 btleader->walusage = walusage;
1584 btleader->bufferusage = bufferusage;
1585
1586 /* If no workers were successfully launched, back out (do serial build) */
1587 if (pcxt->nworkers_launched == 0)
1588 {
1589 _bt_end_parallel(btleader);
1590 return;
1591 }
1592
1593 /* Save leader state now that it's clear build will be parallel */
1594 buildstate->btleader = btleader;
1595
1596 /* Join heap scan ourselves */
1599
1600 /*
1601 * Caller needs to wait for all launched workers when we return. Make
1602 * sure that the failure-to-start case will not hang forever.
1603 */
1605}

References _bt_end_parallel(), _bt_leader_participate_as_worker(), _bt_parallel_estimate_shared(), Assert, BTShared::brokenhotchain, BTLeader::btshared, BTLeader::bufferusage, ConditionVariableInit(), CreateParallelContext(), debug_query_string, DestroyParallelContext(), EnterParallelMode(), ParallelContext::estimator, ExitParallelMode(), fb(), GetTransactionSnapshot(), BTShared::havedead, BTShared::heaprelid, BTShared::indexrelid, BTShared::indtuples, InitializeParallelDSM(), BTShared::isconcurrent, IsMVCCSnapshot, BTShared::isunique, LaunchParallelWorkers(), mul_size(), BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, BTShared::nulls_not_distinct, ParallelContext::nworkers, ParallelContext::nworkers_launched, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelTableScanFromBTShared, BTLeader::pcxt, pgstat_get_my_query_id(), BTShared::queryid, RegisterSnapshot(), RelationGetRelid, BTShared::reltuples, BTShared::scantuplesortstates, ParallelContext::seg, BTLeader::sharedsort, BTLeader::sharedsort2, shm_toc_allocate(), shm_toc_estimate_chunk, shm_toc_estimate_keys, shm_toc_insert(), BTLeader::snapshot, SnapshotAny, SpinLockInit(), table_parallelscan_initialize(), ParallelContext::toc, tuplesort_estimate_shared(), tuplesort_initialize_shared(), UnregisterSnapshot(), WaitForParallelWorkersToAttach(), BTLeader::walusage, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

◆ _bt_blnewpage()

static BulkWriteBuffer _bt_blnewpage ( BTWriteState wstate,
uint32  level 
)
static

Definition at line 612 of file nbtsort.c.

613{
615 Page page;
616 BTPageOpaque opaque;
617
618 buf = smgr_bulk_get_buf(wstate->bulkstate);
619 page = (Page) buf;
620
621 /* Zero the page and set up standard page header info */
622 _bt_pageinit(page, BLCKSZ);
623
624 /* Initialize BT opaque state */
625 opaque = BTPageGetOpaque(page);
626 opaque->btpo_prev = opaque->btpo_next = P_NONE;
627 opaque->btpo_level = level;
628 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
629 opaque->btpo_cycleid = 0;
630
631 /* Make the P_HIKEY line pointer appear allocated */
632 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
633
634 return buf;
635}

References _bt_pageinit(), BTP_LEAF, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, fb(), P_NONE, and smgr_bulk_get_buf().

Referenced by _bt_buildadd(), and _bt_pagestate().

◆ _bt_blwritepage()

static void _bt_blwritepage ( BTWriteState wstate,
BulkWriteBuffer  buf,
BlockNumber  blkno 
)
static

Definition at line 641 of file nbtsort.c.

642{
643 smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
644 /* smgr_bulk_write took ownership of 'buf' */
645}

References buf, fb(), and smgr_bulk_write().

Referenced by _bt_buildadd(), and _bt_uppershutdown().

◆ _bt_build_callback()

static void _bt_build_callback ( Relation  index,
ItemPointer  tid,
Datum values,
bool isnull,
bool  tupleIsAlive,
void state 
)
static

Definition at line 583 of file nbtsort.c.

589{
591
592 /*
593 * insert the index tuple into the appropriate spool file for subsequent
594 * processing
595 */
596 if (tupleIsAlive || buildstate->spool2 == NULL)
597 _bt_spool(buildstate->spool, tid, values, isnull);
598 else
599 {
600 /* dead tuples are put into spool2 */
601 buildstate->havedead = true;
602 _bt_spool(buildstate->spool2, tid, values, isnull);
603 }
604
605 buildstate->indtuples += 1;
606}

References _bt_spool(), fb(), and values.

Referenced by _bt_parallel_scan_and_sort(), and _bt_spools_heapscan().

◆ _bt_buildadd()

static void _bt_buildadd ( BTWriteState wstate,
BTPageState state,
IndexTuple  itup,
Size  truncextra 
)
static

Definition at line 789 of file nbtsort.c.

791{
793 Page npage;
797 Size pgspc;
798 Size itupsz;
799 bool isleaf;
800
801 /*
802 * This is a handy place to check for cancel interrupts during the btree
803 * load phase of index creation.
804 */
806
807 nbuf = state->btps_buf;
808 npage = (Page) nbuf;
809 nblkno = state->btps_blkno;
810 last_off = state->btps_lastoff;
811 last_truncextra = state->btps_lastextra;
812 state->btps_lastextra = truncextra;
813
814 pgspc = PageGetFreeSpace(npage);
815 itupsz = IndexTupleSize(itup);
817 /* Leaf case has slightly different rules due to suffix truncation */
818 isleaf = (state->btps_level == 0);
819
820 /*
821 * Check whether the new item can fit on a btree page on current level at
822 * all.
823 *
824 * Every newly built index will treat heap TID as part of the keyspace,
825 * which imposes the requirement that new high keys must occasionally have
826 * a heap TID appended within _bt_truncate(). That may leave a new pivot
827 * tuple one or two MAXALIGN() quantums larger than the original
828 * firstright tuple it's derived from. v4 deals with the problem by
829 * decreasing the limit on the size of tuples inserted on the leaf level
830 * by the same small amount. Enforce the new v4+ limit on the leaf level,
831 * and the old limit on internal levels, since pivot tuples may need to
832 * make use of the reserved space. This should never fail on internal
833 * pages.
834 */
836 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
837 itup);
838
839 /*
840 * Check to see if current page will fit new item, with space left over to
841 * append a heap TID during suffix truncation when page is a leaf page.
842 *
843 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
844 * high key with heap TID when finishing off a leaf page, since we rely on
845 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
846 * internal pages we can always fit 3 pivot tuples with larger internal
847 * page tuple limit (includes page high key).
848 *
849 * Most of the time, a page is only "full" in the sense that the soft
850 * fillfactor-wise limit has been exceeded. However, we must always leave
851 * at least two items plus a high key on each page before starting a new
852 * page. Disregard fillfactor and insert on "full" current page if we
853 * don't have the minimum number of items yet. (Note that we deliberately
854 * assume that suffix truncation neither enlarges nor shrinks new high key
855 * when applying soft limit, except when last tuple has a posting list.)
856 */
858 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
859 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
860 {
861 /*
862 * Finish off the page and write it out.
863 */
865 Page opage = npage;
867 ItemId ii;
868 ItemId hii;
870
871 /* Create new page of same level */
872 nbuf = _bt_blnewpage(wstate, state->btps_level);
873 npage = (Page) nbuf;
874
875 /* and assign it a page position */
876 nblkno = wstate->btws_pages_alloced++;
877
878 /*
879 * We copy the last item on the page into the new page, and then
880 * rearrange the old page so that the 'last item' becomes its high key
881 * rather than a true data item. There had better be at least two
882 * items on the page already, else the page would be empty of useful
883 * data.
884 */
889 !isleaf);
890
891 /*
892 * Move 'last' into the high key position on opage. _bt_blnewpage()
893 * allocated empty space for a line pointer when opage was first
894 * created, so this is a matter of rearranging already-allocated space
895 * on page, and initializing high key line pointer. (Actually, leaf
896 * pages must also swap oitup with a truncated version of oitup, which
897 * is sometimes larger than oitup, though never by more than the space
898 * needed to append a heap TID.)
899 */
901 *hii = *ii;
902 ItemIdSetUnused(ii); /* redundant */
903 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
904
905 if (isleaf)
906 {
907 IndexTuple lastleft;
908 IndexTuple truncated;
909
910 /*
911 * Truncate away any unneeded attributes from high key on leaf
912 * level. This is only done at the leaf level because downlinks
913 * in internal pages are either negative infinity items, or get
914 * their contents from copying from one level down. See also:
915 * _bt_split().
916 *
917 * We don't try to bias our choice of split point to make it more
918 * likely that _bt_truncate() can truncate away more attributes,
919 * whereas the split point used within _bt_split() is chosen much
920 * more delicately. Even still, the lastleft and firstright
921 * tuples passed to _bt_truncate() here are at least not fully
922 * equal to each other when deduplication is used, unless there is
923 * a large group of duplicates (also, unique index builds usually
924 * have few or no spool2 duplicates). When the split point is
925 * between two unequal tuples, _bt_truncate() will avoid including
926 * a heap TID in the new high key, which is the most important
927 * benefit of suffix truncation.
928 *
929 * Overwrite the old item with new truncated high key directly.
930 * oitup is already located at the physical beginning of tuple
931 * space, so this should directly reuse the existing tuple space.
932 */
934 lastleft = (IndexTuple) PageGetItem(opage, ii);
935
937 truncated = _bt_truncate(wstate->index, lastleft, oitup,
938 wstate->inskey);
939 if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
940 elog(ERROR, "failed to add high key to the index page");
941 pfree(truncated);
942
943 /* oitup should continue to point to the page's high key */
946 }
947
948 /*
949 * Link the old page into its parent, using its low key. If we don't
950 * have a parent, we have to create one; this adds a new btree level.
951 */
952 if (state->btps_next == NULL)
953 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
954
955 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
957 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
959 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
961 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
962 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
963 pfree(state->btps_lowkey);
964
965 /*
966 * Save a copy of the high key from the old page. It is also the low
967 * key for the new page.
968 */
969 state->btps_lowkey = CopyIndexTuple(oitup);
970
971 /*
972 * Set the sibling links for both pages.
973 */
974 {
977
978 oopaque->btpo_next = nblkno;
979 nopaque->btpo_prev = oblkno;
980 nopaque->btpo_next = P_NONE; /* redundant */
981 }
982
983 /*
984 * Write out the old page. _bt_blwritepage takes ownership of the
985 * 'opage' buffer.
986 */
988
989 /*
990 * Reset last_off to point to new page
991 */
993 }
994
995 /*
996 * By here, either original page is still the current page, or a new page
997 * was created that became the current page. Either way, the current page
998 * definitely has space for new item.
999 *
1000 * If the new item is the first for its page, it must also be the first
1001 * item on its entire level. On later same-level pages, a low key for a
1002 * page will be copied from the prior page in the code above. Generate a
1003 * minus infinity low key here instead.
1004 */
1005 if (last_off == P_HIKEY)
1006 {
1007 Assert(state->btps_lowkey == NULL);
1008 state->btps_lowkey = palloc0_object(IndexTupleData);
1009 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1010 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1011 }
1012
1013 /*
1014 * Add the new item into the current page.
1015 */
1017 _bt_sortaddtup(npage, itupsz, itup, last_off,
1018 !isleaf && last_off == P_FIRSTKEY);
1019
1020 state->btps_buf = nbuf;
1021 state->btps_blkno = nblkno;
1022 state->btps_lastoff = last_off;
1023}

References _bt_blnewpage(), _bt_blwritepage(), _bt_buildadd(), _bt_check_third_page(), _bt_pagestate(), _bt_sortaddtup(), _bt_truncate(), Assert, BTMaxItemSize, BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), elog, ERROR, fb(), IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), ItemIdGetLength, ItemIdSetUnused, MAXALIGN, OffsetNumberNext, OffsetNumberPrev, P_FIRSTKEY, P_HIKEY, P_LEFTMOST, P_NONE, PageGetFreeSpace(), PageGetItem(), PageGetItemId(), PageIndexTupleOverwrite(), palloc0_object, pfree(), and unlikely.

Referenced by _bt_buildadd(), _bt_load(), _bt_sort_dedup_finish_pending(), and _bt_uppershutdown().

◆ _bt_end_parallel()

static void _bt_end_parallel ( BTLeader btleader)
static

Definition at line 1611 of file nbtsort.c.

1612{
1613 int i;
1614
1615 /* Shutdown worker processes */
1617
1618 /*
1619 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1620 * or we might get incomplete data.)
1621 */
1622 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1623 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1624
1625 /* Free last reference to MVCC snapshot, if one was used */
1626 if (IsMVCCSnapshot(btleader->snapshot))
1627 UnregisterSnapshot(btleader->snapshot);
1628 DestroyParallelContext(btleader->pcxt);
1630}

References BTLeader::bufferusage, DestroyParallelContext(), ExitParallelMode(), i, InstrAccumParallelQuery(), IsMVCCSnapshot, ParallelContext::nworkers_launched, BTLeader::pcxt, BTLeader::snapshot, UnregisterSnapshot(), WaitForParallelWorkersToFinish(), and BTLeader::walusage.

Referenced by _bt_begin_parallel(), and btbuild().

◆ _bt_leader_participate_as_worker()

static void _bt_leader_participate_as_worker ( BTBuildState buildstate)
static

Definition at line 1691 of file nbtsort.c.

1692{
1693 BTLeader *btleader = buildstate->btleader;
1696 int sortmem;
1697
1698 /* Allocate memory and initialize private spool */
1700 leaderworker->heap = buildstate->spool->heap;
1701 leaderworker->index = buildstate->spool->index;
1702 leaderworker->isunique = buildstate->spool->isunique;
1703 leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1704
1705 /* Initialize second spool, if required */
1706 if (!btleader->btshared->isunique)
1708 else
1709 {
1710 /* Allocate memory for worker's own private secondary spool */
1712
1713 /* Initialize worker's own secondary spool */
1714 leaderworker2->heap = leaderworker->heap;
1715 leaderworker2->index = leaderworker->index;
1716 leaderworker2->isunique = false;
1717 }
1718
1719 /*
1720 * Might as well use reliable figure when doling out maintenance_work_mem
1721 * (when requested number of workers were not launched, this will be
1722 * somewhat higher than it is for other workers).
1723 */
1725
1726 /* Perform work common to all participants */
1728 btleader->sharedsort, btleader->sharedsort2,
1729 sortmem, true);
1730
1731#ifdef BTREE_BUILD_STATS
1733 {
1734 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1735 ResetUsage();
1736 }
1737#endif /* BTREE_BUILD_STATS */
1738}

References _bt_parallel_scan_and_sort(), BTLeader::btshared, fb(), BTShared::isunique, log_btree_build_stats, maintenance_work_mem, BTLeader::nparticipanttuplesorts, palloc0_object, ResetUsage(), BTLeader::sharedsort, BTLeader::sharedsort2, and ShowUsage().

Referenced by _bt_begin_parallel().

◆ _bt_leafbuild()

static void _bt_leafbuild ( BTSpool btspool,
BTSpool btspool2 
)
static

Definition at line 542 of file nbtsort.c.

543{
545
546#ifdef BTREE_BUILD_STATS
548 {
549 ShowUsage("BTREE BUILD (Spool) STATISTICS");
550 ResetUsage();
551 }
552#endif /* BTREE_BUILD_STATS */
553
554 /* Execute the sort */
557 tuplesort_performsort(btspool->sortstate);
558 if (btspool2)
559 {
563 }
564
565 wstate.heap = btspool->heap;
566 wstate.index = btspool->index;
567 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
568 /* _bt_mkscankey() won't set allequalimage without metapage */
569 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
570
571 /* reserve the metapage */
572 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
573
577}

References _bt_allequalimage(), _bt_load(), _bt_mkscankey(), BTREE_METAPAGE, fb(), log_btree_build_stats, pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, ResetUsage(), ShowUsage(), and tuplesort_performsort().

Referenced by btbuild().

◆ _bt_load()

static void _bt_load ( BTWriteState wstate,
BTSpool btspool,
BTSpool btspool2 
)
static

Definition at line 1139 of file nbtsort.c.

1140{
1142 bool merge = (btspool2 != NULL);
1143 IndexTuple itup,
1144 itup2 = NULL;
1145 bool load1;
1147 int i,
1149 SortSupport sortKeys;
1150 int64 tuples_done = 0;
1151 bool deduplicate;
1152
1153 wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1154
1155 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1157
1158 if (merge)
1159 {
1160 /*
1161 * Another BTSpool for dead tuples exists. Now we have to merge
1162 * btspool and btspool2.
1163 */
1164
1165 /* the preparation of merge */
1166 itup = tuplesort_getindextuple(btspool->sortstate, true);
1167 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1168
1169 /* Prepare SortSupport data for each column */
1170 sortKeys = palloc0_array(SortSupportData, keysz);
1171
1172 for (i = 0; i < keysz; i++)
1173 {
1174 SortSupport sortKey = sortKeys + i;
1175 ScanKey scanKey = wstate->inskey->scankeys + i;
1176 bool reverse;
1177
1178 sortKey->ssup_cxt = CurrentMemoryContext;
1179 sortKey->ssup_collation = scanKey->sk_collation;
1180 sortKey->ssup_nulls_first =
1181 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1182 sortKey->ssup_attno = scanKey->sk_attno;
1183 /* Abbreviation is not supported here */
1184 sortKey->abbreviate = false;
1185
1186 Assert(sortKey->ssup_attno != 0);
1187
1188 reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1189
1191 }
1192
1193 for (;;)
1194 {
1195 load1 = true; /* load BTSpool next ? */
1196 if (itup2 == NULL)
1197 {
1198 if (itup == NULL)
1199 break;
1200 }
1201 else if (itup != NULL)
1202 {
1203 int32 compare = 0;
1204
1205 for (i = 1; i <= keysz; i++)
1206 {
1207 SortSupport entry;
1209 attrDatum2;
1210 bool isNull1,
1211 isNull2;
1212
1213 entry = sortKeys + i - 1;
1216
1219 entry);
1220 if (compare > 0)
1221 {
1222 load1 = false;
1223 break;
1224 }
1225 else if (compare < 0)
1226 break;
1227 }
1228
1229 /*
1230 * If key values are equal, we sort on ItemPointer. This is
1231 * required for btree indexes, since heap TID is treated as an
1232 * implicit last key attribute in order to ensure that all
1233 * keys in the index are physically unique.
1234 */
1235 if (compare == 0)
1236 {
1237 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1238 Assert(compare != 0);
1239 if (compare > 0)
1240 load1 = false;
1241 }
1242 }
1243 else
1244 load1 = false;
1245
1246 /* When we see first tuple, create first index page */
1247 if (state == NULL)
1249
1250 if (load1)
1251 {
1252 _bt_buildadd(wstate, state, itup, 0);
1253 itup = tuplesort_getindextuple(btspool->sortstate, true);
1254 }
1255 else
1256 {
1258 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1259 }
1260
1261 /* Report progress */
1263 ++tuples_done);
1264 }
1265 pfree(sortKeys);
1266 }
1267 else if (deduplicate)
1268 {
1269 /* merge is unnecessary, deduplicate into posting lists */
1271
1273 dstate->deduplicate = true; /* unused */
1274 dstate->nmaxitems = 0; /* unused */
1275 dstate->maxpostingsize = 0; /* set later */
1276 /* Metadata about base tuple of current pending posting list */
1277 dstate->base = NULL;
1278 dstate->baseoff = InvalidOffsetNumber; /* unused */
1279 dstate->basetupsize = 0;
1280 /* Metadata about current pending posting list TIDs */
1281 dstate->htids = NULL;
1282 dstate->nhtids = 0;
1283 dstate->nitems = 0;
1284 dstate->phystupsize = 0; /* unused */
1285 dstate->nintervals = 0; /* unused */
1286
1287 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1288 true)) != NULL)
1289 {
1290 /* When we see first tuple, create first index page */
1291 if (state == NULL)
1292 {
1294
1295 /*
1296 * Limit size of posting list tuples to 1/10 space we want to
1297 * leave behind on the page, plus space for final item's line
1298 * pointer. This is equal to the space that we'd like to
1299 * leave behind on each leaf page when fillfactor is 90,
1300 * allowing us to get close to fillfactor% space utilization
1301 * when there happen to be a great many duplicates. (This
1302 * makes higher leaf fillfactor settings ineffective when
1303 * building indexes that have many duplicates, but packing
1304 * leaf pages full with few very large tuples doesn't seem
1305 * like a useful goal.)
1306 */
1307 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1308 sizeof(ItemIdData);
1309 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1310 dstate->maxpostingsize <= INDEX_SIZE_MASK);
1311 dstate->htids = palloc(dstate->maxpostingsize);
1312
1313 /* start new pending posting list with itup copy */
1316 }
1317 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1318 itup) > keysz &&
1320 {
1321 /*
1322 * Tuple is equal to base tuple of pending posting list. Heap
1323 * TID from itup has been saved in state.
1324 */
1325 }
1326 else
1327 {
1328 /*
1329 * Tuple is not equal to pending posting list tuple, or
1330 * _bt_dedup_save_htid() opted to not merge current item into
1331 * pending posting list.
1332 */
1334 pfree(dstate->base);
1335
1336 /* start new pending posting list with itup copy */
1339 }
1340
1341 /* Report progress */
1343 ++tuples_done);
1344 }
1345
1346 if (state)
1347 {
1348 /*
1349 * Handle the last item (there must be a last item when the
1350 * tuplesort returned one or more tuples)
1351 */
1353 pfree(dstate->base);
1354 pfree(dstate->htids);
1355 }
1356
1357 pfree(dstate);
1358 }
1359 else
1360 {
1361 /* merging and deduplication are both unnecessary */
1362 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1363 true)) != NULL)
1364 {
1365 /* When we see first tuple, create first index page */
1366 if (state == NULL)
1368
1369 _bt_buildadd(wstate, state, itup, 0);
1370
1371 /* Report progress */
1373 ++tuples_done);
1374 }
1375 }
1376
1377 /* Close down final pages and write the metapage */
1379 smgr_bulk_finish(wstate->bulkstate);
1380}

References _bt_buildadd(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_keep_natts_fast(), _bt_pagestate(), _bt_sort_dedup_finish_pending(), _bt_uppershutdown(), ApplySortComparator(), Assert, BTGetDeduplicateItems, BTMaxItemSize, compare(), CopyIndexTuple(), CurrentMemoryContext, fb(), i, index_getattr(), INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemPointerCompare(), MAIN_FORKNUM, MAXALIGN_DOWN, merge(), palloc(), palloc0_array, palloc_object, pfree(), pgstat_progress_update_param(), PrepareSortSupportFromIndexRel(), PROGRESS_CREATEIDX_TUPLES_DONE, RelationGetDescr, SK_BT_DESC, SK_BT_NULLS_FIRST, smgr_bulk_finish(), smgr_bulk_start_rel(), IndexTupleData::t_tid, and tuplesort_getindextuple().

Referenced by _bt_leafbuild().

◆ _bt_pagestate()

static BTPageState * _bt_pagestate ( BTWriteState wstate,
uint32  level 
)
static

Definition at line 652 of file nbtsort.c.

653{
655
656 /* create initial page for level */
657 state->btps_buf = _bt_blnewpage(wstate, level);
658
659 /* and assign it a page position */
660 state->btps_blkno = wstate->btws_pages_alloced++;
661
662 state->btps_lowkey = NULL;
663 /* initialize lastoff so first item goes into P_FIRSTKEY */
664 state->btps_lastoff = P_HIKEY;
665 state->btps_lastextra = 0;
666 state->btps_level = level;
667 /* set "full" threshold based on level. See notes at head of file. */
668 if (level > 0)
669 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
670 else
671 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
672
673 /* no parent level, yet */
674 state->btps_next = NULL;
675
676 return state;
677}

References _bt_blnewpage(), BTGetTargetPageFreeSpace, BTREE_NONLEAF_FILLFACTOR, fb(), P_HIKEY, and palloc0_object.

Referenced by _bt_buildadd(), and _bt_load().

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)

Definition at line 1744 of file nbtsort.c.

1745{
1746 char *sharedquery;
1749 BTShared *btshared;
1750 Sharedsort *sharedsort;
1751 Sharedsort *sharedsort2;
1752 Relation heapRel;
1753 Relation indexRel;
1756 WalUsage *walusage;
1757 BufferUsage *bufferusage;
1758 int sortmem;
1759
1760#ifdef BTREE_BUILD_STATS
1762 ResetUsage();
1763#endif /* BTREE_BUILD_STATS */
1764
1765 /*
1766 * The only possible status flag that can be set to the parallel worker is
1767 * PROC_IN_SAFE_IC.
1768 */
1769 Assert((MyProc->statusFlags == 0) ||
1771
1772 /* Set debug_query_string for individual workers first */
1775
1776 /* Report the query string from leader */
1778
1779 /* Look up nbtree shared state */
1780 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1781
1782 /* Open relations using lock modes known to be obtained by index.c */
1783 if (!btshared->isconcurrent)
1784 {
1787 }
1788 else
1789 {
1792 }
1793
1794 /* Track query ID */
1795 pgstat_report_query_id(btshared->queryid, false);
1796
1797 /* Open relations within worker */
1798 heapRel = table_open(btshared->heaprelid, heapLockmode);
1799 indexRel = index_open(btshared->indexrelid, indexLockmode);
1800
1801 /* Initialize worker's own spool */
1803 btspool->heap = heapRel;
1804 btspool->index = indexRel;
1805 btspool->isunique = btshared->isunique;
1806 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1807
1808 /* Look up shared state private to tuplesort.c */
1809 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1810 tuplesort_attach_shared(sharedsort, seg);
1811 if (!btshared->isunique)
1812 {
1813 btspool2 = NULL;
1814 sharedsort2 = NULL;
1815 }
1816 else
1817 {
1818 /* Allocate memory for worker's own private secondary spool */
1820
1821 /* Initialize worker's own secondary spool */
1822 btspool2->heap = btspool->heap;
1823 btspool2->index = btspool->index;
1824 btspool2->isunique = false;
1825 /* Look up shared state private to tuplesort.c */
1826 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1827 tuplesort_attach_shared(sharedsort2, seg);
1828 }
1829
1830 /* Prepare to track buffer usage during parallel execution */
1832
1833 /* Perform sorting of spool, and possibly a spool2 */
1835 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1836 sharedsort2, sortmem, false);
1837
1838 /* Report WAL/buffer usage during parallel execution */
1839 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1840 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1842 &walusage[ParallelWorkerNumber]);
1843
1844#ifdef BTREE_BUILD_STATS
1846 {
1847 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1848 ResetUsage();
1849 }
1850#endif /* BTREE_BUILD_STATS */
1851
1852 index_close(indexRel, indexLockmode);
1853 table_close(heapRel, heapLockmode);
1854}

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert, debug_query_string, fb(), BTShared::heaprelid, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTShared::nulls_not_distinct, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_estimate_shared()

static Size _bt_parallel_estimate_shared ( Relation  heap,
Snapshot  snapshot 
)
static

Definition at line 1637 of file nbtsort.c.

1638{
1639 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1640 return add_size(BUFFERALIGN(sizeof(BTShared)),
1641 table_parallelscan_estimate(heap, snapshot));
1642}

References add_size(), BUFFERALIGN, and table_parallelscan_estimate().

Referenced by _bt_begin_parallel().

◆ _bt_parallel_heapscan()

static double _bt_parallel_heapscan ( BTBuildState buildstate,
bool brokenhotchain 
)
static

Definition at line 1657 of file nbtsort.c.

1658{
1659 BTShared *btshared = buildstate->btleader->btshared;
1660 int nparticipanttuplesorts;
1661 double reltuples;
1662
1663 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1664 for (;;)
1665 {
1666 SpinLockAcquire(&btshared->mutex);
1667 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1668 {
1669 buildstate->havedead = btshared->havedead;
1670 buildstate->indtuples = btshared->indtuples;
1671 *brokenhotchain = btshared->brokenhotchain;
1672 reltuples = btshared->reltuples;
1673 SpinLockRelease(&btshared->mutex);
1674 break;
1675 }
1676 SpinLockRelease(&btshared->mutex);
1677
1680 }
1681
1683
1684 return reltuples;
1685}

References BTShared::brokenhotchain, ConditionVariableCancelSleep(), ConditionVariableSleep(), fb(), BTShared::havedead, BTShared::indtuples, BTShared::mutex, BTShared::nparticipantsdone, BTShared::reltuples, SpinLockAcquire(), SpinLockRelease(), and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().

◆ _bt_parallel_scan_and_sort()

static void _bt_parallel_scan_and_sort ( BTSpool btspool,
BTSpool btspool2,
BTShared btshared,
Sharedsort sharedsort,
Sharedsort sharedsort2,
int  sortmem,
bool  progress 
)
static

Definition at line 1869 of file nbtsort.c.

1872{
1875 TableScanDesc scan;
1876 double reltuples;
1877 IndexInfo *indexInfo;
1878
1879 /* Initialize local tuplesort coordination state */
1881 coordinate->isWorker = true;
1882 coordinate->nParticipants = -1;
1883 coordinate->sharedsort = sharedsort;
1884
1885 /* Begin "partial" tuplesort */
1886 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1887 btspool->index,
1888 btspool->isunique,
1889 btspool->nulls_not_distinct,
1892
1893 /*
1894 * Just as with serial case, there may be a second spool. If so, a
1895 * second, dedicated spool2 partial tuplesort is required.
1896 */
1897 if (btspool2)
1898 {
1900
1901 /*
1902 * We expect that the second one (for dead tuples) won't get very
1903 * full, so we give it only work_mem (unless sortmem is less for
1904 * worker). Worker processes are generally permitted to allocate
1905 * work_mem independently.
1906 */
1908 coordinate2->isWorker = true;
1909 coordinate2->nParticipants = -1;
1910 coordinate2->sharedsort = sharedsort2;
1911 btspool2->sortstate =
1912 tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1914 false);
1915 }
1916
1917 /* Fill in buildstate for _bt_build_callback() */
1918 buildstate.isunique = btshared->isunique;
1919 buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1920 buildstate.havedead = false;
1921 buildstate.heap = btspool->heap;
1922 buildstate.spool = btspool;
1923 buildstate.spool2 = btspool2;
1924 buildstate.indtuples = 0;
1925 buildstate.btleader = NULL;
1926
1927 /* Join parallel scan */
1928 indexInfo = BuildIndexInfo(btspool->index);
1929 indexInfo->ii_Concurrent = btshared->isconcurrent;
1930 scan = table_beginscan_parallel(btspool->heap,
1932 reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1934 &buildstate, scan);
1935
1936 /* Execute this worker's part of the sort */
1937 if (progress)
1940 tuplesort_performsort(btspool->sortstate);
1941 if (btspool2)
1942 {
1943 if (progress)
1946 tuplesort_performsort(btspool2->sortstate);
1947 }
1948
1949 /*
1950 * Done. Record ambuild statistics, and whether we encountered a broken
1951 * HOT chain.
1952 */
1953 SpinLockAcquire(&btshared->mutex);
1954 btshared->nparticipantsdone++;
1955 btshared->reltuples += reltuples;
1956 if (buildstate.havedead)
1957 btshared->havedead = true;
1958 btshared->indtuples += buildstate.indtuples;
1959 if (indexInfo->ii_BrokenHotChain)
1960 btshared->brokenhotchain = true;
1961 SpinLockRelease(&btshared->mutex);
1962
1963 /* Notify leader */
1965
1966 /* We can end tuplesorts immediately */
1967 tuplesort_end(btspool->sortstate);
1968 if (btspool2)
1969 tuplesort_end(btspool2->sortstate);
1970}

References _bt_build_callback(), BTShared::brokenhotchain, BuildIndexInfo(), ConditionVariableSignal(), fb(), BTShared::havedead, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, BTShared::indtuples, BTShared::isconcurrent, BTShared::isunique, Min, BTShared::mutex, BTShared::nparticipantsdone, BTShared::nulls_not_distinct, palloc0_object, ParallelTableScanFromBTShared, pgstat_progress_update_param(), progress, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, BTShared::reltuples, SpinLockAcquire(), SpinLockRelease(), table_beginscan_parallel(), table_index_build_scan(), tuplesort_begin_index_btree(), tuplesort_end(), TUPLESORT_NONE, tuplesort_performsort(), work_mem, and BTShared::workersdonecv.

Referenced by _bt_leader_participate_as_worker(), and _bt_parallel_build_main().

◆ _bt_slideleft()

static void _bt_slideleft ( Page  rightmostpage)
static

Definition at line 689 of file nbtsort.c.

690{
691 OffsetNumber off;
692 OffsetNumber maxoff;
694
696 Assert(maxoff >= P_FIRSTKEY);
698 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
699 {
701
702 *previi = *thisii;
703 previi = thisii;
704 }
705 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
706}

References Assert, fb(), OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_uppershutdown().

◆ _bt_sort_dedup_finish_pending()

static void _bt_sort_dedup_finish_pending ( BTWriteState wstate,
BTPageState state,
BTDedupState  dstate 
)
static

Definition at line 1033 of file nbtsort.c.

1035{
1036 Assert(dstate->nitems > 0);
1037
1038 if (dstate->nitems == 1)
1039 _bt_buildadd(wstate, state, dstate->base, 0);
1040 else
1041 {
1044
1045 /* form a tuple with a posting list */
1047 dstate->htids,
1048 dstate->nhtids);
1049 /* Calculate posting list overhead */
1052
1055 }
1056
1057 dstate->nmaxitems = 0;
1058 dstate->nhtids = 0;
1059 dstate->nitems = 0;
1060 dstate->phystupsize = 0;
1061}

References _bt_buildadd(), _bt_form_posting(), Assert, BTreeTupleGetPostingOffset(), fb(), IndexTupleSize(), and pfree().

Referenced by _bt_load().

◆ _bt_sortaddtup()

static void _bt_sortaddtup ( Page  page,
Size  itemsize,
const IndexTupleData itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
static

Definition at line 720 of file nbtsort.c.

725{
727
729 {
730 trunctuple = *itup;
732 BTreeTupleSetNAtts(&trunctuple, 0, false);
733 itup = &trunctuple;
734 itemsize = sizeof(IndexTupleData);
735 }
736
737 if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
738 elog(ERROR, "failed to add item to the index page");
739}

References BTreeTupleSetNAtts(), elog, ERROR, fb(), InvalidOffsetNumber, PageAddItem, and IndexTupleData::t_info.

Referenced by _bt_buildadd().

◆ _bt_spool()

static void _bt_spool ( BTSpool btspool,
const ItemPointerData self,
const Datum values,
const bool isnull 
)
static

Definition at line 531 of file nbtsort.c.

532{
534 self, values, isnull);
535}

References fb(), tuplesort_putindextuplevalues(), and values.

Referenced by _bt_build_callback().

◆ _bt_spooldestroy()

static void _bt_spooldestroy ( BTSpool btspool)
static

Definition at line 521 of file nbtsort.c.

522{
523 tuplesort_end(btspool->sortstate);
524 pfree(btspool);
525}

References fb(), pfree(), and tuplesort_end().

Referenced by _bt_spools_heapscan(), and btbuild().

◆ _bt_spools_heapscan()

static double _bt_spools_heapscan ( Relation  heap,
Relation  index,
BTBuildState buildstate,
IndexInfo indexInfo 
)
static

Definition at line 369 of file nbtsort.c.

371{
374 double reltuples = 0;
375
376 /*
377 * We size the sort area as maintenance_work_mem rather than work_mem to
378 * speed index creation. This should be OK since a single backend can't
379 * run multiple index creations in parallel (see also: notes on
380 * parallelism and maintenance_work_mem below).
381 */
382 btspool->heap = heap;
383 btspool->index = index;
384 btspool->isunique = indexInfo->ii_Unique;
385 btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
386
387 /* Save as primary spool */
388 buildstate->spool = btspool;
389
390 /* Report table scan phase started */
393
394 /* Attempt to launch parallel worker scan when required */
395 if (indexInfo->ii_ParallelWorkers > 0)
397 indexInfo->ii_ParallelWorkers);
398
399 /*
400 * If parallel build requested and at least one worker process was
401 * successfully launched, set up coordination state
402 */
403 if (buildstate->btleader)
404 {
406 coordinate->isWorker = false;
407 coordinate->nParticipants =
408 buildstate->btleader->nparticipanttuplesorts;
409 coordinate->sharedsort = buildstate->btleader->sharedsort;
410 }
411
412 /*
413 * Begin serial/leader tuplesort.
414 *
415 * In cases where parallelism is involved, the leader receives the same
416 * share of maintenance_work_mem as a serial sort (it is generally treated
417 * in the same way as a serial sort once we return). Parallel worker
418 * Tuplesortstates will have received only a fraction of
419 * maintenance_work_mem, though.
420 *
421 * We rely on the lifetime of the Leader Tuplesortstate almost not
422 * overlapping with any worker Tuplesortstate's lifetime. There may be
423 * some small overlap, but that's okay because we rely on leader
424 * Tuplesortstate only allocating a small, fixed amount of memory here.
425 * When its tuplesort_performsort() is called (by our caller), and
426 * significant amounts of memory are likely to be used, all workers must
427 * have already freed almost all memory held by their Tuplesortstates
428 * (they are about to go away completely, too). The overall effect is
429 * that maintenance_work_mem always represents an absolute high watermark
430 * on the amount of memory used by a CREATE INDEX operation, regardless of
431 * the use of parallelism or any other factor.
432 */
433 buildstate->spool->sortstate =
435 buildstate->nulls_not_distinct,
438
439 /*
440 * If building a unique index, put dead tuples in a second spool to keep
441 * them out of the uniqueness check. We expect that the second spool (for
442 * dead tuples) won't get very full, so we give it only work_mem.
443 */
444 if (indexInfo->ii_Unique)
445 {
448
449 /* Initialize secondary spool */
450 btspool2->heap = heap;
451 btspool2->index = index;
452 btspool2->isunique = false;
453 /* Save as secondary spool */
454 buildstate->spool2 = btspool2;
455
456 if (buildstate->btleader)
457 {
458 /*
459 * Set up non-private state that is passed to
460 * tuplesort_begin_index_btree() about the basic high level
461 * coordination of a parallel sort.
462 */
464 coordinate2->isWorker = false;
465 coordinate2->nParticipants =
466 buildstate->btleader->nparticipanttuplesorts;
467 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
468 }
469
470 /*
471 * We expect that the second one (for dead tuples) won't get very
472 * full, so we give it only work_mem
473 */
474 buildstate->spool2->sortstate =
475 tuplesort_begin_index_btree(heap, index, false, false, work_mem,
477 }
478
479 /* Fill spool using either serial or parallel heap scan */
480 if (!buildstate->btleader)
481 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
483 NULL);
484 else
486 &indexInfo->ii_BrokenHotChain);
487
488 /*
489 * Set the progress target for the next phase. Reset the block number
490 * values set by table_index_build_scan
491 */
492 {
493 const int progress_index[] = {
497 };
498 const int64 progress_vals[] = {
499 buildstate->indtuples,
500 0, 0
501 };
502
504 }
505
506 /* okay, all heap tuples are spooled */
507 if (buildstate->spool2 && !buildstate->havedead)
508 {
509 /* spool2 turns out to be unnecessary */
511 buildstate->spool2 = NULL;
512 }
513
514 return reltuples;
515}

References _bt_begin_parallel(), _bt_build_callback(), _bt_parallel_heapscan(), _bt_spooldestroy(), fb(), IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_ParallelWorkers, IndexInfo::ii_Unique, maintenance_work_mem, palloc0_object, pgstat_progress_update_multi_param(), pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_CREATEIDX_TUPLES_TOTAL, PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, table_index_build_scan(), tuplesort_begin_index_btree(), TUPLESORT_NONE, and work_mem.

Referenced by btbuild().

◆ _bt_uppershutdown()

static void _bt_uppershutdown ( BTWriteState wstate,
BTPageState state 
)
static

Definition at line 1067 of file nbtsort.c.

1068{
1069 BTPageState *s;
1071 uint32 rootlevel = 0;
1073
1074 /*
1075 * Each iteration of this loop completes one more level of the tree.
1076 */
1077 for (s = state; s != NULL; s = s->btps_next)
1078 {
1079 BlockNumber blkno;
1080 BTPageOpaque opaque;
1081
1082 blkno = s->btps_blkno;
1083 opaque = BTPageGetOpaque((Page) s->btps_buf);
1084
1085 /*
1086 * We have to link the last page on this level to somewhere.
1087 *
1088 * If we're at the top, it's the root, so attach it to the metapage.
1089 * Otherwise, add an entry for it to its parent using its low key.
1090 * This may cause the last page of the parent level to split, but
1091 * that's not a problem -- we haven't gotten to it yet.
1092 */
1093 if (s->btps_next == NULL)
1094 {
1095 opaque->btpo_flags |= BTP_ROOT;
1096 rootblkno = blkno;
1097 rootlevel = s->btps_level;
1098 }
1099 else
1100 {
1103 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1104 P_LEFTMOST(opaque));
1105 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1106 !P_LEFTMOST(opaque));
1109 pfree(s->btps_lowkey);
1110 s->btps_lowkey = NULL;
1111 }
1112
1113 /*
1114 * This is the rightmost page, so the ItemId array needs to be slid
1115 * back one slot. Then we can dump out the page.
1116 */
1119 s->btps_buf = NULL; /* writepage took ownership of the buffer */
1120 }
1121
1122 /*
1123 * As the last step in the process, construct the metapage and make it
1124 * point to the new root (unless we had no data at all, in which case it's
1125 * set to point to "P_NONE"). This changes the index to the "valid" state
1126 * by filling in a valid magic number in the metapage.
1127 */
1128 metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1129 _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1130 wstate->inskey->allequalimage);
1132}

References _bt_blwritepage(), _bt_buildadd(), _bt_initmetapage(), _bt_slideleft(), Assert, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageState::btps_blkno, BTPageState::btps_buf, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTREE_METAPAGE, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), fb(), IndexRelationGetNumberOfKeyAttributes, P_LEFTMOST, P_NONE, pfree(), and smgr_bulk_get_buf().

Referenced by _bt_load().

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
IndexInfo indexInfo 
)

Definition at line 299 of file nbtsort.c.

300{
301 IndexBuildResult *result;
303 double reltuples;
304
305#ifdef BTREE_BUILD_STATS
307 ResetUsage();
308#endif /* BTREE_BUILD_STATS */
309
310 buildstate.isunique = indexInfo->ii_Unique;
311 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
312 buildstate.havedead = false;
313 buildstate.heap = heap;
314 buildstate.spool = NULL;
315 buildstate.spool2 = NULL;
316 buildstate.indtuples = 0;
317 buildstate.btleader = NULL;
318
319 /*
320 * We expect to be called exactly once for any index relation. If that's
321 * not the case, big trouble's what we have.
322 */
324 elog(ERROR, "index \"%s\" already contains data",
326
327 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
328
329 /*
330 * Finish the build by (1) completing the sort of the spool file, (2)
331 * inserting the sorted tuples into btree pages and (3) building the upper
332 * levels. Finally, it may also be necessary to end use of parallelism.
333 */
334 _bt_leafbuild(buildstate.spool, buildstate.spool2);
336 if (buildstate.spool2)
338 if (buildstate.btleader)
340
342
343 result->heap_tuples = reltuples;
344 result->index_tuples = buildstate.indtuples;
345
346#ifdef BTREE_BUILD_STATS
348 {
349 ShowUsage("BTREE BUILD STATS");
350 ResetUsage();
351 }
352#endif /* BTREE_BUILD_STATS */
353
354 return result;
355}

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), elog, ERROR, fb(), IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, log_btree_build_stats, palloc_object, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), and ShowUsage().

Referenced by bthandler().