PostgreSQL Source Code git master
nbtsort.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/sortsupport.h"
#include "utils/tuplesort.h"
Include dependency graph for nbtsort.c:

Go to the source code of this file.

Data Structures

struct  BTSpool
 
struct  BTShared
 
struct  BTLeader
 
struct  BTBuildState
 
struct  BTPageState
 
struct  BTWriteState
 

Macros

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)
 
#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)
 
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
 
#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)
 
#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)
 
#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)
 
#define ParallelTableScanFromBTShared(shared)    (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 

Typedefs

typedef struct BTSpool BTSpool
 
typedef struct BTShared BTShared
 
typedef struct BTLeader BTLeader
 
typedef struct BTBuildState BTBuildState
 
typedef struct BTPageState BTPageState
 
typedef struct BTWriteState BTWriteState
 

Functions

static double _bt_spools_heapscan (Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
 
static void _bt_spooldestroy (BTSpool *btspool)
 
static void _bt_spool (BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
 
static void _bt_leafbuild (BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_build_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
 
static BulkWriteBuffer _bt_blnewpage (BTWriteState *wstate, uint32 level)
 
static BTPageState * _bt_pagestate (BTWriteState *wstate, uint32 level)
 
static void _bt_slideleft (Page rightmostpage)
 
static void _bt_sortaddtup (Page page, Size itemsize, const IndexTupleData *itup, OffsetNumber itup_off, bool newfirstdataitem)
 
static void _bt_buildadd (BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
 
static void _bt_sort_dedup_finish_pending (BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
 
static void _bt_uppershutdown (BTWriteState *wstate, BTPageState *state)
 
static void _bt_load (BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
static void _bt_begin_parallel (BTBuildState *buildstate, bool isconcurrent, int request)
 
static void _bt_end_parallel (BTLeader *btleader)
 
static Size _bt_parallel_estimate_shared (Relation heap, Snapshot snapshot)
 
static double _bt_parallel_heapscan (BTBuildState *buildstate, bool *brokenhotchain)
 
static void _bt_leader_participate_as_worker (BTBuildState *buildstate)
 
static void _bt_parallel_scan_and_sort (BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
 
IndexBuildResult * btbuild (Relation heap, Relation index, IndexInfo *indexInfo)
 
static void _bt_blwritepage (BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ PARALLEL_KEY_BTREE_SHARED

#define PARALLEL_KEY_BTREE_SHARED   UINT64CONST(0xA000000000000001)

Definition at line 62 of file nbtsort.c.

◆ PARALLEL_KEY_BUFFER_USAGE

#define PARALLEL_KEY_BUFFER_USAGE   UINT64CONST(0xA000000000000006)

Definition at line 67 of file nbtsort.c.

◆ PARALLEL_KEY_QUERY_TEXT

#define PARALLEL_KEY_QUERY_TEXT   UINT64CONST(0xA000000000000004)

Definition at line 65 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT

#define PARALLEL_KEY_TUPLESORT   UINT64CONST(0xA000000000000002)

Definition at line 63 of file nbtsort.c.

◆ PARALLEL_KEY_TUPLESORT_SPOOL2

#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)

Definition at line 64 of file nbtsort.c.

◆ PARALLEL_KEY_WAL_USAGE

#define PARALLEL_KEY_WAL_USAGE   UINT64CONST(0xA000000000000005)

Definition at line 66 of file nbtsort.c.

◆ ParallelTableScanFromBTShared

#define ParallelTableScanFromBTShared (   shared)     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))

Definition at line 162 of file nbtsort.c.
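
This macro locates the ParallelTableScanDesc that the parallel build lays out immediately after the fixed-size BTShared struct in the same shared-memory chunk, by rounding the struct size up to buffer alignment. The standalone sketch below (plain C, not PostgreSQL code; the 64-byte alignment value and the FixedHeader/ALIGN_UP names are illustrative assumptions) shows the same offset arithmetic for a fixed header followed by a variable-size area in one allocation:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins; the real values come from c.h and nbtsort.c */
#define MY_BUFFER_ALIGN 64
#define ALIGN_UP(len) (((len) + (MY_BUFFER_ALIGN - 1)) & ~((size_t) (MY_BUFFER_ALIGN - 1)))

typedef struct FixedHeader      /* plays the role of BTShared */
{
    int     nparticipantsdone;
    double  reltuples;
} FixedHeader;

int
main(void)
{
    /* One allocation: fixed header, then a variable-size scan area */
    size_t  scan_size = 256;    /* stands in for table_parallelscan_estimate() */
    size_t  total = ALIGN_UP(sizeof(FixedHeader)) + scan_size;
    char   *chunk = calloc(1, total);

    FixedHeader *hdr = (FixedHeader *) chunk;
    void        *scan_area = chunk + ALIGN_UP(sizeof(FixedHeader));

    printf("header at %p, scan area at offset %zu (%p)\n",
           (void *) hdr, ALIGN_UP(sizeof(FixedHeader)), scan_area);
    free(chunk);
    return 0;
}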

168{
169 /* parallel context itself */
170 ParallelContext *pcxt;
171
172 /*
173 * nparticipanttuplesorts is the exact number of worker processes
174 * successfully launched, plus one leader process if it participates as a
175 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
176 * participating as a worker).
177 */
178 int nparticipanttuplesorts;
179
180 /*
181 * Leader process convenience pointers to shared state (leader avoids TOC
182 * lookups).
183 *
184 * btshared is the shared state for entire build. sharedsort is the
185 * shared, tuplesort-managed state passed to each process tuplesort.
186 * sharedsort2 is the corresponding btspool2 shared state, used only when
187 * building unique indexes. snapshot is the snapshot used by the scan iff
188 * an MVCC snapshot is required.
189 */
190 BTShared *btshared;
191 Sharedsort *sharedsort;
192 Sharedsort *sharedsort2;
193 Snapshot snapshot;
194 WalUsage *walusage;
195 BufferUsage *bufferusage;
196} BTLeader;
197
198/*
199 * Working state for btbuild and its callback.
200 *
201 * When parallel CREATE INDEX is used, there is a BTBuildState for each
202 * participant.
203 */
204typedef struct BTBuildState
205{
206 bool isunique;
208 bool havedead;
210 BTSpool *spool;
211
212 /*
213 * spool2 is needed only when the index is a unique index. Dead tuples are
214 * put into spool2 instead of spool in order to avoid uniqueness check.
215 */
217 double indtuples;
218
219 /*
220 * btleader is only present when a parallel index build is performed, and
221 * only in the leader process. (Actually, only the leader has a
222 * BTBuildState. Workers have their own spool and spool2, though.)
223 */
226
227/*
228 * Status record for a btree page being built. We have one of these
229 * for each active tree level.
230 */
231typedef struct BTPageState
232{
233 BulkWriteBuffer btps_buf; /* workspace for page building */
234 BlockNumber btps_blkno; /* block # to write this page at */
235 IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
236 OffsetNumber btps_lastoff; /* last item offset loaded */
237 Size btps_lastextra; /* last item's extra posting list space */
238 uint32 btps_level; /* tree level (0 = leaf) */
239 Size btps_full; /* "full" if less than this much free space */
240 struct BTPageState *btps_next; /* link to parent level, if any */
242
243/*
244 * Overall status record for index writing phase.
245 */
246typedef struct BTWriteState
247{
251 BTScanInsert inskey; /* generic insertion scankey */
252 BlockNumber btws_pages_alloced; /* # pages allocated */
254
255
256static double _bt_spools_heapscan(Relation heap, Relation index,
257 BTBuildState *buildstate, IndexInfo *indexInfo);
258static void _bt_spooldestroy(BTSpool *btspool);
259static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
260 const Datum *values, const bool *isnull);
263 bool *isnull, bool tupleIsAlive, void *state);
266static void _bt_slideleft(Page rightmostpage);
267static void _bt_sortaddtup(Page page, Size itemsize,
269 bool newfirstdataitem);
276static void _bt_load(BTWriteState *wstate,
278static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
279 int request);
280static void _bt_end_parallel(BTLeader *btleader);
283 bool *brokenhotchain);
286 BTShared *btshared, Sharedsort *sharedsort,
287 Sharedsort *sharedsort2, int sortmem,
288 bool progress);
289
290
291/*
292 * btbuild() -- build a new btree index.
293 */
295btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
296{
297 IndexBuildResult *result;
299 double reltuples;
300
301#ifdef BTREE_BUILD_STATS
303 ResetUsage();
304#endif /* BTREE_BUILD_STATS */
305
306 buildstate.isunique = indexInfo->ii_Unique;
307 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
308 buildstate.havedead = false;
309 buildstate.heap = heap;
310 buildstate.spool = NULL;
311 buildstate.spool2 = NULL;
312 buildstate.indtuples = 0;
313 buildstate.btleader = NULL;
314
315 /*
316 * We expect to be called exactly once for any index relation. If that's
317 * not the case, big trouble's what we have.
318 */
320 elog(ERROR, "index \"%s\" already contains data",
322
323 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
324
325 /*
326 * Finish the build by (1) completing the sort of the spool file, (2)
327 * inserting the sorted tuples into btree pages and (3) building the upper
328 * levels. Finally, it may also be necessary to end use of parallelism.
329 */
330 _bt_leafbuild(buildstate.spool, buildstate.spool2);
332 if (buildstate.spool2)
334 if (buildstate.btleader)
336
338
339 result->heap_tuples = reltuples;
340 result->index_tuples = buildstate.indtuples;
341
342#ifdef BTREE_BUILD_STATS
344 {
345 ShowUsage("BTREE BUILD STATS");
346 ResetUsage();
347 }
348#endif /* BTREE_BUILD_STATS */
349
350 return result;
351}
352
353/*
354 * Create and initialize one or two spool structures, and save them in caller's
355 * buildstate argument. May also fill-in fields within indexInfo used by index
356 * builds.
357 *
358 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
359 * routine encapsulates all aspects of managing parallelism. Caller need only
360 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
361 *
362 * Returns the total number of heap tuples scanned.
363 */
364static double
366 IndexInfo *indexInfo)
367{
370 double reltuples = 0;
371
372 /*
373 * We size the sort area as maintenance_work_mem rather than work_mem to
374 * speed index creation. This should be OK since a single backend can't
375 * run multiple index creations in parallel (see also: notes on
376 * parallelism and maintenance_work_mem below).
377 */
378 btspool->heap = heap;
379 btspool->index = index;
380 btspool->isunique = indexInfo->ii_Unique;
381 btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
382
383 /* Save as primary spool */
384 buildstate->spool = btspool;
385
386 /* Report table scan phase started */
389
390 /* Attempt to launch parallel worker scan when required */
391 if (indexInfo->ii_ParallelWorkers > 0)
393 indexInfo->ii_ParallelWorkers);
394
395 /*
396 * If parallel build requested and at least one worker process was
397 * successfully launched, set up coordination state
398 */
399 if (buildstate->btleader)
400 {
402 coordinate->isWorker = false;
403 coordinate->nParticipants =
404 buildstate->btleader->nparticipanttuplesorts;
405 coordinate->sharedsort = buildstate->btleader->sharedsort;
406 }
407
408 /*
409 * Begin serial/leader tuplesort.
410 *
411 * In cases where parallelism is involved, the leader receives the same
412 * share of maintenance_work_mem as a serial sort (it is generally treated
413 * in the same way as a serial sort once we return). Parallel worker
414 * Tuplesortstates will have received only a fraction of
415 * maintenance_work_mem, though.
416 *
417 * We rely on the lifetime of the Leader Tuplesortstate almost not
418 * overlapping with any worker Tuplesortstate's lifetime. There may be
419 * some small overlap, but that's okay because we rely on leader
420 * Tuplesortstate only allocating a small, fixed amount of memory here.
421 * When its tuplesort_performsort() is called (by our caller), and
422 * significant amounts of memory are likely to be used, all workers must
423 * have already freed almost all memory held by their Tuplesortstates
424 * (they are about to go away completely, too). The overall effect is
425 * that maintenance_work_mem always represents an absolute high watermark
426 * on the amount of memory used by a CREATE INDEX operation, regardless of
427 * the use of parallelism or any other factor.
428 */
429 buildstate->spool->sortstate =
431 buildstate->nulls_not_distinct,
434
435 /*
436 * If building a unique index, put dead tuples in a second spool to keep
437 * them out of the uniqueness check. We expect that the second spool (for
438 * dead tuples) won't get very full, so we give it only work_mem.
439 */
440 if (indexInfo->ii_Unique)
441 {
444
445 /* Initialize secondary spool */
446 btspool2->heap = heap;
447 btspool2->index = index;
448 btspool2->isunique = false;
449 /* Save as secondary spool */
450 buildstate->spool2 = btspool2;
451
452 if (buildstate->btleader)
453 {
454 /*
455 * Set up non-private state that is passed to
456 * tuplesort_begin_index_btree() about the basic high level
457 * coordination of a parallel sort.
458 */
460 coordinate2->isWorker = false;
461 coordinate2->nParticipants =
462 buildstate->btleader->nparticipanttuplesorts;
463 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464 }
465
466 /*
467 * We expect that the second one (for dead tuples) won't get very
468 * full, so we give it only work_mem
469 */
470 buildstate->spool2->sortstate =
471 tuplesort_begin_index_btree(heap, index, false, false, work_mem,
473 }
474
475 /* Fill spool using either serial or parallel heap scan */
476 if (!buildstate->btleader)
477 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
479 NULL);
480 else
482 &indexInfo->ii_BrokenHotChain);
483
484 /*
485 * Set the progress target for the next phase. Reset the block number
486 * values set by table_index_build_scan
487 */
488 {
489 const int progress_index[] = {
493 };
494 const int64 progress_vals[] = {
495 buildstate->indtuples,
496 0, 0
497 };
498
500 }
501
502 /* okay, all heap tuples are spooled */
503 if (buildstate->spool2 && !buildstate->havedead)
504 {
505 /* spool2 turns out to be unnecessary */
507 buildstate->spool2 = NULL;
508 }
509
510 return reltuples;
511}
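
As the comments in this function note, the leader's own tuplesort is sized like a serial sort (the full maintenance_work_mem), each participant's partial sort later receives only a fraction of it, and the dead-tuple spool gets just work_mem. A minimal sketch of that sizing arithmetic with hypothetical GUC values (the variable names below are illustrative, not PostgreSQL APIs):

#include <stdio.h>

int
main(void)
{
    /* Hypothetical settings, in kilobytes (as the GUCs are expressed) */
    int maintenance_work_mem_kb = 65536;    /* 64MB */
    int work_mem_kb = 4096;                 /* 4MB */
    int nparticipanttuplesorts = 3;         /* e.g. 2 workers + leader-as-worker */

    /* Each participant's partial sort gets an equal slice of maintenance_work_mem */
    int per_participant_kb = maintenance_work_mem_kb / nparticipanttuplesorts;

    printf("leader (serial-style) sort:     %d kB\n", maintenance_work_mem_kb);
    printf("each participant partial sort:  %d kB\n", per_participant_kb);
    printf("spool2 (dead tuples) sort:      %d kB\n", work_mem_kb);
    return 0;
}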
512
513/*
514 * clean up a spool structure and its substructures.
515 */
516static void
518{
519 tuplesort_end(btspool->sortstate);
520 pfree(btspool);
521}
522
523/*
524 * spool an index entry into the sort file.
525 */
526static void
527_bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
528{
530 self, values, isnull);
531}
532
533/*
534 * given a spool loaded by successive calls to _bt_spool,
535 * create an entire btree.
536 */
537static void
539{
541
542#ifdef BTREE_BUILD_STATS
544 {
545 ShowUsage("BTREE BUILD (Spool) STATISTICS");
546 ResetUsage();
547 }
548#endif /* BTREE_BUILD_STATS */
549
550 /* Execute the sort */
553 tuplesort_performsort(btspool->sortstate);
554 if (btspool2)
555 {
559 }
560
561 wstate.heap = btspool->heap;
562 wstate.index = btspool->index;
563 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564 /* _bt_mkscankey() won't set allequalimage without metapage */
565 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566
567 /* reserve the metapage */
568 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
569
573}
574
575/*
576 * Per-tuple callback for table_index_build_scan
577 */
578static void
580 ItemPointer tid,
581 Datum *values,
582 bool *isnull,
583 bool tupleIsAlive,
584 void *state)
585{
587
588 /*
589 * insert the index tuple into the appropriate spool file for subsequent
590 * processing
591 */
592 if (tupleIsAlive || buildstate->spool2 == NULL)
593 _bt_spool(buildstate->spool, tid, values, isnull);
594 else
595 {
596 /* dead tuples are put into spool2 */
597 buildstate->havedead = true;
598 _bt_spool(buildstate->spool2, tid, values, isnull);
599 }
600
601 buildstate->indtuples += 1;
602}
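
The callback's only decision is which spool receives the tuple: live tuples (or every tuple when there is no spool2) go to the main spool, while dead tuples go to spool2 so they bypass the uniqueness check. A tiny standalone sketch of that routing rule (illustrative only; the real code calls _bt_spool() on tuplesort spools):

#include <stdbool.h>
#include <stdio.h>

static const char *
choose_spool(bool tuple_is_alive, bool have_spool2)
{
    if (tuple_is_alive || !have_spool2)
        return "spool (participates in uniqueness check)";
    return "spool2 (dead tuples, skips uniqueness check)";
}

int
main(void)
{
    printf("%s\n", choose_spool(true, true));
    printf("%s\n", choose_spool(false, true));
    printf("%s\n", choose_spool(false, false));
    return 0;
}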
603
604/*
605 * allocate workspace for a new, clean btree page, not linked to any siblings.
606 */
607static BulkWriteBuffer
609{
611 Page page;
612 BTPageOpaque opaque;
613
614 buf = smgr_bulk_get_buf(wstate->bulkstate);
615 page = (Page) buf;
616
617 /* Zero the page and set up standard page header info */
618 _bt_pageinit(page, BLCKSZ);
619
620 /* Initialize BT opaque state */
621 opaque = BTPageGetOpaque(page);
622 opaque->btpo_prev = opaque->btpo_next = P_NONE;
623 opaque->btpo_level = level;
624 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
625 opaque->btpo_cycleid = 0;
626
627 /* Make the P_HIKEY line pointer appear allocated */
628 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
629
630 return buf;
631}
632
633/*
634 * emit a completed btree page, and release the working storage.
635 */
636static void
638{
639 smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
640 /* smgr_bulk_write took ownership of 'buf' */
641}
642
643/*
644 * allocate and initialize a new BTPageState. the returned structure
645 * is suitable for immediate use by _bt_buildadd.
646 */
647static BTPageState *
649{
651
652 /* create initial page for level */
653 state->btps_buf = _bt_blnewpage(wstate, level);
654
655 /* and assign it a page position */
656 state->btps_blkno = wstate->btws_pages_alloced++;
657
658 state->btps_lowkey = NULL;
659 /* initialize lastoff so first item goes into P_FIRSTKEY */
660 state->btps_lastoff = P_HIKEY;
661 state->btps_lastextra = 0;
662 state->btps_level = level;
663 /* set "full" threshold based on level. See notes at head of file. */
664 if (level > 0)
665 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
666 else
667 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
668
669 /* no parent level, yet */
670 state->btps_next = NULL;
671
672 return state;
673}
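
btps_full is the free-space threshold below which a page is treated as "full": non-leaf pages use the fixed non-leaf fillfactor, leaf pages use the index's target free space (BTGetTargetPageFreeSpace). A standalone sketch of the arithmetic, assuming an 8 kB block, a default leaf fillfactor of 90, and a non-leaf fillfactor of 70 (these constants are assumptions for illustration):

#include <stdio.h>

#define MY_BLCKSZ 8192              /* assumes the default PostgreSQL block size */
#define LEAF_FILLFACTOR 90          /* assumed default leaf fillfactor */
#define NONLEAF_FILLFACTOR 70       /* assumed BTREE_NONLEAF_FILLFACTOR */

int
main(void)
{
    /* A page is considered "full" once free space drops below this many bytes */
    int leaf_full = MY_BLCKSZ * (100 - LEAF_FILLFACTOR) / 100;
    int nonleaf_full = MY_BLCKSZ * (100 - NONLEAF_FILLFACTOR) / 100;

    printf("leaf page full when free space < %d bytes\n", leaf_full);
    printf("internal page full when free space < %d bytes\n", nonleaf_full);
    return 0;
}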
674
675/*
676 * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
677 * P_HIKEY, overwriting P_HIKEY).
678 *
679 * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
680 * rightmost page on its level is not supposed to get a high key. Now that
681 * it's clear that this page is a rightmost page, remove the unneeded empty
682 * P_HIKEY line pointer space.
683 */
684static void
686{
687 OffsetNumber off;
688 OffsetNumber maxoff;
690
692 Assert(maxoff >= P_FIRSTKEY);
694 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
695 {
697
698 *previi = *thisii;
699 previi = thisii;
700 }
701 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
702}
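
Conceptually, _bt_slideleft() shifts every line pointer one slot toward the start of the array, overwriting the unused P_HIKEY slot, and then shrinks the array by one entry. The standalone sketch below does the same thing with a plain int array (illustrative only, no PostgreSQL page machinery):

#include <stdio.h>

int
main(void)
{
    /* slots[0] plays the role of the P_HIKEY line pointer, slots 1..n the data items */
    int slots[5] = {0, 11, 22, 33, 44};
    int nslots = 5;

    /* Slide every item one slot to the left, overwriting the unused high-key slot */
    for (int off = 1; off < nslots; off++)
        slots[off - 1] = slots[off];
    nslots--;                       /* like shrinking pd_lower by sizeof(ItemIdData) */

    for (int off = 0; off < nslots; off++)
        printf("slot %d -> %d\n", off, slots[off]);
    return 0;
}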
703
704/*
705 * Add an item to a page being built.
706 *
707 * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
708 * raises an error directly.
709 *
710 * Note that our nbtsort.c caller does not know yet if the page will be
711 * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
712 * caller. Page that turns out to be the rightmost on its level is fixed by
713 * calling _bt_slideleft().
714 */
715static void
718 const IndexTupleData *itup,
720 bool newfirstdataitem)
721{
723
725 {
726 trunctuple = *itup;
728 BTreeTupleSetNAtts(&trunctuple, 0, false);
729 itup = &trunctuple;
730 itemsize = sizeof(IndexTupleData);
731 }
732
733 if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
734 elog(ERROR, "failed to add item to the index page");
735}
736
737/*----------
738 * Add an item to a disk page from the sort output (or add a posting list
739 * item formed from the sort output).
740 *
741 * We must be careful to observe the page layout conventions of nbtsearch.c:
742 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
743 * - on non-leaf pages, the key portion of the first item need not be
744 * stored, we should store only the link.
745 *
746 * A leaf page being built looks like:
747 *
748 * +----------------+---------------------------------+
749 * | PageHeaderData | linp0 linp1 linp2 ... |
750 * +-----------+----+---------------------------------+
751 * | ... linpN | |
752 * +-----------+--------------------------------------+
753 * | ^ last |
754 * | |
755 * +-------------+------------------------------------+
756 * | | itemN ... |
757 * +-------------+------------------+-----------------+
758 * | ... item3 item2 item1 | "special space" |
759 * +--------------------------------+-----------------+
760 *
761 * Contrast this with the diagram in bufpage.h; note the mismatch
762 * between linps and items. This is because we reserve linp0 as a
763 * placeholder for the pointer to the "high key" item; when we have
764 * filled up the page, we will set linp0 to point to itemN and clear
765 * linpN. On the other hand, if we find this is the last (rightmost)
766 * page, we leave the items alone and slide the linp array over. If
767 * the high key is to be truncated, offset 1 is deleted, and we insert
768 * the truncated high key at offset 1.
769 *
770 * 'last' pointer indicates the last offset added to the page.
771 *
772 * 'truncextra' is the size of the posting list in itup, if any. This
773 * information is stashed for the next call here, when we may benefit
774 * from considering the impact of truncating away the posting list on
775 * the page before deciding to finish the page off. Posting lists are
776 * often relatively large, so it is worth going to the trouble of
777 * accounting for the saving from truncating away the posting list of
778 * the tuple that becomes the high key (that may be the only way to
779 * get close to target free space on the page). Note that this is
780 * only used for the soft fillfactor-wise limit, not the critical hard
781 * limit.
782 *----------
783 */
784static void
787{
789 Page npage;
793 Size pgspc;
794 Size itupsz;
795 bool isleaf;
796
797 /*
798 * This is a handy place to check for cancel interrupts during the btree
799 * load phase of index creation.
800 */
802
803 nbuf = state->btps_buf;
804 npage = (Page) nbuf;
805 nblkno = state->btps_blkno;
806 last_off = state->btps_lastoff;
807 last_truncextra = state->btps_lastextra;
808 state->btps_lastextra = truncextra;
809
810 pgspc = PageGetFreeSpace(npage);
811 itupsz = IndexTupleSize(itup);
813 /* Leaf case has slightly different rules due to suffix truncation */
814 isleaf = (state->btps_level == 0);
815
816 /*
817 * Check whether the new item can fit on a btree page on current level at
818 * all.
819 *
820 * Every newly built index will treat heap TID as part of the keyspace,
821 * which imposes the requirement that new high keys must occasionally have
822 * a heap TID appended within _bt_truncate(). That may leave a new pivot
823 * tuple one or two MAXALIGN() quantums larger than the original
824 * firstright tuple it's derived from. v4 deals with the problem by
825 * decreasing the limit on the size of tuples inserted on the leaf level
826 * by the same small amount. Enforce the new v4+ limit on the leaf level,
827 * and the old limit on internal levels, since pivot tuples may need to
828 * make use of the reserved space. This should never fail on internal
829 * pages.
830 */
832 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
833 itup);
834
835 /*
836 * Check to see if current page will fit new item, with space left over to
837 * append a heap TID during suffix truncation when page is a leaf page.
838 *
839 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
840 * high key with heap TID when finishing off a leaf page, since we rely on
841 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
842 * internal pages we can always fit 3 pivot tuples with larger internal
843 * page tuple limit (includes page high key).
844 *
845 * Most of the time, a page is only "full" in the sense that the soft
846 * fillfactor-wise limit has been exceeded. However, we must always leave
847 * at least two items plus a high key on each page before starting a new
848 * page. Disregard fillfactor and insert on "full" current page if we
849 * don't have the minimum number of items yet. (Note that we deliberately
850 * assume that suffix truncation neither enlarges nor shrinks new high key
851 * when applying soft limit, except when last tuple has a posting list.)
852 */
854 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
855 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
856 {
857 /*
858 * Finish off the page and write it out.
859 */
861 Page opage = npage;
863 ItemId ii;
864 ItemId hii;
866
867 /* Create new page of same level */
868 nbuf = _bt_blnewpage(wstate, state->btps_level);
869 npage = (Page) nbuf;
870
871 /* and assign it a page position */
872 nblkno = wstate->btws_pages_alloced++;
873
874 /*
875 * We copy the last item on the page into the new page, and then
876 * rearrange the old page so that the 'last item' becomes its high key
877 * rather than a true data item. There had better be at least two
878 * items on the page already, else the page would be empty of useful
879 * data.
880 */
885 !isleaf);
886
887 /*
888 * Move 'last' into the high key position on opage. _bt_blnewpage()
889 * allocated empty space for a line pointer when opage was first
890 * created, so this is a matter of rearranging already-allocated space
891 * on page, and initializing high key line pointer. (Actually, leaf
892 * pages must also swap oitup with a truncated version of oitup, which
893 * is sometimes larger than oitup, though never by more than the space
894 * needed to append a heap TID.)
895 */
897 *hii = *ii;
898 ItemIdSetUnused(ii); /* redundant */
899 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
900
901 if (isleaf)
902 {
903 IndexTuple lastleft;
904 IndexTuple truncated;
905
906 /*
907 * Truncate away any unneeded attributes from high key on leaf
908 * level. This is only done at the leaf level because downlinks
909 * in internal pages are either negative infinity items, or get
910 * their contents from copying from one level down. See also:
911 * _bt_split().
912 *
913 * We don't try to bias our choice of split point to make it more
914 * likely that _bt_truncate() can truncate away more attributes,
915 * whereas the split point used within _bt_split() is chosen much
916 * more delicately. Even still, the lastleft and firstright
917 * tuples passed to _bt_truncate() here are at least not fully
918 * equal to each other when deduplication is used, unless there is
919 * a large group of duplicates (also, unique index builds usually
920 * have few or no spool2 duplicates). When the split point is
921 * between two unequal tuples, _bt_truncate() will avoid including
922 * a heap TID in the new high key, which is the most important
923 * benefit of suffix truncation.
924 *
925 * Overwrite the old item with new truncated high key directly.
926 * oitup is already located at the physical beginning of tuple
927 * space, so this should directly reuse the existing tuple space.
928 */
930 lastleft = (IndexTuple) PageGetItem(opage, ii);
931
933 truncated = _bt_truncate(wstate->index, lastleft, oitup,
934 wstate->inskey);
935 if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
936 elog(ERROR, "failed to add high key to the index page");
937 pfree(truncated);
938
939 /* oitup should continue to point to the page's high key */
942 }
943
944 /*
945 * Link the old page into its parent, using its low key. If we don't
946 * have a parent, we have to create one; this adds a new btree level.
947 */
948 if (state->btps_next == NULL)
949 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
950
951 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
953 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
955 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
957 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
958 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
959 pfree(state->btps_lowkey);
960
961 /*
962 * Save a copy of the high key from the old page. It is also the low
963 * key for the new page.
964 */
965 state->btps_lowkey = CopyIndexTuple(oitup);
966
967 /*
968 * Set the sibling links for both pages.
969 */
970 {
973
974 oopaque->btpo_next = nblkno;
975 nopaque->btpo_prev = oblkno;
976 nopaque->btpo_next = P_NONE; /* redundant */
977 }
978
979 /*
980 * Write out the old page. _bt_blwritepage takes ownership of the
981 * 'opage' buffer.
982 */
984
985 /*
986 * Reset last_off to point to new page
987 */
989 }
990
991 /*
992 * By here, either original page is still the current page, or a new page
993 * was created that became the current page. Either way, the current page
994 * definitely has space for new item.
995 *
996 * If the new item is the first for its page, it must also be the first
997 * item on its entire level. On later same-level pages, a low key for a
998 * page will be copied from the prior page in the code above. Generate a
999 * minus infinity low key here instead.
1000 */
1001 if (last_off == P_HIKEY)
1002 {
1003 Assert(state->btps_lowkey == NULL);
1004 state->btps_lowkey = palloc0_object(IndexTupleData);
1005 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1006 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1007 }
1008
1009 /*
1010 * Add the new item into the current page.
1011 */
1013 _bt_sortaddtup(npage, itupsz, itup, last_off,
1014 !isleaf && last_off == P_FIRSTKEY);
1015
1016 state->btps_buf = nbuf;
1017 state->btps_blkno = nblkno;
1018 state->btps_lastoff = last_off;
1019}
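
The decision to finish the current page combines a hard limit (the new item, plus room for a heap TID in a future leaf high key, must fit) with a soft fillfactor limit that only applies once the page already holds at least two items. A hedged restatement of that predicate as standalone C (function name, parameters, and the numeric examples are illustrative, not the real page-space code):

#include <stdbool.h>
#include <stdio.h>

static bool
must_start_new_page(size_t free_space, size_t item_size, size_t tid_overhead,
                    size_t last_truncextra, size_t full_threshold, bool have_two_items)
{
    if (free_space < item_size + tid_overhead)
        return true;            /* hard limit: item (plus possible high-key TID) won't fit */
    if (free_space + last_truncextra < full_threshold && have_two_items)
        return true;            /* soft limit: fillfactor target reached */
    return false;
}

int
main(void)
{
    printf("%d\n", must_start_new_page(100, 120, 8, 0, 819, true));   /* hard limit hit */
    printf("%d\n", must_start_new_page(500, 120, 8, 0, 819, true));   /* soft limit hit */
    printf("%d\n", must_start_new_page(500, 120, 8, 0, 819, false));  /* keep filling */
    return 0;
}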
1020
1021/*
1022 * Finalize pending posting list tuple, and add it to the index. Final tuple
1023 * is based on saved base tuple, and saved list of heap TIDs.
1024 *
1025 * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1026 * using _bt_buildadd().
1027 */
1028static void
1031{
1032 Assert(dstate->nitems > 0);
1033
1034 if (dstate->nitems == 1)
1035 _bt_buildadd(wstate, state, dstate->base, 0);
1036 else
1037 {
1040
1041 /* form a tuple with a posting list */
1043 dstate->htids,
1044 dstate->nhtids);
1045 /* Calculate posting list overhead */
1048
1051 }
1052
1053 dstate->nmaxitems = 0;
1054 dstate->nhtids = 0;
1055 dstate->nitems = 0;
1056 dstate->phystupsize = 0;
1057}
1058
1059/*
1060 * Finish writing out the completed btree.
1061 */
1062static void
1064{
1065 BTPageState *s;
1067 uint32 rootlevel = 0;
1069
1070 /*
1071 * Each iteration of this loop completes one more level of the tree.
1072 */
1073 for (s = state; s != NULL; s = s->btps_next)
1074 {
1075 BlockNumber blkno;
1076 BTPageOpaque opaque;
1077
1078 blkno = s->btps_blkno;
1079 opaque = BTPageGetOpaque((Page) s->btps_buf);
1080
1081 /*
1082 * We have to link the last page on this level to somewhere.
1083 *
1084 * If we're at the top, it's the root, so attach it to the metapage.
1085 * Otherwise, add an entry for it to its parent using its low key.
1086 * This may cause the last page of the parent level to split, but
1087 * that's not a problem -- we haven't gotten to it yet.
1088 */
1089 if (s->btps_next == NULL)
1090 {
1091 opaque->btpo_flags |= BTP_ROOT;
1092 rootblkno = blkno;
1093 rootlevel = s->btps_level;
1094 }
1095 else
1096 {
1099 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1100 P_LEFTMOST(opaque));
1101 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1102 !P_LEFTMOST(opaque));
1105 pfree(s->btps_lowkey);
1106 s->btps_lowkey = NULL;
1107 }
1108
1109 /*
1110 * This is the rightmost page, so the ItemId array needs to be slid
1111 * back one slot. Then we can dump out the page.
1112 */
1115 s->btps_buf = NULL; /* writepage took ownership of the buffer */
1116 }
1117
1118 /*
1119 * As the last step in the process, construct the metapage and make it
1120 * point to the new root (unless we had no data at all, in which case it's
1121 * set to point to "P_NONE"). This changes the index to the "valid" state
1122 * by filling in a valid magic number in the metapage.
1123 */
1124 metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1125 _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1126 wstate->inskey->allequalimage);
1128}
1129
1130/*
1131 * Read tuples in correct sort order from tuplesort, and load them into
1132 * btree leaves.
1133 */
1134static void
1136{
1138 bool merge = (btspool2 != NULL);
1139 IndexTuple itup,
1140 itup2 = NULL;
1141 bool load1;
1143 int i,
1145 SortSupport sortKeys;
1146 int64 tuples_done = 0;
1147 bool deduplicate;
1148
1149 wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1150
1151 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1153
1154 if (merge)
1155 {
1156 /*
1157 * Another BTSpool for dead tuples exists. Now we have to merge
1158 * btspool and btspool2.
1159 */
1160
1161 /* the preparation of merge */
1162 itup = tuplesort_getindextuple(btspool->sortstate, true);
1163 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1164
1165 /* Prepare SortSupport data for each column */
1166 sortKeys = palloc0_array(SortSupportData, keysz);
1167
1168 for (i = 0; i < keysz; i++)
1169 {
1170 SortSupport sortKey = sortKeys + i;
1171 ScanKey scanKey = wstate->inskey->scankeys + i;
1172 bool reverse;
1173
1174 sortKey->ssup_cxt = CurrentMemoryContext;
1175 sortKey->ssup_collation = scanKey->sk_collation;
1176 sortKey->ssup_nulls_first =
1177 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1178 sortKey->ssup_attno = scanKey->sk_attno;
1179 /* Abbreviation is not supported here */
1180 sortKey->abbreviate = false;
1181
1182 Assert(sortKey->ssup_attno != 0);
1183
1184 reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1185
1187 }
1188
1189 for (;;)
1190 {
1191 load1 = true; /* load BTSpool next ? */
1192 if (itup2 == NULL)
1193 {
1194 if (itup == NULL)
1195 break;
1196 }
1197 else if (itup != NULL)
1198 {
1199 int32 compare = 0;
1200
1201 for (i = 1; i <= keysz; i++)
1202 {
1203 SortSupport entry;
1205 attrDatum2;
1206 bool isNull1,
1207 isNull2;
1208
1209 entry = sortKeys + i - 1;
1212
1215 entry);
1216 if (compare > 0)
1217 {
1218 load1 = false;
1219 break;
1220 }
1221 else if (compare < 0)
1222 break;
1223 }
1224
1225 /*
1226 * If key values are equal, we sort on ItemPointer. This is
1227 * required for btree indexes, since heap TID is treated as an
1228 * implicit last key attribute in order to ensure that all
1229 * keys in the index are physically unique.
1230 */
1231 if (compare == 0)
1232 {
1233 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1234 Assert(compare != 0);
1235 if (compare > 0)
1236 load1 = false;
1237 }
1238 }
1239 else
1240 load1 = false;
1241
1242 /* When we see first tuple, create first index page */
1243 if (state == NULL)
1245
1246 if (load1)
1247 {
1248 _bt_buildadd(wstate, state, itup, 0);
1249 itup = tuplesort_getindextuple(btspool->sortstate, true);
1250 }
1251 else
1252 {
1254 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1255 }
1256
1257 /* Report progress */
1259 ++tuples_done);
1260 }
1261 pfree(sortKeys);
1262 }
1263 else if (deduplicate)
1264 {
1265 /* merge is unnecessary, deduplicate into posting lists */
1267
1269 dstate->deduplicate = true; /* unused */
1270 dstate->nmaxitems = 0; /* unused */
1271 dstate->maxpostingsize = 0; /* set later */
1272 /* Metadata about base tuple of current pending posting list */
1273 dstate->base = NULL;
1274 dstate->baseoff = InvalidOffsetNumber; /* unused */
1275 dstate->basetupsize = 0;
1276 /* Metadata about current pending posting list TIDs */
1277 dstate->htids = NULL;
1278 dstate->nhtids = 0;
1279 dstate->nitems = 0;
1280 dstate->phystupsize = 0; /* unused */
1281 dstate->nintervals = 0; /* unused */
1282
1283 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1284 true)) != NULL)
1285 {
1286 /* When we see first tuple, create first index page */
1287 if (state == NULL)
1288 {
1290
1291 /*
1292 * Limit size of posting list tuples to 1/10 space we want to
1293 * leave behind on the page, plus space for final item's line
1294 * pointer. This is equal to the space that we'd like to
1295 * leave behind on each leaf page when fillfactor is 90,
1296 * allowing us to get close to fillfactor% space utilization
1297 * when there happen to be a great many duplicates. (This
1298 * makes higher leaf fillfactor settings ineffective when
1299 * building indexes that have many duplicates, but packing
1300 * leaf pages full with few very large tuples doesn't seem
1301 * like a useful goal.)
1302 */
1303 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1304 sizeof(ItemIdData);
1305 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1306 dstate->maxpostingsize <= INDEX_SIZE_MASK);
1307 dstate->htids = palloc(dstate->maxpostingsize);
1308
1309 /* start new pending posting list with itup copy */
1312 }
1313 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1314 itup) > keysz &&
1316 {
1317 /*
1318 * Tuple is equal to base tuple of pending posting list. Heap
1319 * TID from itup has been saved in state.
1320 */
1321 }
1322 else
1323 {
1324 /*
1325 * Tuple is not equal to pending posting list tuple, or
1326 * _bt_dedup_save_htid() opted to not merge current item into
1327 * pending posting list.
1328 */
1330 pfree(dstate->base);
1331
1332 /* start new pending posting list with itup copy */
1335 }
1336
1337 /* Report progress */
1339 ++tuples_done);
1340 }
1341
1342 if (state)
1343 {
1344 /*
1345 * Handle the last item (there must be a last item when the
1346 * tuplesort returned one or more tuples)
1347 */
1349 pfree(dstate->base);
1350 pfree(dstate->htids);
1351 }
1352
1353 pfree(dstate);
1354 }
1355 else
1356 {
1357 /* merging and deduplication are both unnecessary */
1358 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1359 true)) != NULL)
1360 {
1361 /* When we see first tuple, create first index page */
1362 if (state == NULL)
1364
1365 _bt_buildadd(wstate, state, itup, 0);
1366
1367 /* Report progress */
1369 ++tuples_done);
1370 }
1371 }
1372
1373 /* Close down final pages and write the metapage */
1375 smgr_bulk_finish(wstate->bulkstate);
1376}
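
When spool2 exists, _bt_load() performs a two-way merge: whichever stream has the smaller key is loaded next, and equal keys are ordered by heap TID, which is never equal across the two spools. The standalone sketch below shows that merge policy with a toy tuple type (illustrative; the real comparisons go through SortSupport comparators and ItemPointerCompare):

#include <stdio.h>

typedef struct
{
    int key;        /* stands in for the index key columns */
    int tid;        /* stands in for the heap TID tiebreaker */
} MiniTuple;

/* Key comparison first, then TID: mirrors the load1 decision in _bt_load() */
static int
minituple_cmp(const MiniTuple *a, const MiniTuple *b)
{
    if (a->key != b->key)
        return (a->key < b->key) ? -1 : 1;
    return (a->tid < b->tid) ? -1 : 1;      /* TIDs never compare equal across spools */
}

int
main(void)
{
    MiniTuple spool1[] = {{1, 10}, {2, 30}, {5, 20}};
    MiniTuple spool2[] = {{2, 15}, {5, 40}};
    int n1 = 3, n2 = 2;
    int i = 0, j = 0;

    while (i < n1 || j < n2)
    {
        int load1 = (j >= n2) ||
            (i < n1 && minituple_cmp(&spool1[i], &spool2[j]) < 0);

        if (load1)
        {
            printf("emit (%d,%d) from spool1\n", spool1[i].key, spool1[i].tid);
            i++;
        }
        else
        {
            printf("emit (%d,%d) from spool2\n", spool2[j].key, spool2[j].tid);
            j++;
        }
    }
    return 0;
}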
1377
1378/*
1379 * Create parallel context, and launch workers for leader.
1380 *
1381 * buildstate argument should be initialized (with the exception of the
1382 * tuplesort state in spools, which may later be created based on shared
1383 * state initially set up here).
1384 *
1385 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1386 *
1387 * request is the target number of parallel worker processes to launch.
1388 *
1389 * Sets buildstate's BTLeader, which caller must use to shut down parallel
1390 * mode by passing it to _bt_end_parallel() at the very end of its index
1391 * build. If not even a single worker process can be launched, this is
1392 * never set, and caller should proceed with a serial index build.
1393 */
1394static void
1395_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1396{
1397 ParallelContext *pcxt;
1398 int scantuplesortstates;
1399 Snapshot snapshot;
1401 Size estsort;
1402 BTShared *btshared;
1403 Sharedsort *sharedsort;
1404 Sharedsort *sharedsort2;
1405 BTSpool *btspool = buildstate->spool;
1406 BTLeader *btleader = palloc0_object(BTLeader);
1407 WalUsage *walusage;
1408 BufferUsage *bufferusage;
1409 bool leaderparticipates = true;
1410 int querylen;
1411
1412#ifdef DISABLE_LEADER_PARTICIPATION
1413 leaderparticipates = false;
1414#endif
1415
1416 /*
1417 * Enter parallel mode, and create context for parallel build of btree
1418 * index
1419 */
1421 Assert(request > 0);
1422 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1423 request);
1424
1425 scantuplesortstates = leaderparticipates ? request + 1 : request;
1426
1427 /*
1428 * Prepare for scan of the base relation. In a normal index build, we use
1429 * SnapshotAny because we must retrieve all tuples and do our own time
1430 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1431 * concurrent build, we take a regular MVCC snapshot and index whatever's
1432 * live according to that.
1433 */
1434 if (!isconcurrent)
1435 snapshot = SnapshotAny;
1436 else
1438
1439 /*
1440 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1441 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1442 */
1445 estsort = tuplesort_estimate_shared(scantuplesortstates);
1447
1448 /*
1449 * Unique case requires a second spool, and so we may have to account for
1450 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1451 */
1452 if (!btspool->isunique)
1454 else
1455 {
1458 }
1459
1460 /*
1461 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1462 * and PARALLEL_KEY_BUFFER_USAGE.
1463 *
1464 * If there are no extensions loaded that care, we could skip this. We
1465 * have no way of knowing whether anyone's looking at pgWalUsage or
1466 * pgBufferUsage, so do it unconditionally.
1467 */
1469 mul_size(sizeof(WalUsage), pcxt->nworkers));
1472 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1474
1475 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1477 {
1481 }
1482 else
1483 querylen = 0; /* keep compiler quiet */
1484
1485 /* Everyone's had a chance to ask for space, so now create the DSM */
1487
1488 /* If no DSM segment was available, back out (do serial build) */
1489 if (pcxt->seg == NULL)
1490 {
1491 if (IsMVCCSnapshot(snapshot))
1492 UnregisterSnapshot(snapshot);
1495 return;
1496 }
1497
1498 /* Store shared build state, for which we reserved space */
1499 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1500 /* Initialize immutable state */
1501 btshared->heaprelid = RelationGetRelid(btspool->heap);
1502 btshared->indexrelid = RelationGetRelid(btspool->index);
1503 btshared->isunique = btspool->isunique;
1504 btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1505 btshared->isconcurrent = isconcurrent;
1506 btshared->scantuplesortstates = scantuplesortstates;
1507 btshared->queryid = pgstat_get_my_query_id();
1509 SpinLockInit(&btshared->mutex);
1510 /* Initialize mutable state */
1511 btshared->nparticipantsdone = 0;
1512 btshared->reltuples = 0.0;
1513 btshared->havedead = false;
1514 btshared->indtuples = 0.0;
1515 btshared->brokenhotchain = false;
1518 snapshot);
1519
1520 /*
1521 * Store shared tuplesort-private state, for which we reserved space.
1522 * Then, initialize opaque state using tuplesort routine.
1523 */
1524 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1525 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1526 pcxt->seg);
1527
1529 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1530
1531 /* Unique case requires a second spool, and associated shared state */
1532 if (!btspool->isunique)
1533 sharedsort2 = NULL;
1534 else
1535 {
1536 /*
1537 * Store additional shared tuplesort-private state, for which we
1538 * reserved space. Then, initialize opaque state using tuplesort
1539 * routine.
1540 */
1541 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1542 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1543 pcxt->seg);
1544
1546 }
1547
1548 /* Store query string for workers */
1550 {
1551 char *sharedquery;
1552
1553 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1556 }
1557
1558 /*
1559 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1560 * initialize.
1561 */
1562 walusage = shm_toc_allocate(pcxt->toc,
1563 mul_size(sizeof(WalUsage), pcxt->nworkers));
1564 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1565 bufferusage = shm_toc_allocate(pcxt->toc,
1566 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1567 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1568
1569 /* Launch workers, saving status for leader/caller */
1571 btleader->pcxt = pcxt;
1572 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1574 btleader->nparticipanttuplesorts++;
1575 btleader->btshared = btshared;
1576 btleader->sharedsort = sharedsort;
1577 btleader->sharedsort2 = sharedsort2;
1578 btleader->snapshot = snapshot;
1579 btleader->walusage = walusage;
1580 btleader->bufferusage = bufferusage;
1581
1582 /* If no workers were successfully launched, back out (do serial build) */
1583 if (pcxt->nworkers_launched == 0)
1584 {
1585 _bt_end_parallel(btleader);
1586 return;
1587 }
1588
1589 /* Save leader state now that it's clear build will be parallel */
1590 buildstate->btleader = btleader;
1591
1592 /* Join heap scan ourselves */
1595
1596 /*
1597 * Caller needs to wait for all launched workers when we return. Make
1598 * sure that the failure-to-start case will not hang forever.
1599 */
1601}
1602
1603/*
1604 * Shut down workers, destroy parallel context, and end parallel mode.
1605 */
1606static void
1607_bt_end_parallel(BTLeader *btleader)
1608{
1609 int i;
1610
1611 /* Shutdown worker processes */
1613
1614 /*
1615 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1616 * or we might get incomplete data.)
1617 */
1618 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1619 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1620
1621 /* Free last reference to MVCC snapshot, if one was used */
1622 if (IsMVCCSnapshot(btleader->snapshot))
1623 UnregisterSnapshot(btleader->snapshot);
1624 DestroyParallelContext(btleader->pcxt);
1626}
1627
1628/*
1629 * Returns size of shared memory required to store state for a parallel
1630 * btree index build based on the snapshot its parallel scan will use.
1631 */
1632static Size
1634{
1635 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1636 return add_size(BUFFERALIGN(sizeof(BTShared)),
1637 table_parallelscan_estimate(heap, snapshot));
1638}
1639
1640/*
1641 * Within leader, wait for end of heap scan.
1642 *
1643 * When called, parallel heap scan started by _bt_begin_parallel() will
1644 * already be underway within worker processes (when leader participates
1645 * as a worker, we should end up here just as workers are finishing).
1646 *
1647 * Fills in fields needed for ambuild statistics, and lets caller set
1648 * field indicating that some worker encountered a broken HOT chain.
1649 *
1650 * Returns the total number of heap tuples scanned.
1651 */
1652static double
1653_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1654{
1655 BTShared *btshared = buildstate->btleader->btshared;
1656 int nparticipanttuplesorts;
1657 double reltuples;
1658
1659 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1660 for (;;)
1661 {
1662 SpinLockAcquire(&btshared->mutex);
1663 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1664 {
1665 buildstate->havedead = btshared->havedead;
1666 buildstate->indtuples = btshared->indtuples;
1667 *brokenhotchain = btshared->brokenhotchain;
1668 reltuples = btshared->reltuples;
1669 SpinLockRelease(&btshared->mutex);
1670 break;
1671 }
1672 SpinLockRelease(&btshared->mutex);
1673
1676 }
1677
1679
1680 return reltuples;
1681}
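
The leader re-checks nparticipantsdone under the lock and sleeps on a condition variable between checks: the classic "wait until a shared counter reaches N" pattern. A standalone pthreads sketch of the same pattern (PostgreSQL uses its own spinlocks and ConditionVariable API rather than pthreads; the counts here are made up):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
static int nparticipantsdone = 0;
static const int nparticipants = 3;

static void *
participant(void *arg)
{
    (void) arg;
    usleep(1000);                       /* pretend to scan and sort */
    pthread_mutex_lock(&mutex);
    nparticipantsdone++;
    pthread_mutex_unlock(&mutex);
    pthread_cond_signal(&cv);           /* like ConditionVariableSignal() */
    return NULL;
}

int
main(void)
{
    pthread_t threads[3];

    for (int i = 0; i < nparticipants; i++)
        pthread_create(&threads[i], NULL, participant, NULL);

    /* Leader: re-check the counter each time it is woken, then go back to sleep */
    pthread_mutex_lock(&mutex);
    while (nparticipantsdone < nparticipants)
        pthread_cond_wait(&cv, &mutex);
    pthread_mutex_unlock(&mutex);
    printf("all %d participants done\n", nparticipants);

    for (int i = 0; i < nparticipants; i++)
        pthread_join(threads[i], NULL);
    return 0;
}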
1682
1683/*
1684 * Within leader, participate as a parallel worker.
1685 */
1686static void
1688{
1689 BTLeader *btleader = buildstate->btleader;
1692 int sortmem;
1693
1694 /* Allocate memory and initialize private spool */
1696 leaderworker->heap = buildstate->spool->heap;
1697 leaderworker->index = buildstate->spool->index;
1698 leaderworker->isunique = buildstate->spool->isunique;
1699 leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1700
1701 /* Initialize second spool, if required */
1702 if (!btleader->btshared->isunique)
1704 else
1705 {
1706 /* Allocate memory for worker's own private secondary spool */
1708
1709 /* Initialize worker's own secondary spool */
1710 leaderworker2->heap = leaderworker->heap;
1711 leaderworker2->index = leaderworker->index;
1712 leaderworker2->isunique = false;
1713 }
1714
1715 /*
1716 * Might as well use reliable figure when doling out maintenance_work_mem
1717 * (when requested number of workers were not launched, this will be
1718 * somewhat higher than it is for other workers).
1719 */
1721
1722 /* Perform work common to all participants */
1724 btleader->sharedsort, btleader->sharedsort2,
1725 sortmem, true);
1726
1727#ifdef BTREE_BUILD_STATS
1729 {
1730 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1731 ResetUsage();
1732 }
1733#endif /* BTREE_BUILD_STATS */
1734}
1735
1736/*
1737 * Perform work within a launched parallel process.
1738 */
1739void
1741{
1742 char *sharedquery;
1745 BTShared *btshared;
1746 Sharedsort *sharedsort;
1747 Sharedsort *sharedsort2;
1748 Relation heapRel;
1749 Relation indexRel;
1752 WalUsage *walusage;
1753 BufferUsage *bufferusage;
1754 int sortmem;
1755
1756#ifdef BTREE_BUILD_STATS
1758 ResetUsage();
1759#endif /* BTREE_BUILD_STATS */
1760
1761 /*
1762 * The only possible status flag that can be set to the parallel worker is
1763 * PROC_IN_SAFE_IC.
1764 */
1765 Assert((MyProc->statusFlags == 0) ||
1767
1768 /* Set debug_query_string for individual workers first */
1771
1772 /* Report the query string from leader */
1774
1775 /* Look up nbtree shared state */
1776 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1777
1778 /* Open relations using lock modes known to be obtained by index.c */
1779 if (!btshared->isconcurrent)
1780 {
1783 }
1784 else
1785 {
1788 }
1789
1790 /* Track query ID */
1791 pgstat_report_query_id(btshared->queryid, false);
1792
1793 /* Open relations within worker */
1794 heapRel = table_open(btshared->heaprelid, heapLockmode);
1795 indexRel = index_open(btshared->indexrelid, indexLockmode);
1796
1797 /* Initialize worker's own spool */
1799 btspool->heap = heapRel;
1800 btspool->index = indexRel;
1801 btspool->isunique = btshared->isunique;
1802 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1803
1804 /* Look up shared state private to tuplesort.c */
1805 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1806 tuplesort_attach_shared(sharedsort, seg);
1807 if (!btshared->isunique)
1808 {
1809 btspool2 = NULL;
1810 sharedsort2 = NULL;
1811 }
1812 else
1813 {
1814 /* Allocate memory for worker's own private secondary spool */
1816
1817 /* Initialize worker's own secondary spool */
1818 btspool2->heap = btspool->heap;
1819 btspool2->index = btspool->index;
1820 btspool2->isunique = false;
1821 /* Look up shared state private to tuplesort.c */
1822 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1823 tuplesort_attach_shared(sharedsort2, seg);
1824 }
1825
1826 /* Prepare to track buffer usage during parallel execution */
1828
1829 /* Perform sorting of spool, and possibly a spool2 */
1831 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1832 sharedsort2, sortmem, false);
1833
1834 /* Report WAL/buffer usage during parallel execution */
1835 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1836 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1838 &walusage[ParallelWorkerNumber]);
1839
1840#ifdef BTREE_BUILD_STATS
1842 {
1843 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1844 ResetUsage();
1845 }
1846#endif /* BTREE_BUILD_STATS */
1847
1848 index_close(indexRel, indexLockmode);
1849 table_close(heapRel, heapLockmode);
1850}
1851
1852/*
1853 * Perform a worker's portion of a parallel sort.
1854 *
1855 * This generates a tuplesort for passed btspool, and a second tuplesort
1856 * state if a second btspool is need (i.e. for unique index builds). All
1857 * other spool fields should already be set when this is called.
1858 *
1859 * sortmem is the amount of working memory to use within each worker,
1860 * expressed in KBs.
1861 *
1862 * When this returns, workers are done, and need only release resources.
1863 */
1864static void
1866 BTShared *btshared, Sharedsort *sharedsort,
1867 Sharedsort *sharedsort2, int sortmem, bool progress)
1868{
1871 TableScanDesc scan;
1872 double reltuples;
1873 IndexInfo *indexInfo;
1874
1875 /* Initialize local tuplesort coordination state */
1877 coordinate->isWorker = true;
1878 coordinate->nParticipants = -1;
1879 coordinate->sharedsort = sharedsort;
1880
1881 /* Begin "partial" tuplesort */
1882 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1883 btspool->index,
1884 btspool->isunique,
1885 btspool->nulls_not_distinct,
1888
1889 /*
1890 * Just as with serial case, there may be a second spool. If so, a
1891 * second, dedicated spool2 partial tuplesort is required.
1892 */
1893 if (btspool2)
1894 {
1896
1897 /*
1898 * We expect that the second one (for dead tuples) won't get very
1899 * full, so we give it only work_mem (unless sortmem is less for
1900 * worker). Worker processes are generally permitted to allocate
1901 * work_mem independently.
1902 */
1904 coordinate2->isWorker = true;
1905 coordinate2->nParticipants = -1;
1906 coordinate2->sharedsort = sharedsort2;
1907 btspool2->sortstate =
1908 tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1910 false);
1911 }
1912
1913 /* Fill in buildstate for _bt_build_callback() */
1914 buildstate.isunique = btshared->isunique;
1915 buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1916 buildstate.havedead = false;
1917 buildstate.heap = btspool->heap;
1918 buildstate.spool = btspool;
1919 buildstate.spool2 = btspool2;
1920 buildstate.indtuples = 0;
1921 buildstate.btleader = NULL;
1922
1923 /* Join parallel scan */
1924 indexInfo = BuildIndexInfo(btspool->index);
1925 indexInfo->ii_Concurrent = btshared->isconcurrent;
1926 scan = table_beginscan_parallel(btspool->heap,
1928 reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1930 &buildstate, scan);
1931
1932 /* Execute this worker's part of the sort */
1933 if (progress)
1936 tuplesort_performsort(btspool->sortstate);
1937 if (btspool2)
1938 {
1939 if (progress)
1942 tuplesort_performsort(btspool2->sortstate);
1943 }
1944
1945 /*
1946 * Done. Record ambuild statistics, and whether we encountered a broken
1947 * HOT chain.
1948 */
1949 SpinLockAcquire(&btshared->mutex);
1950 btshared->nparticipantsdone++;
1951 btshared->reltuples += reltuples;
1952 if (buildstate.havedead)
1953 btshared->havedead = true;
1954 btshared->indtuples += buildstate.indtuples;
1955 if (indexInfo->ii_BrokenHotChain)
1956 btshared->brokenhotchain = true;
1957 SpinLockRelease(&btshared->mutex);
1958
1959 /* Notify leader */
1961
1962 /* We can end tuplesorts immediately */
1963 tuplesort_end(btspool->sortstate);
1964 if (btspool2)
1965 tuplesort_end(btspool2->sortstate);
1966}
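
Each participant finishes by folding its own totals into the shared state while holding the spinlock, then signalling the leader's condition variable. The standalone sketch below shows just that accumulation step (struct and function names are illustrative; the real code also releases its tuplesorts afterwards):

#include <stdbool.h>
#include <stdio.h>

/* Shared accumulator a participant updates once its scan/sort is finished */
typedef struct
{
    int    nparticipantsdone;
    double reltuples;
    double indtuples;
    bool   havedead;
    bool   brokenhotchain;
} SharedResults;

/* Illustrative merge of one participant's totals (real code holds a spinlock here) */
static void
report_participant(SharedResults *shared, double reltuples, double indtuples,
                   bool havedead, bool brokenhotchain)
{
    shared->nparticipantsdone++;
    shared->reltuples += reltuples;
    shared->indtuples += indtuples;
    if (havedead)
        shared->havedead = true;
    if (brokenhotchain)
        shared->brokenhotchain = true;
}

int
main(void)
{
    SharedResults shared = {0, 0.0, 0.0, false, false};

    report_participant(&shared, 1000.0, 1000.0, false, false);
    report_participant(&shared, 950.0, 948.0, true, false);

    printf("done=%d reltuples=%.0f indtuples=%.0f havedead=%d\n",
           shared.nparticipantsdone, shared.reltuples, shared.indtuples,
           shared.havedead);
    return 0;
}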

Typedef Documentation

◆ BTBuildState

◆ BTLeader

◆ BTPageState

◆ BTShared

◆ BTSpool

◆ BTWriteState

Function Documentation

◆ _bt_begin_parallel()

static void _bt_begin_parallel ( BTBuildState * buildstate,
bool  isconcurrent,
int  request 
)
static

Definition at line 1396 of file nbtsort.c.

1397{
1398 ParallelContext *pcxt;
1399 int scantuplesortstates;
1400 Snapshot snapshot;
1402 Size estsort;
1403 BTShared *btshared;
1404 Sharedsort *sharedsort;
1405 Sharedsort *sharedsort2;
1406 BTSpool *btspool = buildstate->spool;
1407 BTLeader *btleader = palloc0_object(BTLeader);
1408 WalUsage *walusage;
1409 BufferUsage *bufferusage;
1410 bool leaderparticipates = true;
1411 int querylen;
1412
1413#ifdef DISABLE_LEADER_PARTICIPATION
1414 leaderparticipates = false;
1415#endif
1416
1417 /*
1418 * Enter parallel mode, and create context for parallel build of btree
1419 * index
1420 */
1422 Assert(request > 0);
1423 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1424 request);
1425
1426 scantuplesortstates = leaderparticipates ? request + 1 : request;
1427
1428 /*
1429 * Prepare for scan of the base relation. In a normal index build, we use
1430 * SnapshotAny because we must retrieve all tuples and do our own time
1431 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1432 * concurrent build, we take a regular MVCC snapshot and index whatever's
1433 * live according to that.
1434 */
1435 if (!isconcurrent)
1436 snapshot = SnapshotAny;
1437 else
1439
1440 /*
1441 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1442 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1443 */
1446 estsort = tuplesort_estimate_shared(scantuplesortstates);
1448
1449 /*
1450 * Unique case requires a second spool, and so we may have to account for
1451 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1452 */
1453 if (!btspool->isunique)
1455 else
1456 {
1459 }
1460
1461 /*
1462 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1463 * and PARALLEL_KEY_BUFFER_USAGE.
1464 *
1465 * If there are no extensions loaded that care, we could skip this. We
1466 * have no way of knowing whether anyone's looking at pgWalUsage or
1467 * pgBufferUsage, so do it unconditionally.
1468 */
1470 mul_size(sizeof(WalUsage), pcxt->nworkers));
1473 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1475
1476 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1478 {
1482 }
1483 else
1484 querylen = 0; /* keep compiler quiet */
1485
1486 /* Everyone's had a chance to ask for space, so now create the DSM */
1488
1489 /* If no DSM segment was available, back out (do serial build) */
1490 if (pcxt->seg == NULL)
1491 {
1492 if (IsMVCCSnapshot(snapshot))
1493 UnregisterSnapshot(snapshot);
1496 return;
1497 }
1498
1499 /* Store shared build state, for which we reserved space */
1500 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1501 /* Initialize immutable state */
1502 btshared->heaprelid = RelationGetRelid(btspool->heap);
1503 btshared->indexrelid = RelationGetRelid(btspool->index);
1504 btshared->isunique = btspool->isunique;
1505 btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1506 btshared->isconcurrent = isconcurrent;
1507 btshared->scantuplesortstates = scantuplesortstates;
1508 btshared->queryid = pgstat_get_my_query_id();
1510 SpinLockInit(&btshared->mutex);
1511 /* Initialize mutable state */
1512 btshared->nparticipantsdone = 0;
1513 btshared->reltuples = 0.0;
1514 btshared->havedead = false;
1515 btshared->indtuples = 0.0;
1516 btshared->brokenhotchain = false;
1519 snapshot);
1520
1521 /*
1522 * Store shared tuplesort-private state, for which we reserved space.
1523 * Then, initialize opaque state using tuplesort routine.
1524 */
1525 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1526 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1527 pcxt->seg);
1528
1530 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1531
1532 /* Unique case requires a second spool, and associated shared state */
1533 if (!btspool->isunique)
1534 sharedsort2 = NULL;
1535 else
1536 {
1537 /*
1538 * Store additional shared tuplesort-private state, for which we
1539 * reserved space. Then, initialize opaque state using tuplesort
1540 * routine.
1541 */
1542 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1543 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1544 pcxt->seg);
1545
1547 }
1548
1549 /* Store query string for workers */
1551 {
1552 char *sharedquery;
1553
1554 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1557 }
1558
1559 /*
1560 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1561 * initialize.
1562 */
1563 walusage = shm_toc_allocate(pcxt->toc,
1564 mul_size(sizeof(WalUsage), pcxt->nworkers));
1565 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1566 bufferusage = shm_toc_allocate(pcxt->toc,
1567 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1568 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1569
1570 /* Launch workers, saving status for leader/caller */
1572 btleader->pcxt = pcxt;
1573 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1575 btleader->nparticipanttuplesorts++;
1576 btleader->btshared = btshared;
1577 btleader->sharedsort = sharedsort;
1578 btleader->sharedsort2 = sharedsort2;
1579 btleader->snapshot = snapshot;
1580 btleader->walusage = walusage;
1581 btleader->bufferusage = bufferusage;
1582
1583 /* If no workers were successfully launched, back out (do serial build) */
1584 if (pcxt->nworkers_launched == 0)
1585 {
1586 _bt_end_parallel(btleader);
1587 return;
1588 }
1589
1590 /* Save leader state now that it's clear build will be parallel */
1591 buildstate->btleader = btleader;
1592
1593 /* Join heap scan ourselves */
1596
1597 /*
1598 * Caller needs to wait for all launched workers when we return. Make
1599 * sure that the failure-to-start case will not hang forever.
1600 */
1602}

References _bt_end_parallel(), _bt_leader_participate_as_worker(), _bt_parallel_estimate_shared(), Assert, BTShared::brokenhotchain, BTLeader::btshared, BTLeader::bufferusage, ConditionVariableInit(), CreateParallelContext(), debug_query_string, DestroyParallelContext(), EnterParallelMode(), ParallelContext::estimator, ExitParallelMode(), fb(), GetTransactionSnapshot(), BTShared::havedead, BTShared::heaprelid, BTShared::indexrelid, BTShared::indtuples, InitializeParallelDSM(), BTShared::isconcurrent, IsMVCCSnapshot, BTShared::isunique, LaunchParallelWorkers(), mul_size(), BTShared::mutex, BTShared::nparticipantsdone, BTLeader::nparticipanttuplesorts, BTShared::nulls_not_distinct, ParallelContext::nworkers, ParallelContext::nworkers_launched, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelTableScanFromBTShared, BTLeader::pcxt, pgstat_get_my_query_id(), BTShared::queryid, RegisterSnapshot(), RelationGetRelid, BTShared::reltuples, BTShared::scantuplesortstates, ParallelContext::seg, BTLeader::sharedsort, BTLeader::sharedsort2, shm_toc_allocate(), shm_toc_estimate_chunk, shm_toc_estimate_keys, shm_toc_insert(), BTLeader::snapshot, SnapshotAny, SpinLockInit, table_parallelscan_initialize(), ParallelContext::toc, tuplesort_estimate_shared(), tuplesort_initialize_shared(), UnregisterSnapshot(), WaitForParallelWorkersToAttach(), BTLeader::walusage, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().
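
The function above follows the backend's standard two-phase dynamic shared memory protocol: every participant first registers its space needs against pcxt->estimator, and only after InitializeParallelDSM() has created the segment does it carve out each chunk and publish it under a well-known key. The following minimal sketch (hypothetical key and payload struct, not taken from nbtsort.c; it assumes the caller has already entered parallel mode and created the ParallelContext) illustrates that pattern:

#include "postgres.h"
#include "access/parallel.h"
#include "storage/shm_toc.h"

#define MY_SHARED_KEY UINT64CONST(0xB000000000000001)	/* hypothetical key */

typedef struct MySharedState	/* hypothetical payload */
{
    int         nworkers;
} MySharedState;

static void
setup_shared_state(ParallelContext *pcxt)
{
    MySharedState *shared;

    /* Phase 1: reserve space before the DSM segment exists */
    shm_toc_estimate_chunk(&pcxt->estimator, sizeof(MySharedState));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Create the segment once everyone has asked for space */
    InitializeParallelDSM(pcxt);

    /* Phase 2: allocate the reserved chunk and publish it under the key */
    shared = (MySharedState *) shm_toc_allocate(pcxt->toc, sizeof(MySharedState));
    shared->nworkers = pcxt->nworkers;
    shm_toc_insert(pcxt->toc, MY_SHARED_KEY, shared);
}

Workers later retrieve such a chunk with shm_toc_lookup(toc, MY_SHARED_KEY, false), just as _bt_parallel_build_main() does for the real keys defined in this file.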

◆ _bt_blnewpage()

static BulkWriteBuffer _bt_blnewpage ( BTWriteState * wstate,
uint32  level 
)
static

Definition at line 609 of file nbtsort.c.

610{
612 Page page;
613 BTPageOpaque opaque;
614
615 buf = smgr_bulk_get_buf(wstate->bulkstate);
616 page = (Page) buf;
617
618 /* Zero the page and set up standard page header info */
619 _bt_pageinit(page, BLCKSZ);
620
621 /* Initialize BT opaque state */
622 opaque = BTPageGetOpaque(page);
623 opaque->btpo_prev = opaque->btpo_next = P_NONE;
624 opaque->btpo_level = level;
625 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
626 opaque->btpo_cycleid = 0;
627
628 /* Make the P_HIKEY line pointer appear allocated */
629 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
630
631 return buf;
632}

References _bt_pageinit(), BTP_LEAF, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, buf, fb(), P_NONE, and smgr_bulk_get_buf().

Referenced by _bt_buildadd(), and _bt_pagestate().

◆ _bt_blwritepage()

static void _bt_blwritepage ( BTWriteState * wstate,
BulkWriteBuffer  buf,
BlockNumber  blkno 
)
static

Definition at line 638 of file nbtsort.c.

639{
640 smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
641 /* smgr_bulk_write took ownership of 'buf' */
642}

References buf, fb(), and smgr_bulk_write().

Referenced by _bt_buildadd(), and _bt_uppershutdown().

◆ _bt_build_callback()

static void _bt_build_callback ( Relation  index,
ItemPointer  tid,
Datum * values,
bool * isnull,
bool  tupleIsAlive,
void * state 
)
static

Definition at line 580 of file nbtsort.c.

586{
588
589 /*
590 * insert the index tuple into the appropriate spool file for subsequent
591 * processing
592 */
593 if (tupleIsAlive || buildstate->spool2 == NULL)
594 _bt_spool(buildstate->spool, tid, values, isnull);
595 else
596 {
597 /* dead tuples are put into spool2 */
598 buildstate->havedead = true;
599 _bt_spool(buildstate->spool2, tid, values, isnull);
600 }
601
602 buildstate->indtuples += 1;
603}

References _bt_spool(), fb(), and values.

Referenced by _bt_parallel_scan_and_sort(), and _bt_spools_heapscan().
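
This callback is not invoked directly by nbtree code; it is handed to the table access method, which calls it once for every tuple returned by the heap scan. A minimal sketch of how the spool-filling scan is started (mirroring the call made in _bt_spools_heapscan(); the argument lines dropped from that listing are filled in here as an assumption):

reltuples = table_index_build_scan(heap, index, indexInfo,
                                   true,	/* allow_sync */
                                   true,	/* report progress */
                                   _bt_build_callback,
                                   (void *) buildstate,
                                   NULL);	/* no pre-existing scan */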

◆ _bt_buildadd()

static void _bt_buildadd ( BTWriteState * wstate,
BTPageState * state,
IndexTuple  itup,
Size  truncextra 
)
static

Definition at line 786 of file nbtsort.c.

788{
790 Page npage;
794 Size pgspc;
795 Size itupsz;
796 bool isleaf;
797
798 /*
799 * This is a handy place to check for cancel interrupts during the btree
800 * load phase of index creation.
801 */
803
804 nbuf = state->btps_buf;
805 npage = (Page) nbuf;
806 nblkno = state->btps_blkno;
807 last_off = state->btps_lastoff;
808 last_truncextra = state->btps_lastextra;
809 state->btps_lastextra = truncextra;
810
811 pgspc = PageGetFreeSpace(npage);
812 itupsz = IndexTupleSize(itup);
814 /* Leaf case has slightly different rules due to suffix truncation */
815 isleaf = (state->btps_level == 0);
816
817 /*
818 * Check whether the new item can fit on a btree page on current level at
819 * all.
820 *
821 * Every newly built index will treat heap TID as part of the keyspace,
822 * which imposes the requirement that new high keys must occasionally have
823 * a heap TID appended within _bt_truncate(). That may leave a new pivot
824 * tuple one or two MAXALIGN() quantums larger than the original
825 * firstright tuple it's derived from. v4 deals with the problem by
826 * decreasing the limit on the size of tuples inserted on the leaf level
827 * by the same small amount. Enforce the new v4+ limit on the leaf level,
828 * and the old limit on internal levels, since pivot tuples may need to
829 * make use of the reserved space. This should never fail on internal
830 * pages.
831 */
833 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
834 itup);
835
836 /*
837 * Check to see if current page will fit new item, with space left over to
838 * append a heap TID during suffix truncation when page is a leaf page.
839 *
840 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
841 * high key with heap TID when finishing off a leaf page, since we rely on
842 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
843 * internal pages we can always fit 3 pivot tuples with larger internal
844 * page tuple limit (includes page high key).
845 *
846 * Most of the time, a page is only "full" in the sense that the soft
847 * fillfactor-wise limit has been exceeded. However, we must always leave
848 * at least two items plus a high key on each page before starting a new
849 * page. Disregard fillfactor and insert on "full" current page if we
850 * don't have the minimum number of items yet. (Note that we deliberately
851 * assume that suffix truncation neither enlarges nor shrinks new high key
852 * when applying soft limit, except when last tuple has a posting list.)
853 */
855 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
856 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
857 {
858 /*
859 * Finish off the page and write it out.
860 */
862 Page opage = npage;
864 ItemId ii;
865 ItemId hii;
867
868 /* Create new page of same level */
869 nbuf = _bt_blnewpage(wstate, state->btps_level);
870 npage = (Page) nbuf;
871
872 /* and assign it a page position */
873 nblkno = wstate->btws_pages_alloced++;
874
875 /*
876 * We copy the last item on the page into the new page, and then
877 * rearrange the old page so that the 'last item' becomes its high key
878 * rather than a true data item. There had better be at least two
879 * items on the page already, else the page would be empty of useful
880 * data.
881 */
886 !isleaf);
887
888 /*
889 * Move 'last' into the high key position on opage. _bt_blnewpage()
890 * allocated empty space for a line pointer when opage was first
891 * created, so this is a matter of rearranging already-allocated space
892 * on page, and initializing high key line pointer. (Actually, leaf
893 * pages must also swap oitup with a truncated version of oitup, which
894 * is sometimes larger than oitup, though never by more than the space
895 * needed to append a heap TID.)
896 */
898 *hii = *ii;
899 ItemIdSetUnused(ii); /* redundant */
900 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
901
902 if (isleaf)
903 {
904 IndexTuple lastleft;
905 IndexTuple truncated;
906
907 /*
908 * Truncate away any unneeded attributes from high key on leaf
909 * level. This is only done at the leaf level because downlinks
910 * in internal pages are either negative infinity items, or get
911 * their contents from copying from one level down. See also:
912 * _bt_split().
913 *
914 * We don't try to bias our choice of split point to make it more
915 * likely that _bt_truncate() can truncate away more attributes,
916 * whereas the split point used within _bt_split() is chosen much
917 * more delicately. Even still, the lastleft and firstright
918 * tuples passed to _bt_truncate() here are at least not fully
919 * equal to each other when deduplication is used, unless there is
920 * a large group of duplicates (also, unique index builds usually
921 * have few or no spool2 duplicates). When the split point is
922 * between two unequal tuples, _bt_truncate() will avoid including
923 * a heap TID in the new high key, which is the most important
924 * benefit of suffix truncation.
925 *
926 * Overwrite the old item with new truncated high key directly.
927 * oitup is already located at the physical beginning of tuple
928 * space, so this should directly reuse the existing tuple space.
929 */
931 lastleft = (IndexTuple) PageGetItem(opage, ii);
932
934 truncated = _bt_truncate(wstate->index, lastleft, oitup,
935 wstate->inskey);
936 if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
937 elog(ERROR, "failed to add high key to the index page");
938 pfree(truncated);
939
940 /* oitup should continue to point to the page's high key */
943 }
944
945 /*
946 * Link the old page into its parent, using its low key. If we don't
947 * have a parent, we have to create one; this adds a new btree level.
948 */
949 if (state->btps_next == NULL)
950 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
951
952 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
954 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
956 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
958 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
959 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
960 pfree(state->btps_lowkey);
961
962 /*
963 * Save a copy of the high key from the old page. It is also the low
964 * key for the new page.
965 */
966 state->btps_lowkey = CopyIndexTuple(oitup);
967
968 /*
969 * Set the sibling links for both pages.
970 */
971 {
974
975 oopaque->btpo_next = nblkno;
976 nopaque->btpo_prev = oblkno;
977 nopaque->btpo_next = P_NONE; /* redundant */
978 }
979
980 /*
981 * Write out the old page. _bt_blwritepage takes ownership of the
982 * 'opage' buffer.
983 */
985
986 /*
987 * Reset last_off to point to new page
988 */
990 }
991
992 /*
993 * By here, either original page is still the current page, or a new page
994 * was created that became the current page. Either way, the current page
995 * definitely has space for new item.
996 *
997 * If the new item is the first for its page, it must also be the first
998 * item on its entire level. On later same-level pages, a low key for a
999 * page will be copied from the prior page in the code above. Generate a
1000 * minus infinity low key here instead.
1001 */
1002 if (last_off == P_HIKEY)
1003 {
1004 Assert(state->btps_lowkey == NULL);
1005 state->btps_lowkey = palloc0_object(IndexTupleData);
1006 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1007 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1008 }
1009
1010 /*
1011 * Add the new item into the current page.
1012 */
1014 _bt_sortaddtup(npage, itupsz, itup, last_off,
1015 !isleaf && last_off == P_FIRSTKEY);
1016
1017 state->btps_buf = nbuf;
1018 state->btps_blkno = nblkno;
1019 state->btps_lastoff = last_off;
1020}

References _bt_blnewpage(), _bt_blwritepage(), _bt_buildadd(), _bt_check_third_page(), _bt_pagestate(), _bt_sortaddtup(), _bt_truncate(), Assert, BTMaxItemSize, BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), BTreeTupleSetNAtts(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), elog, ERROR, fb(), IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), ItemIdGetLength, ItemIdSetUnused, MAXALIGN, OffsetNumberNext, OffsetNumberPrev, P_FIRSTKEY, P_HIKEY, P_LEFTMOST, P_NONE, PageGetFreeSpace(), PageGetItem(), PageGetItemId(), PageIndexTupleOverwrite(), palloc0_object, pfree(), and unlikely.

Referenced by _bt_buildadd(), _bt_load(), _bt_sort_dedup_finish_pending(), and _bt_uppershutdown().
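
As a worked example of the space tests above, assuming a typical 64-bit build where MAXALIGN() rounds up to 8 bytes: sizeof(ItemPointerData) is 6, so MAXALIGN(sizeof(ItemPointerData)) = 8, and the hard test closes a leaf page once pgspc < itupsz + 8. The soft, fillfactor-based test (pgspc + last_truncextra < btps_full) is applied only once the page already holds more than one data item (the last_off > P_FIRSTKEY condition), which preserves the minimum of two items plus a high key mentioned in the comment.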

◆ _bt_end_parallel()

static void _bt_end_parallel ( BTLeader * btleader)
static

Definition at line 1608 of file nbtsort.c.

1609{
1610 int i;
1611
1612 /* Shutdown worker processes */
1614
1615 /*
1616 * Next, accumulate WAL usage. (This must wait for the workers to finish,
1617 * or we might get incomplete data.)
1618 */
1619 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1620 InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1621
1622 /* Free last reference to MVCC snapshot, if one was used */
1623 if (IsMVCCSnapshot(btleader->snapshot))
1624 UnregisterSnapshot(btleader->snapshot);
1625 DestroyParallelContext(btleader->pcxt);
1627}

References BTLeader::bufferusage, DestroyParallelContext(), ExitParallelMode(), i, InstrAccumParallelQuery(), IsMVCCSnapshot, ParallelContext::nworkers_launched, BTLeader::pcxt, BTLeader::snapshot, UnregisterSnapshot(), WaitForParallelWorkersToFinish(), and BTLeader::walusage.

Referenced by _bt_begin_parallel(), and btbuild().

◆ _bt_leader_participate_as_worker()

static void _bt_leader_participate_as_worker ( BTBuildState * buildstate)
static

Definition at line 1688 of file nbtsort.c.

1689{
1690 BTLeader *btleader = buildstate->btleader;
1693 int sortmem;
1694
1695 /* Allocate memory and initialize private spool */
1697 leaderworker->heap = buildstate->spool->heap;
1698 leaderworker->index = buildstate->spool->index;
1699 leaderworker->isunique = buildstate->spool->isunique;
1700 leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1701
1702 /* Initialize second spool, if required */
1703 if (!btleader->btshared->isunique)
1705 else
1706 {
1707 /* Allocate memory for worker's own private secondary spool */
1709
1710 /* Initialize worker's own secondary spool */
1711 leaderworker2->heap = leaderworker->heap;
1712 leaderworker2->index = leaderworker->index;
1713 leaderworker2->isunique = false;
1714 }
1715
1716 /*
1717 * Might as well use reliable figure when doling out maintenance_work_mem
1718 * (when requested number of workers were not launched, this will be
1719 * somewhat higher than it is for other workers).
1720 */
1722
1723 /* Perform work common to all participants */
1725 btleader->sharedsort, btleader->sharedsort2,
1726 sortmem, true);
1727
1728#ifdef BTREE_BUILD_STATS
1730 {
1731 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1732 ResetUsage();
1733 }
1734#endif /* BTREE_BUILD_STATS */
1735}

References _bt_parallel_scan_and_sort(), BTLeader::btshared, fb(), BTShared::isunique, log_btree_build_stats, maintenance_work_mem, BTLeader::nparticipanttuplesorts, palloc0_object, ResetUsage(), BTLeader::sharedsort, BTLeader::sharedsort2, and ShowUsage().

Referenced by _bt_begin_parallel().
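
The linked assignment dropped just before the _bt_parallel_scan_and_sort() call is the even split referred to by the comment above: sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts. As a worked example with assumed settings (not from the source), maintenance_work_mem = 64MB and two launched workers plus the participating leader give nparticipanttuplesorts = 3, so each participant's tuplesort is started with 65536 / 3 = 21845 kB.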

◆ _bt_leafbuild()

static void _bt_leafbuild ( BTSpool * btspool,
BTSpool * btspool2 
)
static

Definition at line 539 of file nbtsort.c.

540{
542
543#ifdef BTREE_BUILD_STATS
545 {
546 ShowUsage("BTREE BUILD (Spool) STATISTICS");
547 ResetUsage();
548 }
549#endif /* BTREE_BUILD_STATS */
550
551 /* Execute the sort */
554 tuplesort_performsort(btspool->sortstate);
555 if (btspool2)
556 {
560 }
561
562 wstate.heap = btspool->heap;
563 wstate.index = btspool->index;
564 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
565 /* _bt_mkscankey() won't set allequalimage without metapage */
566 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
567
568 /* reserve the metapage */
569 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
570
574}

References _bt_allequalimage(), _bt_load(), _bt_mkscankey(), BTREE_METAPAGE, fb(), log_btree_build_stats, pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, ResetUsage(), ShowUsage(), and tuplesort_performsort().

Referenced by btbuild().
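
Several linked calls in the listing above were dropped during extraction; consistent with the References list above, they report the sort subphases to the progress facility and hand the sorted spools to _bt_load(). A sketch of those omitted steps (a reconstruction, not verbatim source):

pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                             PROGRESS_BTREE_PHASE_PERFORMSORT_1);
tuplesort_performsort(btspool->sortstate);
if (btspool2)
{
    pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                                 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    tuplesort_performsort(btspool2->sortstate);
}
pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                             PROGRESS_BTREE_PHASE_LEAF_LOAD);
_bt_load(&wstate, btspool, btspool2);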

◆ _bt_load()

static void _bt_load ( BTWriteState * wstate,
BTSpool * btspool,
BTSpool * btspool2 
)
static

Definition at line 1136 of file nbtsort.c.

1137{
1139 bool merge = (btspool2 != NULL);
1140 IndexTuple itup,
1141 itup2 = NULL;
1142 bool load1;
1144 int i,
1146 SortSupport sortKeys;
1147 int64 tuples_done = 0;
1148 bool deduplicate;
1149
1150 wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1151
1152 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1154
1155 if (merge)
1156 {
1157 /*
1158 * Another BTSpool for dead tuples exists. Now we have to merge
1159 * btspool and btspool2.
1160 */
1161
1162 /* the preparation of merge */
1163 itup = tuplesort_getindextuple(btspool->sortstate, true);
1164 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1165
1166 /* Prepare SortSupport data for each column */
1167 sortKeys = palloc0_array(SortSupportData, keysz);
1168
1169 for (i = 0; i < keysz; i++)
1170 {
1171 SortSupport sortKey = sortKeys + i;
1172 ScanKey scanKey = wstate->inskey->scankeys + i;
1173 bool reverse;
1174
1175 sortKey->ssup_cxt = CurrentMemoryContext;
1176 sortKey->ssup_collation = scanKey->sk_collation;
1177 sortKey->ssup_nulls_first =
1178 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1179 sortKey->ssup_attno = scanKey->sk_attno;
1180 /* Abbreviation is not supported here */
1181 sortKey->abbreviate = false;
1182
1183 Assert(sortKey->ssup_attno != 0);
1184
1185 reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1186
1188 }
1189
1190 for (;;)
1191 {
1192 load1 = true; /* load BTSpool next ? */
1193 if (itup2 == NULL)
1194 {
1195 if (itup == NULL)
1196 break;
1197 }
1198 else if (itup != NULL)
1199 {
1200 int32 compare = 0;
1201
1202 for (i = 1; i <= keysz; i++)
1203 {
1204 SortSupport entry;
1206 attrDatum2;
1207 bool isNull1,
1208 isNull2;
1209
1210 entry = sortKeys + i - 1;
1213
1216 entry);
1217 if (compare > 0)
1218 {
1219 load1 = false;
1220 break;
1221 }
1222 else if (compare < 0)
1223 break;
1224 }
1225
1226 /*
1227 * If key values are equal, we sort on ItemPointer. This is
1228 * required for btree indexes, since heap TID is treated as an
1229 * implicit last key attribute in order to ensure that all
1230 * keys in the index are physically unique.
1231 */
1232 if (compare == 0)
1233 {
1234 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1235 Assert(compare != 0);
1236 if (compare > 0)
1237 load1 = false;
1238 }
1239 }
1240 else
1241 load1 = false;
1242
1243 /* When we see first tuple, create first index page */
1244 if (state == NULL)
1246
1247 if (load1)
1248 {
1249 _bt_buildadd(wstate, state, itup, 0);
1250 itup = tuplesort_getindextuple(btspool->sortstate, true);
1251 }
1252 else
1253 {
1255 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1256 }
1257
1258 /* Report progress */
1260 ++tuples_done);
1261 }
1262 pfree(sortKeys);
1263 }
1264 else if (deduplicate)
1265 {
1266 /* merge is unnecessary, deduplicate into posting lists */
1268
1270 dstate->deduplicate = true; /* unused */
1271 dstate->nmaxitems = 0; /* unused */
1272 dstate->maxpostingsize = 0; /* set later */
1273 /* Metadata about base tuple of current pending posting list */
1274 dstate->base = NULL;
1275 dstate->baseoff = InvalidOffsetNumber; /* unused */
1276 dstate->basetupsize = 0;
1277 /* Metadata about current pending posting list TIDs */
1278 dstate->htids = NULL;
1279 dstate->nhtids = 0;
1280 dstate->nitems = 0;
1281 dstate->phystupsize = 0; /* unused */
1282 dstate->nintervals = 0; /* unused */
1283
1284 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1285 true)) != NULL)
1286 {
1287 /* When we see first tuple, create first index page */
1288 if (state == NULL)
1289 {
1291
1292 /*
1293 * Limit size of posting list tuples to 1/10 space we want to
1294 * leave behind on the page, plus space for final item's line
1295 * pointer. This is equal to the space that we'd like to
1296 * leave behind on each leaf page when fillfactor is 90,
1297 * allowing us to get close to fillfactor% space utilization
1298 * when there happen to be a great many duplicates. (This
1299 * makes higher leaf fillfactor settings ineffective when
1300 * building indexes that have many duplicates, but packing
1301 * leaf pages full with few very large tuples doesn't seem
1302 * like a useful goal.)
1303 */
1304 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1305 sizeof(ItemIdData);
1306 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1307 dstate->maxpostingsize <= INDEX_SIZE_MASK);
1308 dstate->htids = palloc(dstate->maxpostingsize);
1309
1310 /* start new pending posting list with itup copy */
1313 }
1314 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1315 itup) > keysz &&
1317 {
1318 /*
1319 * Tuple is equal to base tuple of pending posting list. Heap
1320 * TID from itup has been saved in state.
1321 */
1322 }
1323 else
1324 {
1325 /*
1326 * Tuple is not equal to pending posting list tuple, or
1327 * _bt_dedup_save_htid() opted to not merge current item into
1328 * pending posting list.
1329 */
1331 pfree(dstate->base);
1332
1333 /* start new pending posting list with itup copy */
1336 }
1337
1338 /* Report progress */
1340 ++tuples_done);
1341 }
1342
1343 if (state)
1344 {
1345 /*
1346 * Handle the last item (there must be a last item when the
1347 * tuplesort returned one or more tuples)
1348 */
1350 pfree(dstate->base);
1351 pfree(dstate->htids);
1352 }
1353
1354 pfree(dstate);
1355 }
1356 else
1357 {
1358 /* merging and deduplication are both unnecessary */
1359 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1360 true)) != NULL)
1361 {
1362 /* When we see first tuple, create first index page */
1363 if (state == NULL)
1365
1366 _bt_buildadd(wstate, state, itup, 0);
1367
1368 /* Report progress */
1370 ++tuples_done);
1371 }
1372 }
1373
1374 /* Close down final pages and write the metapage */
1376 smgr_bulk_finish(wstate->bulkstate);
1377}

References _bt_buildadd(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_keep_natts_fast(), _bt_pagestate(), _bt_sort_dedup_finish_pending(), _bt_uppershutdown(), ApplySortComparator(), Assert, BTGetDeduplicateItems, BTMaxItemSize, compare(), CopyIndexTuple(), CurrentMemoryContext, fb(), i, index_getattr(), INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemPointerCompare(), MAIN_FORKNUM, MAXALIGN_DOWN, merge(), palloc(), palloc0_array, palloc_object, pfree(), pgstat_progress_update_param(), PrepareSortSupportFromIndexRel(), PROGRESS_CREATEIDX_TUPLES_DONE, RelationGetDescr, SK_BT_DESC, SK_BT_NULLS_FIRST, smgr_bulk_finish(), smgr_bulk_start_rel(), IndexTupleData::t_tid, and tuplesort_getindextuple().

Referenced by _bt_leafbuild().

◆ _bt_pagestate()

static BTPageState * _bt_pagestate ( BTWriteState * wstate,
uint32  level 
)
static

Definition at line 649 of file nbtsort.c.

650{
652
653 /* create initial page for level */
654 state->btps_buf = _bt_blnewpage(wstate, level);
655
656 /* and assign it a page position */
657 state->btps_blkno = wstate->btws_pages_alloced++;
658
659 state->btps_lowkey = NULL;
660 /* initialize lastoff so first item goes into P_FIRSTKEY */
661 state->btps_lastoff = P_HIKEY;
662 state->btps_lastextra = 0;
663 state->btps_level = level;
664 /* set "full" threshold based on level. See notes at head of file. */
665 if (level > 0)
666 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
667 else
668 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
669
670 /* no parent level, yet */
671 state->btps_next = NULL;
672
673 return state;
674}

References _bt_blnewpage(), BTGetTargetPageFreeSpace, BTREE_NONLEAF_FILLFACTOR, fb(), P_HIKEY, and palloc0_object.

Referenced by _bt_buildadd(), and _bt_load().
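
As a worked example of the "full" thresholds chosen above, with the default BLCKSZ of 8192 bytes: BTREE_NONLEAF_FILLFACTOR is 70, so an internal-level page is treated as full once less than 8192 * (100 - 70) / 100 = 2457 bytes of free space would remain, while a leaf page built with the default leaf fillfactor of 90 gets BTGetTargetPageFreeSpace() = 8192 * 10 / 100 = 819 bytes as its target.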

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment * seg,
shm_toc * toc 
)

Definition at line 1741 of file nbtsort.c.

1742{
1743 char *sharedquery;
1746 BTShared *btshared;
1747 Sharedsort *sharedsort;
1748 Sharedsort *sharedsort2;
1749 Relation heapRel;
1750 Relation indexRel;
1753 WalUsage *walusage;
1754 BufferUsage *bufferusage;
1755 int sortmem;
1756
1757#ifdef BTREE_BUILD_STATS
1759 ResetUsage();
1760#endif /* BTREE_BUILD_STATS */
1761
1762 /*
1763 * The only possible status flag that can be set to the parallel worker is
1764 * PROC_IN_SAFE_IC.
1765 */
1766 Assert((MyProc->statusFlags == 0) ||
1768
1769 /* Set debug_query_string for individual workers first */
1772
1773 /* Report the query string from leader */
1775
1776 /* Look up nbtree shared state */
1777 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1778
1779 /* Open relations using lock modes known to be obtained by index.c */
1780 if (!btshared->isconcurrent)
1781 {
1784 }
1785 else
1786 {
1789 }
1790
1791 /* Track query ID */
1792 pgstat_report_query_id(btshared->queryid, false);
1793
1794 /* Open relations within worker */
1795 heapRel = table_open(btshared->heaprelid, heapLockmode);
1796 indexRel = index_open(btshared->indexrelid, indexLockmode);
1797
1798 /* Initialize worker's own spool */
1800 btspool->heap = heapRel;
1801 btspool->index = indexRel;
1802 btspool->isunique = btshared->isunique;
1803 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1804
1805 /* Look up shared state private to tuplesort.c */
1806 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1807 tuplesort_attach_shared(sharedsort, seg);
1808 if (!btshared->isunique)
1809 {
1810 btspool2 = NULL;
1811 sharedsort2 = NULL;
1812 }
1813 else
1814 {
1815 /* Allocate memory for worker's own private secondary spool */
1817
1818 /* Initialize worker's own secondary spool */
1819 btspool2->heap = btspool->heap;
1820 btspool2->index = btspool->index;
1821 btspool2->isunique = false;
1822 /* Look up shared state private to tuplesort.c */
1823 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1824 tuplesort_attach_shared(sharedsort2, seg);
1825 }
1826
1827 /* Prepare to track buffer usage during parallel execution */
1829
1830 /* Perform sorting of spool, and possibly a spool2 */
1832 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1833 sharedsort2, sortmem, false);
1834
1835 /* Report WAL/buffer usage during parallel execution */
1836 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1837 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1839 &walusage[ParallelWorkerNumber]);
1840
1841#ifdef BTREE_BUILD_STATS
1843 {
1844 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1845 ResetUsage();
1846 }
1847#endif /* BTREE_BUILD_STATS */
1848
1849 index_close(indexRel, indexLockmode);
1850 table_close(heapRel, heapLockmode);
1851}

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert, debug_query_string, fb(), BTShared::heaprelid, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTShared::nulls_not_distinct, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().
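
Two groups of linked lines were dropped from the listing above. The lock-mode branch chooses modes matching what the leader already holds via index.c, and the sortmem assignment gives each worker an even share of maintenance_work_mem; both are reconstructed here as a sketch consistent with the References list above:

if (!btshared->isconcurrent)
{
    /* plain CREATE INDEX */
    heapLockmode = ShareLock;
    indexLockmode = AccessExclusiveLock;
}
else
{
    /* CREATE INDEX CONCURRENTLY */
    heapLockmode = ShareUpdateExclusiveLock;
    indexLockmode = RowExclusiveLock;
}

/* ... later, just before _bt_parallel_scan_and_sort(): */
sortmem = maintenance_work_mem / btshared->scantuplesortstates;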

◆ _bt_parallel_estimate_shared()

static Size _bt_parallel_estimate_shared ( Relation  heap,
Snapshot  snapshot 
)
static

Definition at line 1634 of file nbtsort.c.

1635{
1636 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1637 return add_size(BUFFERALIGN(sizeof(BTShared)),
1638 table_parallelscan_estimate(heap, snapshot));
1639}

References add_size(), BUFFERALIGN, and table_parallelscan_estimate().

Referenced by _bt_begin_parallel().
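
In other words, the shared workspace is sized as BUFFERALIGN(sizeof(BTShared)) plus whatever table_parallelscan_estimate() reports for the parallel scan descriptor (which, when an MVCC snapshot is used, also includes room to serialize that snapshot), combined with the overflow-checked add_size(). BUFFERALIGN rounds up to the platform's buffer alignment, a multiple of 32 bytes on common builds, matching the alignment that shm_toc_allocate() hands out, so the scan descriptor located via ParallelTableScanFromBTShared() starts on a properly aligned boundary.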

◆ _bt_parallel_heapscan()

static double _bt_parallel_heapscan ( BTBuildState * buildstate,
bool * brokenhotchain 
)
static

Definition at line 1654 of file nbtsort.c.

1655{
1656 BTShared *btshared = buildstate->btleader->btshared;
1657 int nparticipanttuplesorts;
1658 double reltuples;
1659
1660 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1661 for (;;)
1662 {
1663 SpinLockAcquire(&btshared->mutex);
1664 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1665 {
1666 buildstate->havedead = btshared->havedead;
1667 buildstate->indtuples = btshared->indtuples;
1668 *brokenhotchain = btshared->brokenhotchain;
1669 reltuples = btshared->reltuples;
1670 SpinLockRelease(&btshared->mutex);
1671 break;
1672 }
1673 SpinLockRelease(&btshared->mutex);
1674
1677 }
1678
1680
1681 return reltuples;
1682}

References BTShared::brokenhotchain, ConditionVariableCancelSleep(), ConditionVariableSleep(), fb(), BTShared::havedead, BTShared::indtuples, BTShared::mutex, BTShared::nparticipantsdone, BTShared::reltuples, SpinLockAcquire, SpinLockRelease, and BTShared::workersdonecv.

Referenced by _bt_spools_heapscan().
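
The two linked calls dropped from the wait loop above are the condition-variable sleep and the final cleanup. Reconstructed as a sketch consistent with the References list above (the wait-event argument is an assumption, since the dropped line did not survive extraction):

/* inside the for (;;) loop, after releasing the spinlock */
ConditionVariableSleep(&btshared->workersdonecv,
                       WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);

/* after breaking out of the loop */
ConditionVariableCancelSleep();

Each worker increments nparticipantsdone under the same spinlock and then signals workersdonecv, as shown at the end of _bt_parallel_scan_and_sort() below.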

◆ _bt_parallel_scan_and_sort()

static void _bt_parallel_scan_and_sort ( BTSpool * btspool,
BTSpool * btspool2,
BTShared * btshared,
Sharedsort * sharedsort,
Sharedsort * sharedsort2,
int  sortmem,
bool  progress 
)
static

Definition at line 1866 of file nbtsort.c.

1869{
1872 TableScanDesc scan;
1873 double reltuples;
1874 IndexInfo *indexInfo;
1875
1876 /* Initialize local tuplesort coordination state */
1878 coordinate->isWorker = true;
1879 coordinate->nParticipants = -1;
1880 coordinate->sharedsort = sharedsort;
1881
1882 /* Begin "partial" tuplesort */
1883 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1884 btspool->index,
1885 btspool->isunique,
1886 btspool->nulls_not_distinct,
1889
1890 /*
1891 * Just as with serial case, there may be a second spool. If so, a
1892 * second, dedicated spool2 partial tuplesort is required.
1893 */
1894 if (btspool2)
1895 {
1897
1898 /*
1899 * We expect that the second one (for dead tuples) won't get very
1900 * full, so we give it only work_mem (unless sortmem is less for
1901 * worker). Worker processes are generally permitted to allocate
1902 * work_mem independently.
1903 */
1905 coordinate2->isWorker = true;
1906 coordinate2->nParticipants = -1;
1907 coordinate2->sharedsort = sharedsort2;
1908 btspool2->sortstate =
1909 tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1911 false);
1912 }
1913
1914 /* Fill in buildstate for _bt_build_callback() */
1915 buildstate.isunique = btshared->isunique;
1916 buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1917 buildstate.havedead = false;
1918 buildstate.heap = btspool->heap;
1919 buildstate.spool = btspool;
1920 buildstate.spool2 = btspool2;
1921 buildstate.indtuples = 0;
1922 buildstate.btleader = NULL;
1923
1924 /* Join parallel scan */
1925 indexInfo = BuildIndexInfo(btspool->index);
1926 indexInfo->ii_Concurrent = btshared->isconcurrent;
1927 scan = table_beginscan_parallel(btspool->heap,
1929 reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1931 &buildstate, scan);
1932
1933 /* Execute this worker's part of the sort */
1934 if (progress)
1937 tuplesort_performsort(btspool->sortstate);
1938 if (btspool2)
1939 {
1940 if (progress)
1943 tuplesort_performsort(btspool2->sortstate);
1944 }
1945
1946 /*
1947 * Done. Record ambuild statistics, and whether we encountered a broken
1948 * HOT chain.
1949 */
1950 SpinLockAcquire(&btshared->mutex);
1951 btshared->nparticipantsdone++;
1952 btshared->reltuples += reltuples;
1953 if (buildstate.havedead)
1954 btshared->havedead = true;
1955 btshared->indtuples += buildstate.indtuples;
1956 if (indexInfo->ii_BrokenHotChain)
1957 btshared->brokenhotchain = true;
1958 SpinLockRelease(&btshared->mutex);
1959
1960 /* Notify leader */
1962
1963 /* We can end tuplesorts immediately */
1964 tuplesort_end(btspool->sortstate);
1965 if (btspool2)
1966 tuplesort_end(btspool2->sortstate);
1967}

References _bt_build_callback(), BTShared::brokenhotchain, BuildIndexInfo(), ConditionVariableSignal(), fb(), BTShared::havedead, IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, BTShared::indtuples, BTShared::isconcurrent, BTShared::isunique, Min, BTShared::mutex, BTShared::nparticipantsdone, BTShared::nulls_not_distinct, palloc0_object, ParallelTableScanFromBTShared, pgstat_progress_update_param(), progress, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, PROGRESS_CREATEIDX_SUBPHASE, BTShared::reltuples, SpinLockAcquire, SpinLockRelease, table_beginscan_parallel(), table_index_build_scan(), tuplesort_begin_index_btree(), tuplesort_end(), TUPLESORT_NONE, tuplesort_performsort(), work_mem, and BTShared::workersdonecv.

Referenced by _bt_leader_participate_as_worker(), and _bt_parallel_build_main().

◆ _bt_slideleft()

static void _bt_slideleft ( Page  rightmostpage)
static

Definition at line 686 of file nbtsort.c.

687{
688 OffsetNumber off;
689 OffsetNumber maxoff;
691
693 Assert(maxoff >= P_FIRSTKEY);
695 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
696 {
698
699 *previi = *thisii;
700 previi = thisii;
701 }
702 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
703}

References Assert, fb(), OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_uppershutdown().

◆ _bt_sort_dedup_finish_pending()

static void _bt_sort_dedup_finish_pending ( BTWriteState * wstate,
BTPageState * state,
BTDedupState  dstate 
)
static

Definition at line 1030 of file nbtsort.c.

1032{
1033 Assert(dstate->nitems > 0);
1034
1035 if (dstate->nitems == 1)
1036 _bt_buildadd(wstate, state, dstate->base, 0);
1037 else
1038 {
1041
1042 /* form a tuple with a posting list */
1044 dstate->htids,
1045 dstate->nhtids);
1046 /* Calculate posting list overhead */
1049
1052 }
1053
1054 dstate->nmaxitems = 0;
1055 dstate->nhtids = 0;
1056 dstate->nitems = 0;
1057 dstate->phystupsize = 0;
1058}

References _bt_buildadd(), _bt_form_posting(), Assert, BTreeTupleGetPostingOffset(), fb(), IndexTupleSize(), and pfree().

Referenced by _bt_load().
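
The linked lines dropped from the multi-item branch above compute the overhead passed along as truncextra; consistent with the References list above, the newly formed posting tuple is added as _bt_buildadd(wstate, state, postingtuple, IndexTupleSize(postingtuple) - BTreeTupleGetPostingOffset(postingtuple)). That is, the extra space is the portion of the tuple occupied by the posting list itself, which a high key derived from it would not need to retain.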

◆ _bt_sortaddtup()

static void _bt_sortaddtup ( Page  page,
Size  itemsize,
const IndexTupleData * itup,
OffsetNumber  itup_off,
bool  newfirstdataitem 
)
static

Definition at line 717 of file nbtsort.c.

722{
724
726 {
727 trunctuple = *itup;
729 BTreeTupleSetNAtts(&trunctuple, 0, false);
730 itup = &trunctuple;
731 itemsize = sizeof(IndexTupleData);
732 }
733
734 if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
735 elog(ERROR, "failed to add item to the index page");
736}

References BTreeTupleSetNAtts(), elog, ERROR, fb(), InvalidOffsetNumber, PageAddItem, and IndexTupleData::t_info.

Referenced by _bt_buildadd().

◆ _bt_spool()

static void _bt_spool ( BTSpool * btspool,
const ItemPointerData * self,
const Datum * values,
const bool * isnull 
)
static

Definition at line 528 of file nbtsort.c.

529{
531 self, values, isnull);
532}

References fb(), tuplesort_putindextuplevalues(), and values.

Referenced by _bt_build_callback().

◆ _bt_spooldestroy()

static void _bt_spooldestroy ( BTSpool * btspool)
static

Definition at line 518 of file nbtsort.c.

519{
520 tuplesort_end(btspool->sortstate);
521 pfree(btspool);
522}

References fb(), pfree(), and tuplesort_end().

Referenced by _bt_spools_heapscan(), and btbuild().

◆ _bt_spools_heapscan()

static double _bt_spools_heapscan ( Relation  heap,
Relation  index,
BTBuildState * buildstate,
IndexInfo * indexInfo 
)
static

Definition at line 366 of file nbtsort.c.

368{
369 BTSpool *btspool = palloc0_object(BTSpool);
370 SortCoordinate coordinate = NULL;
371 double reltuples = 0;
372
373 /*
374 * We size the sort area as maintenance_work_mem rather than work_mem to
375 * speed index creation. This should be OK since a single backend can't
376 * run multiple index creations in parallel (see also: notes on
377 * parallelism and maintenance_work_mem below).
378 */
379 btspool->heap = heap;
380 btspool->index = index;
381 btspool->isunique = indexInfo->ii_Unique;
382 btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
383
384 /* Save as primary spool */
385 buildstate->spool = btspool;
386
387 /* Report table scan phase started */
388 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
389 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
390
391 /* Attempt to launch parallel worker scan when required */
392 if (indexInfo->ii_ParallelWorkers > 0)
393 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
394 indexInfo->ii_ParallelWorkers);
395
396 /*
397 * If parallel build requested and at least one worker process was
398 * successfully launched, set up coordination state
399 */
400 if (buildstate->btleader)
401 {
402 coordinate = palloc0_object(SortCoordinateData);
403 coordinate->isWorker = false;
404 coordinate->nParticipants =
405 buildstate->btleader->nparticipanttuplesorts;
406 coordinate->sharedsort = buildstate->btleader->sharedsort;
407 }
408
409 /*
410 * Begin serial/leader tuplesort.
411 *
412 * In cases where parallelism is involved, the leader receives the same
413 * share of maintenance_work_mem as a serial sort (it is generally treated
414 * in the same way as a serial sort once we return). Parallel worker
415 * Tuplesortstates will have received only a fraction of
416 * maintenance_work_mem, though.
417 *
418 * We rely on the lifetime of the Leader Tuplesortstate almost not
419 * overlapping with any worker Tuplesortstate's lifetime. There may be
420 * some small overlap, but that's okay because we rely on leader
421 * Tuplesortstate only allocating a small, fixed amount of memory here.
422 * When its tuplesort_performsort() is called (by our caller), and
423 * significant amounts of memory are likely to be used, all workers must
424 * have already freed almost all memory held by their Tuplesortstates
425 * (they are about to go away completely, too). The overall effect is
426 * that maintenance_work_mem always represents an absolute high watermark
427 * on the amount of memory used by a CREATE INDEX operation, regardless of
428 * the use of parallelism or any other factor.
429 */
430 buildstate->spool->sortstate =
431 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
432 buildstate->nulls_not_distinct,
433 maintenance_work_mem, coordinate,
434 TUPLESORT_NONE);
435
436 /*
437 * If building a unique index, put dead tuples in a second spool to keep
438 * them out of the uniqueness check. We expect that the second spool (for
439 * dead tuples) won't get very full, so we give it only work_mem.
440 */
441 if (indexInfo->ii_Unique)
442 {
443 BTSpool *btspool2 = palloc0_object(BTSpool);
444 SortCoordinate coordinate2 = NULL;
445
446 /* Initialize secondary spool */
447 btspool2->heap = heap;
448 btspool2->index = index;
449 btspool2->isunique = false;
450 /* Save as secondary spool */
451 buildstate->spool2 = btspool2;
452
453 if (buildstate->btleader)
454 {
455 /*
456 * Set up non-private state that is passed to
457 * tuplesort_begin_index_btree() about the basic high level
458 * coordination of a parallel sort.
459 */
460 coordinate2 = palloc0_object(SortCoordinateData);
461 coordinate2->isWorker = false;
462 coordinate2->nParticipants =
463 buildstate->btleader->nparticipanttuplesorts;
464 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
465 }
466
467 /*
468 * We expect that the second one (for dead tuples) won't get very
469 * full, so we give it only work_mem
470 */
471 buildstate->spool2->sortstate =
472 tuplesort_begin_index_btree(heap, index, false, false, work_mem,
473 coordinate2, TUPLESORT_NONE);
474 }
475
476 /* Fill spool using either serial or parallel heap scan */
477 if (!buildstate->btleader)
478 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
479 _bt_build_callback, buildstate,
480 NULL);
481 else
482 reltuples = _bt_parallel_heapscan(buildstate,
483 &indexInfo->ii_BrokenHotChain);
484
485 /*
486 * Set the progress target for the next phase. Reset the block number
487 * values set by table_index_build_scan
488 */
489 {
490 const int progress_index[] = {
491 PROGRESS_CREATEIDX_TUPLES_TOTAL,
492 PROGRESS_SCAN_BLOCKS_DONE,
493 PROGRESS_SCAN_BLOCKS_TOTAL
494 };
495 const int64 progress_vals[] = {
496 buildstate->indtuples,
497 0, 0
498 };
499
500 pgstat_progress_update_multi_param(3, progress_index, progress_vals);
501 }
502
503 /* okay, all heap tuples are spooled */
504 if (buildstate->spool2 && !buildstate->havedead)
505 {
506 /* spool2 turns out to be unnecessary */
507 _bt_spooldestroy(buildstate->spool2);
508 buildstate->spool2 = NULL;
509 }
510
511 return reltuples;
512}

References _bt_begin_parallel(), _bt_build_callback(), _bt_parallel_heapscan(), _bt_spooldestroy(), fb(), IndexInfo::ii_BrokenHotChain, IndexInfo::ii_Concurrent, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_ParallelWorkers, IndexInfo::ii_Unique, maintenance_work_mem, palloc0_object, pgstat_progress_update_multi_param(), pgstat_progress_update_param(), PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_CREATEIDX_TUPLES_TOTAL, PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, table_index_build_scan(), tuplesort_begin_index_btree(), TUPLESORT_NONE, and work_mem.

Referenced by btbuild().
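
To make the maintenance_work_mem comments above concrete, here is a rough illustration with made-up numbers: worker tuplesorts each receive an equal slice of maintenance_work_mem, while the leader tuplesort opened in this function is given the full budget. This is a sketch only; the real per-participant split is applied where the participant tuplesorts are started, not in _bt_spools_heapscan().

#include <stdio.h>

int
main(void)
{
	int			maintenance_work_mem_kb = 64 * 1024;	/* assume 64MB */
	int			nparticipanttuplesorts = 3; /* e.g. 2 workers + leader-as-worker */

	int			per_participant_kb =
		maintenance_work_mem_kb / nparticipanttuplesorts;

	printf("each participant sort: ~%d kB; leader merge sort: %d kB\n",
		   per_participant_kb, maintenance_work_mem_kb);
	return 0;
}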

◆ _bt_uppershutdown()

static void _bt_uppershutdown ( BTWriteState wstate,
BTPageState state 
)
static

Definition at line 1064 of file nbtsort.c.

1065{
1066 BTPageState *s;
1067 BlockNumber rootblkno = P_NONE;
1068 uint32 rootlevel = 0;
1069 BulkWriteBuffer metabuf;
1070
1071 /*
1072 * Each iteration of this loop completes one more level of the tree.
1073 */
1074 for (s = state; s != NULL; s = s->btps_next)
1075 {
1076 BlockNumber blkno;
1077 BTPageOpaque opaque;
1078
1079 blkno = s->btps_blkno;
1080 opaque = BTPageGetOpaque((Page) s->btps_buf);
1081
1082 /*
1083 * We have to link the last page on this level to somewhere.
1084 *
1085 * If we're at the top, it's the root, so attach it to the metapage.
1086 * Otherwise, add an entry for it to its parent using its low key.
1087 * This may cause the last page of the parent level to split, but
1088 * that's not a problem -- we haven't gotten to it yet.
1089 */
1090 if (s->btps_next == NULL)
1091 {
1092 opaque->btpo_flags |= BTP_ROOT;
1093 rootblkno = blkno;
1094 rootlevel = s->btps_level;
1095 }
1096 else
1097 {
1098 Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1099 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1100 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1101 P_LEFTMOST(opaque));
1102 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1103 !P_LEFTMOST(opaque));
1104 BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1105 _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1106 pfree(s->btps_lowkey);
1107 s->btps_lowkey = NULL;
1108 }
1109
1110 /*
1111 * This is the rightmost page, so the ItemId array needs to be slid
1112 * back one slot. Then we can dump out the page.
1113 */
1114 _bt_slideleft((Page) s->btps_buf);
1115 _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1116 s->btps_buf = NULL; /* writepage took ownership of the buffer */
1117 }
1118
1119 /*
1120 * As the last step in the process, construct the metapage and make it
1121 * point to the new root (unless we had no data at all, in which case it's
1122 * set to point to "P_NONE"). This changes the index to the "valid" state
1123 * by filling in a valid magic number in the metapage.
1124 */
1125 metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1126 _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1127 wstate->inskey->allequalimage);
1128 _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1129}

References _bt_blwritepage(), _bt_buildadd(), _bt_initmetapage(), _bt_slideleft(), Assert, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTPageState::btps_blkno, BTPageState::btps_buf, BTPageState::btps_level, BTPageState::btps_lowkey, BTPageState::btps_next, BTREE_METAPAGE, BTreeTupleGetNAtts, BTreeTupleSetDownLink(), fb(), IndexRelationGetNumberOfKeyAttributes, P_LEFTMOST, P_NONE, pfree(), and smgr_bulk_get_buf().

Referenced by _bt_load().

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
IndexInfo indexInfo 
)

Definition at line 296 of file nbtsort.c.

297{
298 IndexBuildResult *result;
299 BTBuildState buildstate;
300 double reltuples;
301
302#ifdef BTREE_BUILD_STATS
303 if (log_btree_build_stats)
304 ResetUsage();
305#endif /* BTREE_BUILD_STATS */
306
307 buildstate.isunique = indexInfo->ii_Unique;
308 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
309 buildstate.havedead = false;
310 buildstate.heap = heap;
311 buildstate.spool = NULL;
312 buildstate.spool2 = NULL;
313 buildstate.indtuples = 0;
314 buildstate.btleader = NULL;
315
316 /*
317 * We expect to be called exactly once for any index relation. If that's
318 * not the case, big trouble's what we have.
319 */
320 if (RelationGetNumberOfBlocks(index) != 0)
321 elog(ERROR, "index \"%s\" already contains data",
322 RelationGetRelationName(index));
323
324 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
325
326 /*
327 * Finish the build by (1) completing the sort of the spool file, (2)
328 * inserting the sorted tuples into btree pages and (3) building the upper
329 * levels. Finally, it may also be necessary to end use of parallelism.
330 */
331 _bt_leafbuild(buildstate.spool, buildstate.spool2);
332 _bt_spooldestroy(buildstate.spool);
333 if (buildstate.spool2)
334 _bt_spooldestroy(buildstate.spool2);
335 if (buildstate.btleader)
336 _bt_end_parallel(buildstate.btleader);
337
338 result = palloc_object(IndexBuildResult);
339
340 result->heap_tuples = reltuples;
341 result->index_tuples = buildstate.indtuples;
342
343#ifdef BTREE_BUILD_STATS
344 if (log_btree_build_stats)
345 {
346 ShowUsage("BTREE BUILD STATS");
347 ResetUsage();
348 }
349#endif /* BTREE_BUILD_STATS */
350
351 return result;
352}

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), elog, ERROR, fb(), IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, log_btree_build_stats, palloc_object, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), and ShowUsage().

Referenced by bthandler().
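
For context, btbuild() is installed as the ambuild callback of the btree index access method. A minimal hedged sketch of such a handler follows; the function name here is hypothetical, and a real handler (such as nbtree's bthandler()) also fills in every other IndexAmRoutine callback and capability flag.

#include "postgres.h"
#include "fmgr.h"
#include "nodes/nodes.h"
#include "access/amapi.h"
#include "access/nbtree.h"

Datum
bthandler_sketch(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->ambuild = btbuild;	/* called once per CREATE INDEX/REINDEX */
	/* ... remaining callbacks and flags omitted in this sketch ... */

	PG_RETURN_POINTER(amroutine);
}

At CREATE INDEX time, the core code looks up the access method's handler, obtains the IndexAmRoutine, and invokes its ambuild callback, which is how the btbuild() entry point documented above gets reached.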