PostgreSQL Source Code  git master
nbtsort.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  * Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level. We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it to its
12  * parent level (starting a new parent level if necessary). When
13  * done, we write out each final page on each level, adding it to
14  * its parent level. When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree). The steady-state load
20  * factor for btrees is usually estimated at 70%. We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%. This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * We use the bulk smgr loading facility to bypass the buffer cache and
27  * WAL-log the pages efficiently.
28  *
29  * This code isn't concerned about the FSM at all. The caller is responsible
30  * for initializing that.
31  *
32  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
33  * Portions Copyright (c) 1994, Regents of the University of California
34  *
35  * IDENTIFICATION
36  * src/backend/access/nbtree/nbtsort.c
37  *
38  *-------------------------------------------------------------------------
39  */
40 
41 #include "postgres.h"
42 
43 #include "access/nbtree.h"
44 #include "access/parallel.h"
45 #include "access/relscan.h"
46 #include "access/table.h"
47 #include "access/xact.h"
48 #include "access/xloginsert.h"
49 #include "catalog/index.h"
50 #include "commands/progress.h"
51 #include "executor/instrument.h"
52 #include "miscadmin.h"
53 #include "pgstat.h"
54 #include "storage/bulk_write.h"
55 #include "tcop/tcopprot.h" /* pgrminclude ignore */
56 #include "utils/rel.h"
57 #include "utils/sortsupport.h"
58 #include "utils/tuplesort.h"
59 
60 
61 /* Magic numbers for parallel state sharing */
62 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
63 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
64 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
65 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
66 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
67 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
68 
69 /*
70  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
71  * parallel index builds. This may be useful as a debugging aid.
72 #undef DISABLE_LEADER_PARTICIPATION
73  */
74 
75 /*
76  * Status record for spooling/sorting phase. (Note we may have two of
77  * these due to the special requirements for uniqueness-checking with
78  * dead tuples.)
79  */
80 typedef struct BTSpool
81 {
82  Tuplesortstate *sortstate; /* state data for tuplesort.c */
85  bool isunique;
88 
89 /*
90  * Status for index builds performed in parallel. This is allocated in a
91  * dynamic shared memory segment. Note that there is a separate tuplesort TOC
92  * entry, private to tuplesort.c but allocated by this module on its behalf.
93  */
94 typedef struct BTShared
95 {
96  /*
97  * These fields are not modified during the sort. They primarily exist
98  * for the benefit of worker processes that need to create BTSpool state
99  * corresponding to that used by the leader.
100  */
103  bool isunique;
107 
108  /*
109  * workersdonecv is used to monitor the progress of workers. All parallel
110  * participants must indicate that they are done before leader can use
111  * mutable state that workers maintain during scan (and before leader can
112  * proceed to tuplesort_performsort()).
113  */
115 
116  /*
117  * mutex protects all fields before heapdesc.
118  *
119  * These fields contain status information of interest to B-Tree index
120  * builds that must work just the same when an index is built in parallel.
121  */
123 
124  /*
125  * Mutable state that is maintained by workers, and reported back to
126  * leader at end of parallel scan.
127  *
128  * nparticipantsdone is number of worker processes finished.
129  *
130  * reltuples is the total number of input heap tuples.
131  *
132  * havedead indicates if RECENTLY_DEAD tuples were encountered during
133  * build.
134  *
135  * indtuples is the total number of tuples that made it into the index.
136  *
137  * brokenhotchain indicates if any worker detected a broken HOT chain
138  * during build.
139  */
141  double reltuples;
142  bool havedead;
143  double indtuples;
145 
146  /*
147  * ParallelTableScanDescData data follows. Can't directly embed here, as
148  * implementations of the parallel table scan desc interface might need
149  * stronger alignment.
150  */
152 
153 /*
154  * Return pointer to a BTShared's parallel table scan.
155  *
156  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
157  * MAXALIGN.
158  */
159 #define ParallelTableScanFromBTShared(shared) \
160  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
161 
162 /*
163  * Status for leader in parallel index build.
164  */
165 typedef struct BTLeader
166 {
167  /* parallel context itself */
169 
170  /*
171  * nparticipanttuplesorts is the exact number of worker processes
172  * successfully launched, plus one leader process if it participates as a
173  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
174  * participating as a worker).
175  */
177 
178  /*
179  * Leader process convenience pointers to shared state (leader avoids TOC
180  * lookups).
181  *
182  * btshared is the shared state for entire build. sharedsort is the
183  * shared, tuplesort-managed state passed to each process tuplesort.
184  * sharedsort2 is the corresponding btspool2 shared state, used only when
185  * building unique indexes. snapshot is the snapshot used by the scan iff
186  * an MVCC snapshot is required.
187  */
195 
196 /*
197  * Working state for btbuild and its callback.
198  *
199  * When parallel CREATE INDEX is used, there is a BTBuildState for each
200  * participant.
201  */
202 typedef struct BTBuildState
203 {
204  bool isunique;
206  bool havedead;
209 
210  /*
211  * spool2 is needed only when the index is a unique index. Dead tuples are
212  * put into spool2 instead of spool in order to avoid uniqueness check.
213  */
215  double indtuples;
216 
217  /*
218  * btleader is only present when a parallel index build is performed, and
219  * only in the leader process. (Actually, only the leader has a
220  * BTBuildState. Workers have their own spool and spool2, though.)
221  */
224 
225 /*
226  * Status record for a btree page being built. We have one of these
227  * for each active tree level.
228  */
229 typedef struct BTPageState
230 {
231  BulkWriteBuffer btps_buf; /* workspace for page building */
232  BlockNumber btps_blkno; /* block # to write this page at */
233  IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
234  OffsetNumber btps_lastoff; /* last item offset loaded */
235  Size btps_lastextra; /* last item's extra posting list space */
236  uint32 btps_level; /* tree level (0 = leaf) */
237  Size btps_full; /* "full" if less than this much free space */
238  struct BTPageState *btps_next; /* link to parent level, if any */
240 
241 /*
242  * Overall status record for index writing phase.
243  */
244 typedef struct BTWriteState
245 {
249  BTScanInsert inskey; /* generic insertion scankey */
250  BlockNumber btws_pages_alloced; /* # pages allocated */
252 
253 
254 static double _bt_spools_heapscan(Relation heap, Relation index,
255  BTBuildState *buildstate, IndexInfo *indexInfo);
256 static void _bt_spooldestroy(BTSpool *btspool);
257 static void _bt_spool(BTSpool *btspool, ItemPointer self,
258  Datum *values, bool *isnull);
259 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
261  bool *isnull, bool tupleIsAlive, void *state);
262 static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
263 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
264 static void _bt_slideleft(Page rightmostpage);
265 static void _bt_sortaddtup(Page page, Size itemsize,
266  IndexTuple itup, OffsetNumber itup_off,
267  bool newfirstdataitem);
268 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
269  IndexTuple itup, Size truncextra);
270 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
272  BTDedupState dstate);
273 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
274 static void _bt_load(BTWriteState *wstate,
275  BTSpool *btspool, BTSpool *btspool2);
276 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
277  int request);
278 static void _bt_end_parallel(BTLeader *btleader);
279 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
280 static double _bt_parallel_heapscan(BTBuildState *buildstate,
281  bool *brokenhotchain);
282 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
283 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
284  BTShared *btshared, Sharedsort *sharedsort,
285  Sharedsort *sharedsort2, int sortmem,
286  bool progress);
287 
288 
289 /*
290  * btbuild() -- build a new btree index.
291  */
294 {
295  IndexBuildResult *result;
296  BTBuildState buildstate;
297  double reltuples;
298 
299 #ifdef BTREE_BUILD_STATS
301  ResetUsage();
302 #endif /* BTREE_BUILD_STATS */
303 
304  buildstate.isunique = indexInfo->ii_Unique;
305  buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
306  buildstate.havedead = false;
307  buildstate.heap = heap;
308  buildstate.spool = NULL;
309  buildstate.spool2 = NULL;
310  buildstate.indtuples = 0;
311  buildstate.btleader = NULL;
312 
313  /*
314  * We expect to be called exactly once for any index relation. If that's
315  * not the case, big trouble's what we have.
316  */
318  elog(ERROR, "index \"%s\" already contains data",
320 
321  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
322 
323  /*
324  * Finish the build by (1) completing the sort of the spool file, (2)
325  * inserting the sorted tuples into btree pages and (3) building the upper
326  * levels. Finally, it may also be necessary to end use of parallelism.
327  */
328  _bt_leafbuild(buildstate.spool, buildstate.spool2);
329  _bt_spooldestroy(buildstate.spool);
330  if (buildstate.spool2)
331  _bt_spooldestroy(buildstate.spool2);
332  if (buildstate.btleader)
333  _bt_end_parallel(buildstate.btleader);
334 
335  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
336 
337  result->heap_tuples = reltuples;
338  result->index_tuples = buildstate.indtuples;
339 
340 #ifdef BTREE_BUILD_STATS
342  {
343  ShowUsage("BTREE BUILD STATS");
344  ResetUsage();
345  }
346 #endif /* BTREE_BUILD_STATS */
347 
348  return result;
349 }
350 
351 /*
352  * Create and initialize one or two spool structures, and save them in caller's
353  * buildstate argument. May also fill-in fields within indexInfo used by index
354  * builds.
355  *
356  * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
357  * routine encapsulates all aspects of managing parallelism. Caller need only
358  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
359  *
360  * Returns the total number of heap tuples scanned.
361  */
362 static double
364  IndexInfo *indexInfo)
365 {
366  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
367  SortCoordinate coordinate = NULL;
368  double reltuples = 0;
369 
370  /*
371  * We size the sort area as maintenance_work_mem rather than work_mem to
372  * speed index creation. This should be OK since a single backend can't
373  * run multiple index creations in parallel (see also: notes on
374  * parallelism and maintenance_work_mem below).
375  */
376  btspool->heap = heap;
377  btspool->index = index;
378  btspool->isunique = indexInfo->ii_Unique;
379  btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
380 
381  /* Save as primary spool */
382  buildstate->spool = btspool;
383 
384  /* Report table scan phase started */
387 
388  /* Attempt to launch parallel worker scan when required */
389  if (indexInfo->ii_ParallelWorkers > 0)
390  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
391  indexInfo->ii_ParallelWorkers);
392 
393  /*
394  * If parallel build requested and at least one worker process was
395  * successfully launched, set up coordination state
396  */
397  if (buildstate->btleader)
398  {
399  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
400  coordinate->isWorker = false;
401  coordinate->nParticipants =
402  buildstate->btleader->nparticipanttuplesorts;
403  coordinate->sharedsort = buildstate->btleader->sharedsort;
404  }
405 
406  /*
407  * Begin serial/leader tuplesort.
408  *
409  * In cases where parallelism is involved, the leader receives the same
410  * share of maintenance_work_mem as a serial sort (it is generally treated
411  * in the same way as a serial sort once we return). Parallel worker
412  * Tuplesortstates will have received only a fraction of
413  * maintenance_work_mem, though.
414  *
415  * We rely on the lifetime of the Leader Tuplesortstate almost not
416  * overlapping with any worker Tuplesortstate's lifetime. There may be
417  * some small overlap, but that's okay because we rely on leader
418  * Tuplesortstate only allocating a small, fixed amount of memory here.
419  * When its tuplesort_performsort() is called (by our caller), and
420  * significant amounts of memory are likely to be used, all workers must
421  * have already freed almost all memory held by their Tuplesortstates
422  * (they are about to go away completely, too). The overall effect is
423  * that maintenance_work_mem always represents an absolute high watermark
424  * on the amount of memory used by a CREATE INDEX operation, regardless of
425  * the use of parallelism or any other factor.
426  */
427  buildstate->spool->sortstate =
428  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
429  buildstate->nulls_not_distinct,
430  maintenance_work_mem, coordinate,
432 
433  /*
434  * If building a unique index, put dead tuples in a second spool to keep
435  * them out of the uniqueness check. We expect that the second spool (for
436  * dead tuples) won't get very full, so we give it only work_mem.
437  */
438  if (indexInfo->ii_Unique)
439  {
440  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
441  SortCoordinate coordinate2 = NULL;
442 
443  /* Initialize secondary spool */
444  btspool2->heap = heap;
445  btspool2->index = index;
446  btspool2->isunique = false;
447  /* Save as secondary spool */
448  buildstate->spool2 = btspool2;
449 
450  if (buildstate->btleader)
451  {
452  /*
453  * Set up non-private state that is passed to
454  * tuplesort_begin_index_btree() about the basic high level
455  * coordination of a parallel sort.
456  */
457  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
458  coordinate2->isWorker = false;
459  coordinate2->nParticipants =
460  buildstate->btleader->nparticipanttuplesorts;
461  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
462  }
463 
464  /*
465  * We expect that the second one (for dead tuples) won't get very
466  * full, so we give it only work_mem
467  */
468  buildstate->spool2->sortstate =
469  tuplesort_begin_index_btree(heap, index, false, false, work_mem,
470  coordinate2, TUPLESORT_NONE);
471  }
472 
473  /* Fill spool using either serial or parallel heap scan */
474  if (!buildstate->btleader)
475  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
476  _bt_build_callback, (void *) buildstate,
477  NULL);
478  else
479  reltuples = _bt_parallel_heapscan(buildstate,
480  &indexInfo->ii_BrokenHotChain);
481 
482  /*
483  * Set the progress target for the next phase. Reset the block number
484  * values set by table_index_build_scan
485  */
486  {
487  const int progress_index[] = {
491  };
492  const int64 progress_vals[] = {
493  buildstate->indtuples,
494  0, 0
495  };
496 
497  pgstat_progress_update_multi_param(3, progress_index, progress_vals);
498  }
499 
500  /* okay, all heap tuples are spooled */
501  if (buildstate->spool2 && !buildstate->havedead)
502  {
503  /* spool2 turns out to be unnecessary */
504  _bt_spooldestroy(buildstate->spool2);
505  buildstate->spool2 = NULL;
506  }
507 
508  return reltuples;
509 }
510 
511 /*
512  * clean up a spool structure and its substructures.
513  */
514 static void
516 {
517  tuplesort_end(btspool->sortstate);
518  pfree(btspool);
519 }
520 
521 /*
522  * spool an index entry into the sort file.
523  */
524 static void
525 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
526 {
527  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
528  self, values, isnull);
529 }
530 
531 /*
532  * given a spool loaded by successive calls to _bt_spool,
533  * create an entire btree.
534  */
535 static void
536 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
537 {
538  BTWriteState wstate;
539 
540 #ifdef BTREE_BUILD_STATS
542  {
543  ShowUsage("BTREE BUILD (Spool) STATISTICS");
544  ResetUsage();
545  }
546 #endif /* BTREE_BUILD_STATS */
547 
548  /* Execute the sort */
552  if (btspool2)
553  {
556  tuplesort_performsort(btspool2->sortstate);
557  }
558 
559  wstate.heap = btspool->heap;
560  wstate.index = btspool->index;
561  wstate.inskey = _bt_mkscankey(wstate.index, NULL);
562  /* _bt_mkscankey() won't set allequalimage without metapage */
563  wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
564 
565  /* reserve the metapage */
566  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
567 
570  _bt_load(&wstate, btspool, btspool2);
571 }
572 
573 /*
574  * Per-tuple callback for table_index_build_scan
575  */
576 static void
578  ItemPointer tid,
579  Datum *values,
580  bool *isnull,
581  bool tupleIsAlive,
582  void *state)
583 {
584  BTBuildState *buildstate = (BTBuildState *) state;
585 
586  /*
587  * insert the index tuple into the appropriate spool file for subsequent
588  * processing
589  */
590  if (tupleIsAlive || buildstate->spool2 == NULL)
591  _bt_spool(buildstate->spool, tid, values, isnull);
592  else
593  {
594  /* dead tuples are put into spool2 */
595  buildstate->havedead = true;
596  _bt_spool(buildstate->spool2, tid, values, isnull);
597  }
598 
599  buildstate->indtuples += 1;
600 }
601 
602 /*
603  * allocate workspace for a new, clean btree page, not linked to any siblings.
604  */
605 static BulkWriteBuffer
607 {
609  Page page;
610  BTPageOpaque opaque;
611 
612  buf = smgr_bulk_get_buf(wstate->bulkstate);
613  page = (Page) buf;
614 
615  /* Zero the page and set up standard page header info */
616  _bt_pageinit(page, BLCKSZ);
617 
618  /* Initialize BT opaque state */
619  opaque = BTPageGetOpaque(page);
620  opaque->btpo_prev = opaque->btpo_next = P_NONE;
621  opaque->btpo_level = level;
622  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
623  opaque->btpo_cycleid = 0;
624 
625  /* Make the P_HIKEY line pointer appear allocated */
626  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
627 
628  return buf;
629 }
630 
631 /*
632  * emit a completed btree page, and release the working storage.
633  */
634 static void
636 {
637  smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
638  /* smgr_bulk_write took ownership of 'buf' */
639 }
640 
641 /*
642  * allocate and initialize a new BTPageState. the returned structure
643  * is suitable for immediate use by _bt_buildadd.
644  */
645 static BTPageState *
647 {
649 
650  /* create initial page for level */
651  state->btps_buf = _bt_blnewpage(wstate, level);
652 
653  /* and assign it a page position */
654  state->btps_blkno = wstate->btws_pages_alloced++;
655 
656  state->btps_lowkey = NULL;
657  /* initialize lastoff so first item goes into P_FIRSTKEY */
658  state->btps_lastoff = P_HIKEY;
659  state->btps_lastextra = 0;
660  state->btps_level = level;
661  /* set "full" threshold based on level. See notes at head of file. */
662  if (level > 0)
663  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
664  else
665  state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
666 
667  /* no parent level, yet */
668  state->btps_next = NULL;
669 
670  return state;
671 }
672 
673 /*
674  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
675  * P_HIKEY, overwriting P_HIKEY).
676  *
677  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
678  * rightmost page on its level is not supposed to get a high key. Now that
679  * it's clear that this page is a rightmost page, remove the unneeded empty
680  * P_HIKEY line pointer space.
681  */
682 static void
683 _bt_slideleft(Page rightmostpage)
684 {
685  OffsetNumber off;
686  OffsetNumber maxoff;
687  ItemId previi;
688 
689  maxoff = PageGetMaxOffsetNumber(rightmostpage);
690  Assert(maxoff >= P_FIRSTKEY);
691  previi = PageGetItemId(rightmostpage, P_HIKEY);
692  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
693  {
694  ItemId thisii = PageGetItemId(rightmostpage, off);
695 
696  *previi = *thisii;
697  previi = thisii;
698  }
699  ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
700 }
701 
702 /*
703  * Add an item to a page being built.
704  *
705  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
706  * raises an error directly.
707  *
708  * Note that our nbtsort.c caller does not know yet if the page will be
709  * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
710  * caller. Page that turns out to be the rightmost on its level is fixed by
711  * calling _bt_slideleft().
712  */
713 static void
715  Size itemsize,
716  IndexTuple itup,
717  OffsetNumber itup_off,
718  bool newfirstdataitem)
719 {
720  IndexTupleData trunctuple;
721 
722  if (newfirstdataitem)
723  {
724  trunctuple = *itup;
725  trunctuple.t_info = sizeof(IndexTupleData);
726  BTreeTupleSetNAtts(&trunctuple, 0, false);
727  itup = &trunctuple;
728  itemsize = sizeof(IndexTupleData);
729  }
730 
731  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
732  false, false) == InvalidOffsetNumber)
733  elog(ERROR, "failed to add item to the index page");
734 }
735 
736 /*----------
737  * Add an item to a disk page from the sort output (or add a posting list
738  * item formed from the sort output).
739  *
740  * We must be careful to observe the page layout conventions of nbtsearch.c:
741  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
742  * - on non-leaf pages, the key portion of the first item need not be
743  * stored, we should store only the link.
744  *
745  * A leaf page being built looks like:
746  *
747  * +----------------+---------------------------------+
748  * | PageHeaderData | linp0 linp1 linp2 ... |
749  * +-----------+----+---------------------------------+
750  * | ... linpN | |
751  * +-----------+--------------------------------------+
752  * | ^ last |
753  * | |
754  * +-------------+------------------------------------+
755  * | | itemN ... |
756  * +-------------+------------------+-----------------+
757  * | ... item3 item2 item1 | "special space" |
758  * +--------------------------------+-----------------+
759  *
760  * Contrast this with the diagram in bufpage.h; note the mismatch
761  * between linps and items. This is because we reserve linp0 as a
762  * placeholder for the pointer to the "high key" item; when we have
763  * filled up the page, we will set linp0 to point to itemN and clear
764  * linpN. On the other hand, if we find this is the last (rightmost)
765  * page, we leave the items alone and slide the linp array over. If
766  * the high key is to be truncated, offset 1 is deleted, and we insert
767  * the truncated high key at offset 1.
768  *
769  * 'last' pointer indicates the last offset added to the page.
770  *
771  * 'truncextra' is the size of the posting list in itup, if any. This
772  * information is stashed for the next call here, when we may benefit
773  * from considering the impact of truncating away the posting list on
774  * the page before deciding to finish the page off. Posting lists are
775  * often relatively large, so it is worth going to the trouble of
776  * accounting for the saving from truncating away the posting list of
777  * the tuple that becomes the high key (that may be the only way to
778  * get close to target free space on the page). Note that this is
779  * only used for the soft fillfactor-wise limit, not the critical hard
780  * limit.
781  *----------
782  */
783 static void
785  Size truncextra)
786 {
787  BulkWriteBuffer nbuf;
788  Page npage;
789  BlockNumber nblkno;
790  OffsetNumber last_off;
791  Size last_truncextra;
792  Size pgspc;
793  Size itupsz;
794  bool isleaf;
795 
796  /*
797  * This is a handy place to check for cancel interrupts during the btree
798  * load phase of index creation.
799  */
801 
802  nbuf = state->btps_buf;
803  npage = (Page) nbuf;
804  nblkno = state->btps_blkno;
805  last_off = state->btps_lastoff;
806  last_truncextra = state->btps_lastextra;
807  state->btps_lastextra = truncextra;
808 
809  pgspc = PageGetFreeSpace(npage);
810  itupsz = IndexTupleSize(itup);
811  itupsz = MAXALIGN(itupsz);
812  /* Leaf case has slightly different rules due to suffix truncation */
813  isleaf = (state->btps_level == 0);
814 
815  /*
816  * Check whether the new item can fit on a btree page on current level at
817  * all.
818  *
819  * Every newly built index will treat heap TID as part of the keyspace,
820  * which imposes the requirement that new high keys must occasionally have
821  * a heap TID appended within _bt_truncate(). That may leave a new pivot
822  * tuple one or two MAXALIGN() quantums larger than the original
823  * firstright tuple it's derived from. v4 deals with the problem by
824  * decreasing the limit on the size of tuples inserted on the leaf level
825  * by the same small amount. Enforce the new v4+ limit on the leaf level,
826  * and the old limit on internal levels, since pivot tuples may need to
827  * make use of the reserved space. This should never fail on internal
828  * pages.
829  */
830  if (unlikely(itupsz > BTMaxItemSize(npage)))
831  _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
832  itup);
833 
834  /*
835  * Check to see if current page will fit new item, with space left over to
836  * append a heap TID during suffix truncation when page is a leaf page.
837  *
838  * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
839  * high key with heap TID when finishing off a leaf page, since we rely on
840  * _bt_check_third_page() rejecting oversized non-pivot tuples. On
841  * internal pages we can always fit 3 pivot tuples with larger internal
842  * page tuple limit (includes page high key).
843  *
844  * Most of the time, a page is only "full" in the sense that the soft
845  * fillfactor-wise limit has been exceeded. However, we must always leave
846  * at least two items plus a high key on each page before starting a new
847  * page. Disregard fillfactor and insert on "full" current page if we
848  * don't have the minimum number of items yet. (Note that we deliberately
849  * assume that suffix truncation neither enlarges nor shrinks new high key
850  * when applying soft limit, except when last tuple has a posting list.)
851  */
852  Assert(last_truncextra == 0 || isleaf);
853  if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
854  (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
855  {
856  /*
857  * Finish off the page and write it out.
858  */
859  BulkWriteBuffer obuf = nbuf;
860  Page opage = npage;
861  BlockNumber oblkno = nblkno;
862  ItemId ii;
863  ItemId hii;
864  IndexTuple oitup;
865 
866  /* Create new page of same level */
867  nbuf = _bt_blnewpage(wstate, state->btps_level);
868  npage = (Page) nbuf;
869 
870  /* and assign it a page position */
871  nblkno = wstate->btws_pages_alloced++;
872 
873  /*
874  * We copy the last item on the page into the new page, and then
875  * rearrange the old page so that the 'last item' becomes its high key
876  * rather than a true data item. There had better be at least two
877  * items on the page already, else the page would be empty of useful
878  * data.
879  */
880  Assert(last_off > P_FIRSTKEY);
881  ii = PageGetItemId(opage, last_off);
882  oitup = (IndexTuple) PageGetItem(opage, ii);
883  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
884  !isleaf);
885 
886  /*
887  * Move 'last' into the high key position on opage. _bt_blnewpage()
888  * allocated empty space for a line pointer when opage was first
889  * created, so this is a matter of rearranging already-allocated space
890  * on page, and initializing high key line pointer. (Actually, leaf
891  * pages must also swap oitup with a truncated version of oitup, which
892  * is sometimes larger than oitup, though never by more than the space
893  * needed to append a heap TID.)
894  */
895  hii = PageGetItemId(opage, P_HIKEY);
896  *hii = *ii;
897  ItemIdSetUnused(ii); /* redundant */
898  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
899 
900  if (isleaf)
901  {
902  IndexTuple lastleft;
903  IndexTuple truncated;
904 
905  /*
906  * Truncate away any unneeded attributes from high key on leaf
907  * level. This is only done at the leaf level because downlinks
908  * in internal pages are either negative infinity items, or get
909  * their contents from copying from one level down. See also:
910  * _bt_split().
911  *
912  * We don't try to bias our choice of split point to make it more
913  * likely that _bt_truncate() can truncate away more attributes,
914  * whereas the split point used within _bt_split() is chosen much
915  * more delicately. Even still, the lastleft and firstright
916  * tuples passed to _bt_truncate() here are at least not fully
917  * equal to each other when deduplication is used, unless there is
918  * a large group of duplicates (also, unique index builds usually
919  * have few or no spool2 duplicates). When the split point is
920  * between two unequal tuples, _bt_truncate() will avoid including
921  * a heap TID in the new high key, which is the most important
922  * benefit of suffix truncation.
923  *
924  * Overwrite the old item with new truncated high key directly.
925  * oitup is already located at the physical beginning of tuple
926  * space, so this should directly reuse the existing tuple space.
927  */
928  ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
929  lastleft = (IndexTuple) PageGetItem(opage, ii);
930 
931  Assert(IndexTupleSize(oitup) > last_truncextra);
932  truncated = _bt_truncate(wstate->index, lastleft, oitup,
933  wstate->inskey);
934  if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
935  IndexTupleSize(truncated)))
936  elog(ERROR, "failed to add high key to the index page");
937  pfree(truncated);
938 
939  /* oitup should continue to point to the page's high key */
940  hii = PageGetItemId(opage, P_HIKEY);
941  oitup = (IndexTuple) PageGetItem(opage, hii);
942  }
943 
944  /*
945  * Link the old page into its parent, using its low key. If we don't
946  * have a parent, we have to create one; this adds a new btree level.
947  */
948  if (state->btps_next == NULL)
949  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
950 
951  Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
953  BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
954  P_LEFTMOST(BTPageGetOpaque(opage)));
955  Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
956  !P_LEFTMOST(BTPageGetOpaque(opage)));
957  BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
958  _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
959  pfree(state->btps_lowkey);
960 
961  /*
962  * Save a copy of the high key from the old page. It is also the low
963  * key for the new page.
964  */
965  state->btps_lowkey = CopyIndexTuple(oitup);
966 
967  /*
968  * Set the sibling links for both pages.
969  */
970  {
971  BTPageOpaque oopaque = BTPageGetOpaque(opage);
972  BTPageOpaque nopaque = BTPageGetOpaque(npage);
973 
974  oopaque->btpo_next = nblkno;
975  nopaque->btpo_prev = oblkno;
976  nopaque->btpo_next = P_NONE; /* redundant */
977  }
978 
979  /*
980  * Write out the old page. _bt_blwritepage takes ownership of the
981  * 'opage' buffer.
982  */
983  _bt_blwritepage(wstate, obuf, oblkno);
984 
985  /*
986  * Reset last_off to point to new page
987  */
988  last_off = P_FIRSTKEY;
989  }
990 
991  /*
992  * By here, either original page is still the current page, or a new page
993  * was created that became the current page. Either way, the current page
994  * definitely has space for new item.
995  *
996  * If the new item is the first for its page, it must also be the first
997  * item on its entire level. On later same-level pages, a low key for a
998  * page will be copied from the prior page in the code above. Generate a
999  * minus infinity low key here instead.
1000  */
1001  if (last_off == P_HIKEY)
1002  {
1003  Assert(state->btps_lowkey == NULL);
1004  state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1005  state->btps_lowkey->t_info = sizeof(IndexTupleData);
1006  BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1007  }
1008 
1009  /*
1010  * Add the new item into the current page.
1011  */
1012  last_off = OffsetNumberNext(last_off);
1013  _bt_sortaddtup(npage, itupsz, itup, last_off,
1014  !isleaf && last_off == P_FIRSTKEY);
1015 
1016  state->btps_buf = nbuf;
1017  state->btps_blkno = nblkno;
1018  state->btps_lastoff = last_off;
1019 }
1020 
1021 /*
1022  * Finalize pending posting list tuple, and add it to the index. Final tuple
1023  * is based on saved base tuple, and saved list of heap TIDs.
1024  *
1025  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1026  * using _bt_buildadd().
1027  */
1028 static void
1030  BTDedupState dstate)
1031 {
1032  Assert(dstate->nitems > 0);
1033 
1034  if (dstate->nitems == 1)
1035  _bt_buildadd(wstate, state, dstate->base, 0);
1036  else
1037  {
1038  IndexTuple postingtuple;
1039  Size truncextra;
1040 
1041  /* form a tuple with a posting list */
1042  postingtuple = _bt_form_posting(dstate->base,
1043  dstate->htids,
1044  dstate->nhtids);
1045  /* Calculate posting list overhead */
1046  truncextra = IndexTupleSize(postingtuple) -
1047  BTreeTupleGetPostingOffset(postingtuple);
1048 
1049  _bt_buildadd(wstate, state, postingtuple, truncextra);
1050  pfree(postingtuple);
1051  }
1052 
1053  dstate->nmaxitems = 0;
1054  dstate->nhtids = 0;
1055  dstate->nitems = 0;
1056  dstate->phystupsize = 0;
1057 }
1058 
1059 /*
1060  * Finish writing out the completed btree.
1061  */
1062 static void
1064 {
1065  BTPageState *s;
1066  BlockNumber rootblkno = P_NONE;
1067  uint32 rootlevel = 0;
1068  BulkWriteBuffer metabuf;
1069 
1070  /*
1071  * Each iteration of this loop completes one more level of the tree.
1072  */
1073  for (s = state; s != NULL; s = s->btps_next)
1074  {
1075  BlockNumber blkno;
1076  BTPageOpaque opaque;
1077 
1078  blkno = s->btps_blkno;
1079  opaque = BTPageGetOpaque((Page) s->btps_buf);
1080 
1081  /*
1082  * We have to link the last page on this level to somewhere.
1083  *
1084  * If we're at the top, it's the root, so attach it to the metapage.
1085  * Otherwise, add an entry for it to its parent using its low key.
1086  * This may cause the last page of the parent level to split, but
1087  * that's not a problem -- we haven't gotten to it yet.
1088  */
1089  if (s->btps_next == NULL)
1090  {
1091  opaque->btpo_flags |= BTP_ROOT;
1092  rootblkno = blkno;
1093  rootlevel = s->btps_level;
1094  }
1095  else
1096  {
1097  Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1099  BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1100  P_LEFTMOST(opaque));
1101  Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1102  !P_LEFTMOST(opaque));
1104  _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1105  pfree(s->btps_lowkey);
1106  s->btps_lowkey = NULL;
1107  }
1108 
1109  /*
1110  * This is the rightmost page, so the ItemId array needs to be slid
1111  * back one slot. Then we can dump out the page.
1112  */
1113  _bt_slideleft((Page) s->btps_buf);
1114  _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1115  s->btps_buf = NULL; /* writepage took ownership of the buffer */
1116  }
1117 
1118  /*
1119  * As the last step in the process, construct the metapage and make it
1120  * point to the new root (unless we had no data at all, in which case it's
1121  * set to point to "P_NONE"). This changes the index to the "valid" state
1122  * by filling in a valid magic number in the metapage.
1123  */
1124  metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1125  _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1126  wstate->inskey->allequalimage);
1127  _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1128 }
1129 
1130 /*
1131  * Read tuples in correct sort order from tuplesort, and load them into
1132  * btree leaves.
1133  */
1134 static void
1135 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1136 {
1137  BTPageState *state = NULL;
1138  bool merge = (btspool2 != NULL);
1139  IndexTuple itup,
1140  itup2 = NULL;
1141  bool load1;
1142  TupleDesc tupdes = RelationGetDescr(wstate->index);
1143  int i,
1145  SortSupport sortKeys;
1146  int64 tuples_done = 0;
1147  bool deduplicate;
1148 
1149  wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1150 
1151  deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1152  BTGetDeduplicateItems(wstate->index);
1153 
1154  if (merge)
1155  {
1156  /*
1157  * Another BTSpool for dead tuples exists. Now we have to merge
1158  * btspool and btspool2.
1159  */
1160 
1161  /* the preparation of merge */
1162  itup = tuplesort_getindextuple(btspool->sortstate, true);
1163  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1164 
1165  /* Prepare SortSupport data for each column */
1166  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1167 
1168  for (i = 0; i < keysz; i++)
1169  {
1170  SortSupport sortKey = sortKeys + i;
1171  ScanKey scanKey = wstate->inskey->scankeys + i;
1172  int16 strategy;
1173 
1174  sortKey->ssup_cxt = CurrentMemoryContext;
1175  sortKey->ssup_collation = scanKey->sk_collation;
1176  sortKey->ssup_nulls_first =
1177  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1178  sortKey->ssup_attno = scanKey->sk_attno;
1179  /* Abbreviation is not supported here */
1180  sortKey->abbreviate = false;
1181 
1182  Assert(sortKey->ssup_attno != 0);
1183 
1184  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1186 
1187  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1188  }
1189 
1190  for (;;)
1191  {
1192  load1 = true; /* load BTSpool next ? */
1193  if (itup2 == NULL)
1194  {
1195  if (itup == NULL)
1196  break;
1197  }
1198  else if (itup != NULL)
1199  {
1200  int32 compare = 0;
1201 
1202  for (i = 1; i <= keysz; i++)
1203  {
1204  SortSupport entry;
1205  Datum attrDatum1,
1206  attrDatum2;
1207  bool isNull1,
1208  isNull2;
1209 
1210  entry = sortKeys + i - 1;
1211  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1212  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1213 
1214  compare = ApplySortComparator(attrDatum1, isNull1,
1215  attrDatum2, isNull2,
1216  entry);
1217  if (compare > 0)
1218  {
1219  load1 = false;
1220  break;
1221  }
1222  else if (compare < 0)
1223  break;
1224  }
1225 
1226  /*
1227  * If key values are equal, we sort on ItemPointer. This is
1228  * required for btree indexes, since heap TID is treated as an
1229  * implicit last key attribute in order to ensure that all
1230  * keys in the index are physically unique.
1231  */
1232  if (compare == 0)
1233  {
1234  compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1235  Assert(compare != 0);
1236  if (compare > 0)
1237  load1 = false;
1238  }
1239  }
1240  else
1241  load1 = false;
1242 
1243  /* When we see first tuple, create first index page */
1244  if (state == NULL)
1245  state = _bt_pagestate(wstate, 0);
1246 
1247  if (load1)
1248  {
1249  _bt_buildadd(wstate, state, itup, 0);
1250  itup = tuplesort_getindextuple(btspool->sortstate, true);
1251  }
1252  else
1253  {
1254  _bt_buildadd(wstate, state, itup2, 0);
1255  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1256  }
1257 
1258  /* Report progress */
1260  ++tuples_done);
1261  }
1262  pfree(sortKeys);
1263  }
1264  else if (deduplicate)
1265  {
1266  /* merge is unnecessary, deduplicate into posting lists */
1267  BTDedupState dstate;
1268 
1269  dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1270  dstate->deduplicate = true; /* unused */
1271  dstate->nmaxitems = 0; /* unused */
1272  dstate->maxpostingsize = 0; /* set later */
1273  /* Metadata about base tuple of current pending posting list */
1274  dstate->base = NULL;
1275  dstate->baseoff = InvalidOffsetNumber; /* unused */
1276  dstate->basetupsize = 0;
1277  /* Metadata about current pending posting list TIDs */
1278  dstate->htids = NULL;
1279  dstate->nhtids = 0;
1280  dstate->nitems = 0;
1281  dstate->phystupsize = 0; /* unused */
1282  dstate->nintervals = 0; /* unused */
1283 
1284  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1285  true)) != NULL)
1286  {
1287  /* When we see first tuple, create first index page */
1288  if (state == NULL)
1289  {
1290  state = _bt_pagestate(wstate, 0);
1291 
1292  /*
1293  * Limit size of posting list tuples to 1/10 space we want to
1294  * leave behind on the page, plus space for final item's line
1295  * pointer. This is equal to the space that we'd like to
1296  * leave behind on each leaf page when fillfactor is 90,
1297  * allowing us to get close to fillfactor% space utilization
1298  * when there happen to be a great many duplicates. (This
1299  * makes higher leaf fillfactor settings ineffective when
1300  * building indexes that have many duplicates, but packing
1301  * leaf pages full with few very large tuples doesn't seem
1302  * like a useful goal.)
1303  */
1304  dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1305  sizeof(ItemIdData);
1306  Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
1307  dstate->maxpostingsize <= INDEX_SIZE_MASK);
1308  dstate->htids = palloc(dstate->maxpostingsize);
1309 
1310  /* start new pending posting list with itup copy */
1313  }
1314  else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1315  itup) > keysz &&
1316  _bt_dedup_save_htid(dstate, itup))
1317  {
1318  /*
1319  * Tuple is equal to base tuple of pending posting list. Heap
1320  * TID from itup has been saved in state.
1321  */
1322  }
1323  else
1324  {
1325  /*
1326  * Tuple is not equal to pending posting list tuple, or
1327  * _bt_dedup_save_htid() opted to not merge current item into
1328  * pending posting list.
1329  */
1330  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1331  pfree(dstate->base);
1332 
1333  /* start new pending posting list with itup copy */
1336  }
1337 
1338  /* Report progress */
1340  ++tuples_done);
1341  }
1342 
1343  if (state)
1344  {
1345  /*
1346  * Handle the last item (there must be a last item when the
1347  * tuplesort returned one or more tuples)
1348  */
1349  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1350  pfree(dstate->base);
1351  pfree(dstate->htids);
1352  }
1353 
1354  pfree(dstate);
1355  }
1356  else
1357  {
1358  /* merging and deduplication are both unnecessary */
1359  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1360  true)) != NULL)
1361  {
1362  /* When we see first tuple, create first index page */
1363  if (state == NULL)
1364  state = _bt_pagestate(wstate, 0);
1365 
1366  _bt_buildadd(wstate, state, itup, 0);
1367 
1368  /* Report progress */
1370  ++tuples_done);
1371  }
1372  }
1373 
1374  /* Close down final pages and write the metapage */
1375  _bt_uppershutdown(wstate, state);
1376  smgr_bulk_finish(wstate->bulkstate);
1377 }
1378 
1379 /*
1380  * Create parallel context, and launch workers for leader.
1381  *
1382  * buildstate argument should be initialized (with the exception of the
1383  * tuplesort state in spools, which may later be created based on shared
1384  * state initially set up here).
1385  *
1386  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1387  *
1388  * request is the target number of parallel worker processes to launch.
1389  *
1390  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1391  * mode by passing it to _bt_end_parallel() at the very end of its index
1392  * build. If not even a single worker process can be launched, this is
1393  * never set, and caller should proceed with a serial index build.
1394  */
1395 static void
1396 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1397 {
1398  ParallelContext *pcxt;
1399  int scantuplesortstates;
1400  Snapshot snapshot;
1401  Size estbtshared;
1402  Size estsort;
1403  BTShared *btshared;
1404  Sharedsort *sharedsort;
1405  Sharedsort *sharedsort2;
1406  BTSpool *btspool = buildstate->spool;
1407  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1408  WalUsage *walusage;
1409  BufferUsage *bufferusage;
1410  bool leaderparticipates = true;
1411  int querylen;
1412 
1413 #ifdef DISABLE_LEADER_PARTICIPATION
1414  leaderparticipates = false;
1415 #endif
1416 
1417  /*
1418  * Enter parallel mode, and create context for parallel build of btree
1419  * index
1420  */
1422  Assert(request > 0);
1423  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1424  request);
1425 
1426  scantuplesortstates = leaderparticipates ? request + 1 : request;
1427 
1428  /*
1429  * Prepare for scan of the base relation. In a normal index build, we use
1430  * SnapshotAny because we must retrieve all tuples and do our own time
1431  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1432  * concurrent build, we take a regular MVCC snapshot and index whatever's
1433  * live according to that.
1434  */
1435  if (!isconcurrent)
1436  snapshot = SnapshotAny;
1437  else
1439 
1440  /*
1441  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1442  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1443  */
1444  estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1445  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1446  estsort = tuplesort_estimate_shared(scantuplesortstates);
1447  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1448 
1449  /*
1450  * Unique case requires a second spool, and so we may have to account for
1451  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1452  */
1453  if (!btspool->isunique)
1454  shm_toc_estimate_keys(&pcxt->estimator, 2);
1455  else
1456  {
1457  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1458  shm_toc_estimate_keys(&pcxt->estimator, 3);
1459  }
1460 
1461  /*
1462  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1463  * and PARALLEL_KEY_BUFFER_USAGE.
1464  *
1465  * If there are no extensions loaded that care, we could skip this. We
1466  * have no way of knowing whether anyone's looking at pgWalUsage or
1467  * pgBufferUsage, so do it unconditionally.
1468  */
1470  mul_size(sizeof(WalUsage), pcxt->nworkers));
1471  shm_toc_estimate_keys(&pcxt->estimator, 1);
1473  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1474  shm_toc_estimate_keys(&pcxt->estimator, 1);
1475 
1476  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1477  if (debug_query_string)
1478  {
1479  querylen = strlen(debug_query_string);
1480  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1481  shm_toc_estimate_keys(&pcxt->estimator, 1);
1482  }
1483  else
1484  querylen = 0; /* keep compiler quiet */
1485 
1486  /* Everyone's had a chance to ask for space, so now create the DSM */
1487  InitializeParallelDSM(pcxt);
1488 
1489  /* If no DSM segment was available, back out (do serial build) */
1490  if (pcxt->seg == NULL)
1491  {
1492  if (IsMVCCSnapshot(snapshot))
1493  UnregisterSnapshot(snapshot);
1494  DestroyParallelContext(pcxt);
1495  ExitParallelMode();
1496  return;
1497  }
1498 
1499  /* Store shared build state, for which we reserved space */
1500  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1501  /* Initialize immutable state */
1502  btshared->heaprelid = RelationGetRelid(btspool->heap);
1503  btshared->indexrelid = RelationGetRelid(btspool->index);
1504  btshared->isunique = btspool->isunique;
1505  btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1506  btshared->isconcurrent = isconcurrent;
1507  btshared->scantuplesortstates = scantuplesortstates;
1509  SpinLockInit(&btshared->mutex);
1510  /* Initialize mutable state */
1511  btshared->nparticipantsdone = 0;
1512  btshared->reltuples = 0.0;
1513  btshared->havedead = false;
1514  btshared->indtuples = 0.0;
1515  btshared->brokenhotchain = false;
1518  snapshot);
1519 
1520  /*
1521  * Store shared tuplesort-private state, for which we reserved space.
1522  * Then, initialize opaque state using tuplesort routine.
1523  */
1524  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1525  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1526  pcxt->seg);
1527 
1528  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1529  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1530 
1531  /* Unique case requires a second spool, and associated shared state */
1532  if (!btspool->isunique)
1533  sharedsort2 = NULL;
1534  else
1535  {
1536  /*
1537  * Store additional shared tuplesort-private state, for which we
1538  * reserved space. Then, initialize opaque state using tuplesort
1539  * routine.
1540  */
1541  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1542  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1543  pcxt->seg);
1544 
1545  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1546  }
1547 
1548  /* Store query string for workers */
1549  if (debug_query_string)
1550  {
1551  char *sharedquery;
1552 
1553  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1554  memcpy(sharedquery, debug_query_string, querylen + 1);
1555  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1556  }
1557 
1558  /*
1559  * Allocate space for each worker's WalUsage and BufferUsage; no need to
1560  * initialize.
1561  */
1562  walusage = shm_toc_allocate(pcxt->toc,
1563  mul_size(sizeof(WalUsage), pcxt->nworkers));
1564  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1565  bufferusage = shm_toc_allocate(pcxt->toc,
1566  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1567  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1568 
1569  /* Launch workers, saving status for leader/caller */
1570  LaunchParallelWorkers(pcxt);
1571  btleader->pcxt = pcxt;
1572  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1573  if (leaderparticipates)
1574  btleader->nparticipanttuplesorts++;
1575  btleader->btshared = btshared;
1576  btleader->sharedsort = sharedsort;
1577  btleader->sharedsort2 = sharedsort2;
1578  btleader->snapshot = snapshot;
1579  btleader->walusage = walusage;
1580  btleader->bufferusage = bufferusage;
1581 
1582  /* If no workers were successfully launched, back out (do serial build) */
1583  if (pcxt->nworkers_launched == 0)
1584  {
1585  _bt_end_parallel(btleader);
1586  return;
1587  }
1588 
1589  /* Save leader state now that it's clear build will be parallel */
1590  buildstate->btleader = btleader;
1591 
1592  /* Join heap scan ourselves */
1593  if (leaderparticipates)
1595 
1596  /*
1597  * Caller needs to wait for all launched workers when we return. Make
1598  * sure that the failure-to-start case will not hang forever.
1599  */
1601 }
1602 
1603 /*
1604  * Shut down workers, destroy parallel context, and end parallel mode.
1605  */
1606 static void
1608 {
1609  int i;
1610 
1611  /* Shutdown worker processes */
1613 
1614  /*
1615  * Next, accumulate WAL usage. (This must wait for the workers to finish,
1616  * or we might get incomplete data.)
1617  */
1618  for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1619  InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1620 
1621  /* Free last reference to MVCC snapshot, if one was used */
1622  if (IsMVCCSnapshot(btleader->snapshot))
1623  UnregisterSnapshot(btleader->snapshot);
1624  DestroyParallelContext(btleader->pcxt);
1625  ExitParallelMode();
1626 }
1627 
1628 /*
1629  * Returns size of shared memory required to store state for a parallel
1630  * btree index build based on the snapshot its parallel scan will use.
1631  */
1632 static Size
1634 {
1635  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1636  return add_size(BUFFERALIGN(sizeof(BTShared)),
1637  table_parallelscan_estimate(heap, snapshot));
1638 }
1639 
1640 /*
1641  * Within leader, wait for end of heap scan.
1642  *
1643  * When called, parallel heap scan started by _bt_begin_parallel() will
1644  * already be underway within worker processes (when leader participates
1645  * as a worker, we should end up here just as workers are finishing).
1646  *
1647  * Fills in fields needed for ambuild statistics, and lets caller set
1648  * field indicating that some worker encountered a broken HOT chain.
1649  *
1650  * Returns the total number of heap tuples scanned.
1651  */
1652 static double
1653 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1654 {
1655  BTShared *btshared = buildstate->btleader->btshared;
1656  int nparticipanttuplesorts;
1657  double reltuples;
1658 
1659  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1660  for (;;)
1661  {
1662  SpinLockAcquire(&btshared->mutex);
1663  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1664  {
1665  buildstate->havedead = btshared->havedead;
1666  buildstate->indtuples = btshared->indtuples;
1667  *brokenhotchain = btshared->brokenhotchain;
1668  reltuples = btshared->reltuples;
1669  SpinLockRelease(&btshared->mutex);
1670  break;
1671  }
1672  SpinLockRelease(&btshared->mutex);
1673 
1675  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1676  }
1677 
1679 
1680  return reltuples;
1681 }
1682 
1683 /*
1684  * Within leader, participate as a parallel worker.
1685  */
1686 static void
1688 {
1689  BTLeader *btleader = buildstate->btleader;
1690  BTSpool *leaderworker;
1691  BTSpool *leaderworker2;
1692  int sortmem;
1693 
1694  /* Allocate memory and initialize private spool */
1695  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1696  leaderworker->heap = buildstate->spool->heap;
1697  leaderworker->index = buildstate->spool->index;
1698  leaderworker->isunique = buildstate->spool->isunique;
1699  leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1700 
1701  /* Initialize second spool, if required */
1702  if (!btleader->btshared->isunique)
1703  leaderworker2 = NULL;
1704  else
1705  {
1706  /* Allocate memory for worker's own private secondary spool */
1707  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1708 
1709  /* Initialize worker's own secondary spool */
1710  leaderworker2->heap = leaderworker->heap;
1711  leaderworker2->index = leaderworker->index;
1712  leaderworker2->isunique = false;
1713  }
1714 
1715  /*
1716  * Might as well use reliable figure when doling out maintenance_work_mem
1717  * (when requested number of workers were not launched, this will be
1718  * somewhat higher than it is for other workers).
1719  */
1720  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1721 
1722  /* Perform work common to all participants */
1723  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1724  btleader->sharedsort, btleader->sharedsort2,
1725  sortmem, true);
1726 
1727 #ifdef BTREE_BUILD_STATS
1729  {
1730  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1731  ResetUsage();
1732  }
1733 #endif /* BTREE_BUILD_STATS */
1734 }
1735 
1736 /*
1737  * Perform work within a launched parallel process.
1738  */
1739 void
1741 {
1742  char *sharedquery;
1743  BTSpool *btspool;
1744  BTSpool *btspool2;
1745  BTShared *btshared;
1746  Sharedsort *sharedsort;
1747  Sharedsort *sharedsort2;
1748  Relation heapRel;
1749  Relation indexRel;
1750  LOCKMODE heapLockmode;
1751  LOCKMODE indexLockmode;
1752  WalUsage *walusage;
1753  BufferUsage *bufferusage;
1754  int sortmem;
1755 
1756 #ifdef BTREE_BUILD_STATS
1758  ResetUsage();
1759 #endif /* BTREE_BUILD_STATS */
1760 
1761  /*
1762  * The only possible status flag that can be set to the parallel worker is
1763  * PROC_IN_SAFE_IC.
1764  */
1765  Assert((MyProc->statusFlags == 0) ||
1767 
1768  /* Set debug_query_string for individual workers first */
1769  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1770  debug_query_string = sharedquery;
1771 
1772  /* Report the query string from leader */
1774 
1775  /* Look up nbtree shared state */
1776  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1777 
1778  /* Open relations using lock modes known to be obtained by index.c */
1779  if (!btshared->isconcurrent)
1780  {
1781  heapLockmode = ShareLock;
1782  indexLockmode = AccessExclusiveLock;
1783  }
1784  else
1785  {
1786  heapLockmode = ShareUpdateExclusiveLock;
1787  indexLockmode = RowExclusiveLock;
1788  }
1789 
1790  /* Open relations within worker */
1791  heapRel = table_open(btshared->heaprelid, heapLockmode);
1792  indexRel = index_open(btshared->indexrelid, indexLockmode);
1793 
1794  /* Initialize worker's own spool */
1795  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1796  btspool->heap = heapRel;
1797  btspool->index = indexRel;
1798  btspool->isunique = btshared->isunique;
1799  btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1800 
1801  /* Look up shared state private to tuplesort.c */
1802  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1803  tuplesort_attach_shared(sharedsort, seg);
1804  if (!btshared->isunique)
1805  {
1806  btspool2 = NULL;
1807  sharedsort2 = NULL;
1808  }
1809  else
1810  {
1811  /* Allocate memory for worker's own private secondary spool */
1812  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1813 
1814  /* Initialize worker's own secondary spool */
1815  btspool2->heap = btspool->heap;
1816  btspool2->index = btspool->index;
1817  btspool2->isunique = false;
1818  /* Look up shared state private to tuplesort.c */
1819  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1820  tuplesort_attach_shared(sharedsort2, seg);
1821  }
1822 
1823  /* Prepare to track buffer usage during parallel execution */
1825 
1826  /* Perform sorting of spool, and possibly a spool2 */
1827  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1828  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1829  sharedsort2, sortmem, false);
1830 
1831  /* Report WAL/buffer usage during parallel execution */
1832  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1833  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1835  &walusage[ParallelWorkerNumber]);
1836 
1837 #ifdef BTREE_BUILD_STATS
1839  {
1840  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1841  ResetUsage();
1842  }
1843 #endif /* BTREE_BUILD_STATS */
1844 
1845  index_close(indexRel, indexLockmode);
1846  table_close(heapRel, heapLockmode);
1847 }
1848 
1849 /*
1850  * Perform a worker's portion of a parallel sort.
1851  *
1852  * This generates a tuplesort for passed btspool, and a second tuplesort
1853  * state if a second btspool is need (i.e. for unique index builds). All
1854  * other spool fields should already be set when this is called.
1855  *
1856  * sortmem is the amount of working memory to use within each worker,
1857  * expressed in KBs.
1858  *
1859  * When this returns, workers are done, and need only release resources.
1860  */
1861 static void
1863  BTShared *btshared, Sharedsort *sharedsort,
1864  Sharedsort *sharedsort2, int sortmem, bool progress)
1865 {
1866  SortCoordinate coordinate;
1867  BTBuildState buildstate;
1868  TableScanDesc scan;
1869  double reltuples;
1870  IndexInfo *indexInfo;
1871 
1872  /* Initialize local tuplesort coordination state */
1873  coordinate = palloc0(sizeof(SortCoordinateData));
1874  coordinate->isWorker = true;
1875  coordinate->nParticipants = -1;
1876  coordinate->sharedsort = sharedsort;
1877 
1878  /* Begin "partial" tuplesort */
1879  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1880  btspool->index,
1881  btspool->isunique,
1882  btspool->nulls_not_distinct,
1883  sortmem, coordinate,
1884  TUPLESORT_NONE);
1885 
1886  /*
1887  * Just as with serial case, there may be a second spool. If so, a
1888  * second, dedicated spool2 partial tuplesort is required.
1889  */
1890  if (btspool2)
1891  {
1892  SortCoordinate coordinate2;
1893 
1894  /*
1895  * We expect that the second one (for dead tuples) won't get very
1896  * full, so we give it only work_mem (unless sortmem is less for
1897  * worker). Worker processes are generally permitted to allocate
1898  * work_mem independently.
1899  */
1900  coordinate2 = palloc0(sizeof(SortCoordinateData));
1901  coordinate2->isWorker = true;
1902  coordinate2->nParticipants = -1;
1903  coordinate2->sharedsort = sharedsort2;
1904  btspool2->sortstate =
1905  tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1906  Min(sortmem, work_mem), coordinate2,
1907  false);
1908  }
1909 
1910  /* Fill in buildstate for _bt_build_callback() */
1911  buildstate.isunique = btshared->isunique;
1912  buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1913  buildstate.havedead = false;
1914  buildstate.heap = btspool->heap;
1915  buildstate.spool = btspool;
1916  buildstate.spool2 = btspool2;
1917  buildstate.indtuples = 0;
1918  buildstate.btleader = NULL;
1919 
1920  /* Join parallel scan */
1921  indexInfo = BuildIndexInfo(btspool->index);
1922  indexInfo->ii_Concurrent = btshared->isconcurrent;
1923  scan = table_beginscan_parallel(btspool->heap,
1924  ParallelTableScanFromBTShared(btshared));
1925  reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1927  (void *) &buildstate, scan);
1928 
1929  /* Execute this worker's part of the sort */
1930  if (progress)
1933  tuplesort_performsort(btspool->sortstate);
1934  if (btspool2)
1935  {
1936  if (progress)
1939  tuplesort_performsort(btspool2->sortstate);
1940  }
1941 
1942  /*
1943  * Done. Record ambuild statistics, and whether we encountered a broken
1944  * HOT chain.
1945  */
1946  SpinLockAcquire(&btshared->mutex);
1947  btshared->nparticipantsdone++;
1948  btshared->reltuples += reltuples;
1949  if (buildstate.havedead)
1950  btshared->havedead = true;
1951  btshared->indtuples += buildstate.indtuples;
1952  if (indexInfo->ii_BrokenHotChain)
1953  btshared->brokenhotchain = true;
1954  SpinLockRelease(&btshared->mutex);
1955 
1956  /* Notify leader */
1958 
1959  /* We can end tuplesorts immediately */
1960  tuplesort_end(btspool->sortstate);
1961  if (btspool2)
1962  tuplesort_end(btspool2->sortstate);
1963 }
int ParallelWorkerNumber
Definition: parallel.c:112
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:205
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:775
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:552
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:929
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:167
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:672
void pgstat_progress_update_param(int index, int64 val)
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
static Datum values[MAXATTR]
Definition: bootstrap.c:152
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:281
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
PageHeaderData * PageHeader
Definition: bufpage.h:170
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:468
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:271
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:295
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:129
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:86
unsigned int uint32
Definition: c.h:506
#define MAXALIGN_DOWN(LEN)
Definition: c.h:823
#define Min(x, y)
Definition: c.h:1004
signed short int16
Definition: c.h:493
#define MAXALIGN(LEN)
Definition: c.h:811
signed int int32
Definition: c.h:494
#define BUFFERALIGN(LEN)
Definition: c.h:813
#define Assert(condition)
Definition: c.h:858
#define unlikely(x)
Definition: c.h:311
size_t Size
Definition: c.h:605
bool ConditionVariableCancelSleep(void)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
static int compare(const void *arg1, const void *arg2)
Definition: geqo_pool.c:145
int maintenance_work_mem
Definition: globals.c:130
int work_mem
Definition: globals.c:128
bool log_btree_build_stats
Definition: guc_tables.c:510
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2407
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct ItemIdData ItemIdData
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
struct IndexTupleData IndexTupleData
#define INDEX_SIZE_MASK
Definition: itup.h:65
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void * palloc(Size size)
Definition: mcxt.c:1316
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1146
#define BTGetTargetPageFreeSpace(relation)
Definition: nbtree.h:1144
#define BTP_LEAF
Definition: nbtree.h:76
#define P_HIKEY
Definition: nbtree.h:367
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1159
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1160
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define BTP_ROOT
Definition: nbtree.h:77
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:562
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1157
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1158
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529
#define P_NONE
Definition: nbtree.h:212
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1128
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define SK_BT_DESC
Definition: nbtree.h:1127
BTDedupStateData * BTDedupState
Definition: nbtree.h:893
#define P_FIRSTKEY
Definition: nbtree.h:368
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:595
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:67
#define ParallelTableScanFromBTShared(shared)
Definition: nbtsort.c:159
static void _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
Definition: nbtsort.c:635
static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtsort.c:714
static void _bt_slideleft(Page rightmostpage)
Definition: nbtsort.c:683
static BTPageState * _bt_pagestate(BTWriteState *wstate, uint32 level)
Definition: nbtsort.c:646
static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:1135
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1607
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:293
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:64
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1862
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: nbtsort.c:1633
struct BTPageState BTPageState
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
Definition: nbtsort.c:1029
struct BTSpool BTSpool
static double _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
Definition: nbtsort.c:1653
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:536
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:62
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Definition: nbtsort.c:1396
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
Definition: nbtsort.c:784
struct BTBuildState BTBuildState
void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition: nbtsort.c:1740
struct BTLeader BTLeader
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: nbtsort.c:577
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:363
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:515
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Definition: nbtsort.c:1063
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:63
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:65
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:66
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level)
Definition: nbtsort.c:606
static void _bt_leader_participate_as_worker(BTBuildState *buildstate)
Definition: nbtsort.c:1687
struct BTWriteState BTWriteState
static void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
Definition: nbtsort.c:525
struct BTShared BTShared
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:5085
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:129
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:4659
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:4878
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:5143
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
Definition: pairingheap.c:79
static char * buf
Definition: pg_test_fsync.c:73
int progress
Definition: pgbench.c:261
const char * debug_query_string
Definition: postgres.c:87
void ShowUsage(const char *title)
Definition: postgres.c:5000
void ResetUsage(void)
Definition: postgres.c:4993
uintptr_t Datum
Definition: postgres.h:64
unsigned int Oid
Definition: postgres_ext.h:31
#define PROC_IN_SAFE_IC
Definition: proc.h:59
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:86
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:122
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition: progress.h:87
#define PROGRESS_CREATEIDX_SUBPHASE
Definition: progress.h:85
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:121
#define RelationGetRelid(relation)
Definition: rel.h:505
#define RelationGetDescr(relation)
Definition: rel.h:531
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
@ MAIN_FORKNUM
Definition: relpath.h:50
int slock_t
Definition: s_lock.h:735
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:216
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:836
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:794
#define SnapshotAny
Definition: snapmgr.h:33
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:62
void PrepareSortSupportFromIndexRel(Relation indexRel, int16 strategy, SortSupport ssup)
Definition: sortsupport.c:161
struct SortSupportData * SortSupport
Definition: sortsupport.h:58
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:200
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
PGPROC * MyProc
Definition: proc.c:66
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define BTLessStrategyNumber
Definition: stratnum.h:29
bool isunique
Definition: nbtsort.c:204
BTSpool * spool
Definition: nbtsort.c:208
BTLeader * btleader
Definition: nbtsort.c:222
bool nulls_not_distinct
Definition: nbtsort.c:205
bool havedead
Definition: nbtsort.c:206
Relation heap
Definition: nbtsort.c:207
BTSpool * spool2
Definition: nbtsort.c:214
double indtuples
Definition: nbtsort.c:215
Size maxpostingsize
Definition: nbtree.h:870
ItemPointer htids
Definition: nbtree.h:878
bool deduplicate
Definition: nbtree.h:868
OffsetNumber baseoff
Definition: nbtree.h:874
Size basetupsize
Definition: nbtree.h:875
IndexTuple base
Definition: nbtree.h:873
Size phystupsize
Definition: nbtree.h:881
ParallelContext * pcxt
Definition: nbtsort.c:168
BTShared * btshared
Definition: nbtsort.c:188
Sharedsort * sharedsort
Definition: nbtsort.c:189
Sharedsort * sharedsort2
Definition: nbtsort.c:190
int nparticipanttuplesorts
Definition: nbtsort.c:176
BufferUsage * bufferusage
Definition: nbtsort.c:193
Snapshot snapshot
Definition: nbtsort.c:191
WalUsage * walusage
Definition: nbtsort.c:192
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
IndexTuple btps_lowkey
Definition: nbtsort.c:233
Size btps_full
Definition: nbtsort.c:237
BulkWriteBuffer btps_buf
Definition: nbtsort.c:231
OffsetNumber btps_lastoff
Definition: nbtsort.c:234
Size btps_lastextra
Definition: nbtsort.c:235
BlockNumber btps_blkno
Definition: nbtsort.c:232
struct BTPageState * btps_next
Definition: nbtsort.c:238
uint32 btps_level
Definition: nbtsort.c:236
bool allequalimage
Definition: nbtree.h:787
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:793
slock_t mutex
Definition: nbtsort.c:122
bool isconcurrent
Definition: nbtsort.c:105
double indtuples
Definition: nbtsort.c:143
double reltuples
Definition: nbtsort.c:141
Oid heaprelid
Definition: nbtsort.c:101
bool brokenhotchain
Definition: nbtsort.c:144
bool isunique
Definition: nbtsort.c:103
int nparticipantsdone
Definition: nbtsort.c:140
ConditionVariable workersdonecv
Definition: nbtsort.c:114
int scantuplesortstates
Definition: nbtsort.c:106
Oid indexrelid
Definition: nbtsort.c:102
bool havedead
Definition: nbtsort.c:142
bool nulls_not_distinct
Definition: nbtsort.c:104
bool isunique
Definition: nbtsort.c:85
bool nulls_not_distinct
Definition: nbtsort.c:86
Relation heap
Definition: nbtsort.c:83
Relation index
Definition: nbtsort.c:84
Tuplesortstate * sortstate
Definition: nbtsort.c:82
BulkWriteState * bulkstate
Definition: nbtsort.c:248
Relation heap
Definition: nbtsort.c:246
Relation index
Definition: nbtsort.c:247
BlockNumber btws_pages_alloced
Definition: nbtsort.c:250
BTScanInsert inskey
Definition: nbtsort.c:249
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
bool ii_Unique
Definition: execnodes.h:197
bool ii_BrokenHotChain
Definition: execnodes.h:203
bool ii_NullsNotDistinct
Definition: execnodes.h:198
int ii_ParallelWorkers
Definition: execnodes.h:205
bool ii_Concurrent
Definition: execnodes.h:202
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
uint8 statusFlags
Definition: proc.h:238
dsm_segment * seg
Definition: parallel.h:42
shm_toc_estimator estimator
Definition: parallel.h:41
shm_toc * toc
Definition: parallel.h:44
int nworkers_launched
Definition: parallel.h:37
int sk_flags
Definition: skey.h:66
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67
Sharedsort * sharedsort
Definition: tuplesort.h:58
AttrNumber ssup_attno
Definition: sortsupport.h:81
bool ssup_nulls_first
Definition: sortsupport.h:75
MemoryContext ssup_cxt
Definition: sortsupport.h:66
Definition: type.h:95
Definition: regguts.h:323
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition: tableam.c:165
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition: tableam.c:130
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition: tableam.c:145
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1785
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1385
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2976
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2955
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:971
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2999
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:61
#define TUPLESORT_NONE
Definition: tuplesort.h:93
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, const Datum *values, const bool *isnull)
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)
void ExitParallelMode(void)
Definition: xact.c:1061
void EnterParallelMode(void)
Definition: xact.c:1048