nbtsort.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  * Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level. We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it to its
12  * parent level (starting a new parent level if necessary). When
13  * done, we write out each final page on each level, adding it to
14  * its parent level. When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree). The steady-state load
20  * factor for btrees is usually estimated at 70%. We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%. This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * Formerly the index pages being built were kept in shared buffers, but
27  * that is of no value (since other backends have no interest in them yet)
28  * and it created locking problems for CHECKPOINT, because the upper-level
29  * pages were held exclusive-locked for long periods. Now we just build
30  * the pages in local memory and smgrwrite or smgrextend them as we finish
31  * them. They will need to be re-read into shared buffers on first use after
32  * the build finishes.
33  *
34  * This code isn't concerned about the FSM at all. The caller is responsible
35  * for initializing that.
36  *
37  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
38  * Portions Copyright (c) 1994, Regents of the University of California
39  *
40  * IDENTIFICATION
41  * src/backend/access/nbtree/nbtsort.c
42  *
43  *-------------------------------------------------------------------------
44  */
45 
46 #include "postgres.h"
47 
48 #include "access/nbtree.h"
49 #include "access/parallel.h"
50 #include "access/relscan.h"
51 #include "access/table.h"
52 #include "access/xact.h"
53 #include "access/xlog.h"
54 #include "access/xloginsert.h"
55 #include "catalog/index.h"
56 #include "commands/progress.h"
57 #include "executor/instrument.h"
58 #include "miscadmin.h"
59 #include "pgstat.h"
60 #include "storage/smgr.h"
61 #include "tcop/tcopprot.h" /* pgrminclude ignore */
62 #include "utils/rel.h"
63 #include "utils/sortsupport.h"
64 #include "utils/tuplesort.h"
65 
66 
67 /* Magic numbers for parallel state sharing */
68 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
69 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
70 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
71 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
72 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
73 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
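/*
 * (Note: these are the shm_toc keys under which _bt_begin_parallel() publishes
 * each piece of shared state with shm_toc_insert(), and under which workers
 * retrieve it again with shm_toc_lookup() in _bt_parallel_build_main().)
 */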
74 
75 /*
76  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
77  * parallel index builds. This may be useful as a debugging aid.
78 #undef DISABLE_LEADER_PARTICIPATION
79  */
80 
81 /*
82  * Status record for spooling/sorting phase. (Note we may have two of
83  * these due to the special requirements for uniqueness-checking with
84  * dead tuples.)
85  */
86 typedef struct BTSpool
87 {
88  Tuplesortstate *sortstate; /* state data for tuplesort.c */
89  Relation heap;
90  Relation index;
91  bool isunique;
92 } BTSpool;
93 
94 /*
95  * Status for index builds performed in parallel. This is allocated in a
96  * dynamic shared memory segment. Note that there is a separate tuplesort TOC
97  * entry, private to tuplesort.c but allocated by this module on its behalf.
98  */
99 typedef struct BTShared
100 {
101  /*
102  * These fields are not modified during the sort. They primarily exist
103  * for the benefit of worker processes that need to create BTSpool state
104  * corresponding to that used by the leader.
105  */
106  Oid heaprelid;
107  Oid indexrelid;
108  bool isunique;
109  bool isconcurrent;
110  int scantuplesortstates;
111 
112  /*
113  * workersdonecv is used to monitor the progress of workers. All parallel
114  * participants must indicate that they are done before leader can use
115  * mutable state that workers maintain during scan (and before leader can
116  * proceed to tuplesort_performsort()).
117  */
118  ConditionVariable workersdonecv;
119 
120  /*
121  * mutex protects the mutable state below, which workers update during the
121  * scan and report back to the leader.
122  *
123  * These fields contain status information of interest to B-Tree index
124  * builds that must work just the same when an index is built in parallel.
125  */
126  slock_t mutex;
127 
128  /*
129  * Mutable state that is maintained by workers, and reported back to
130  * leader at end of parallel scan.
131  *
132  * nparticipantsdone is number of worker processes finished.
133  *
134  * reltuples is the total number of input heap tuples.
135  *
136  * havedead indicates if RECENTLY_DEAD tuples were encountered during
137  * build.
138  *
139  * indtuples is the total number of tuples that made it into the index.
140  *
141  * brokenhotchain indicates if any worker detected a broken HOT chain
142  * during build.
143  */
144  int nparticipantsdone;
145  double reltuples;
146  bool havedead;
147  double indtuples;
148  bool brokenhotchain;
149 
150  /*
151  * ParallelTableScanDescData data follows. Can't directly embed here, as
152  * implementations of the parallel table scan desc interface might need
153  * stronger alignment.
154  */
155 } BTShared;
156 
157 /*
158  * Return pointer to a BTShared's parallel table scan.
159  *
160  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
161  * MAXALIGN.
162  */
163 #define ParallelTableScanFromBTShared(shared) \
164  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
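/*
 * (Note: _bt_parallel_estimate_shared() sizes the shared chunk to match this
 * layout -- BUFFERALIGN(sizeof(BTShared)) plus table_parallelscan_estimate() --
 * so the macro above points exactly at the embedded scan descriptor.)
 */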
165 
166 /*
167  * Status for leader in parallel index build.
168  */
169 typedef struct BTLeader
170 {
171  /* parallel context itself */
172  ParallelContext *pcxt;
173 
174  /*
175  * nparticipanttuplesorts is the exact number of worker processes
176  * successfully launched, plus one leader process if it participates as a
177  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
178  * participating as a worker).
179  */
180  int nparticipanttuplesorts;
181 
182  /*
183  * Leader process convenience pointers to shared state (leader avoids TOC
184  * lookups).
185  *
186  * btshared is the shared state for entire build. sharedsort is the
187  * shared, tuplesort-managed state passed to each process tuplesort.
188  * sharedsort2 is the corresponding btspool2 shared state, used only when
189  * building unique indexes. snapshot is the snapshot used by the scan iff
190  * an MVCC snapshot is required.
191  */
192  BTShared *btshared;
193  Sharedsort *sharedsort;
194  Sharedsort *sharedsort2;
195  Snapshot snapshot;
196  WalUsage *walusage;
197  BufferUsage *bufferusage;
198 } BTLeader;
199 
200 /*
201  * Working state for btbuild and its callback.
202  *
203  * When parallel CREATE INDEX is used, there is a BTBuildState for each
204  * participant.
205  */
206 typedef struct BTBuildState
207 {
208  bool isunique;
209  bool havedead;
210  Relation heap;
211  BTSpool *spool;
212 
213  /*
214  * spool2 is needed only when the index is a unique index. Dead tuples are
215  * put into spool2 instead of spool in order to avoid uniqueness check.
216  */
217  BTSpool *spool2;
218  double indtuples;
219 
220  /*
221  * btleader is only present when a parallel index build is performed, and
222  * only in the leader process. (Actually, only the leader has a
223  * BTBuildState. Workers have their own spool and spool2, though.)
224  */
225  BTLeader *btleader;
226 } BTBuildState;
227 
228 /*
229  * Status record for a btree page being built. We have one of these
230  * for each active tree level.
231  */
232 typedef struct BTPageState
233 {
234  Page btps_page; /* workspace for page building */
235  BlockNumber btps_blkno; /* block # to write this page at */
236  IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
237  OffsetNumber btps_lastoff; /* last item offset loaded */
238  Size btps_lastextra; /* last item's extra posting list space */
239  uint32 btps_level; /* tree level (0 = leaf) */
240  Size btps_full; /* "full" if less than this much free space */
241  struct BTPageState *btps_next; /* link to parent level, if any */
242 } BTPageState;
243 
244 /*
245  * Overall status record for index writing phase.
246  */
247 typedef struct BTWriteState
248 {
249  Relation heap;
250  Relation index;
251  BTScanInsert inskey; /* generic insertion scankey */
252  bool btws_use_wal; /* dump pages to WAL? */
253  BlockNumber btws_pages_alloced; /* # pages allocated */
254  BlockNumber btws_pages_written; /* # pages written out */
255  Page btws_zeropage; /* workspace for filling zeroes */
256 } BTWriteState;
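/*
 * (Note: btws_pages_alloced can run ahead of btws_pages_written, because
 * upper-level pages are assigned block numbers when started but only written
 * when finished; _bt_blwritepage() zero-fills any intervening gap before an
 * out-of-order page is finally written.)
 */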
257 
258 
259 static double _bt_spools_heapscan(Relation heap, Relation index,
260  BTBuildState *buildstate, IndexInfo *indexInfo);
261 static void _bt_spooldestroy(BTSpool *btspool);
262 static void _bt_spool(BTSpool *btspool, ItemPointer self,
263  Datum *values, bool *isnull);
264 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
265 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
266  bool *isnull, bool tupleIsAlive, void *state);
267 static Page _bt_blnewpage(uint32 level);
268 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
269 static void _bt_slideleft(Page rightmostpage);
270 static void _bt_sortaddtup(Page page, Size itemsize,
271  IndexTuple itup, OffsetNumber itup_off,
272  bool newfirstdataitem);
273 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
274  IndexTuple itup, Size truncextra);
275 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
276  BTPageState *state,
277  BTDedupState dstate);
278 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
279 static void _bt_load(BTWriteState *wstate,
280  BTSpool *btspool, BTSpool *btspool2);
281 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
282  int request);
283 static void _bt_end_parallel(BTLeader *btleader);
284 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
285 static double _bt_parallel_heapscan(BTBuildState *buildstate,
286  bool *brokenhotchain);
287 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
288 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
289  BTShared *btshared, Sharedsort *sharedsort,
290  Sharedsort *sharedsort2, int sortmem,
291  bool progress);
292 
293 
294 /*
295  * btbuild() -- build a new btree index.
296  */
297 IndexBuildResult *
298 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
299 {
300  IndexBuildResult *result;
301  BTBuildState buildstate;
302  double reltuples;
303 
304 #ifdef BTREE_BUILD_STATS
305  if (log_btree_build_stats)
306  ResetUsage();
307 #endif /* BTREE_BUILD_STATS */
308 
309  buildstate.isunique = indexInfo->ii_Unique;
310  buildstate.havedead = false;
311  buildstate.heap = heap;
312  buildstate.spool = NULL;
313  buildstate.spool2 = NULL;
314  buildstate.indtuples = 0;
315  buildstate.btleader = NULL;
316 
317  /*
318  * We expect to be called exactly once for any index relation. If that's
319  * not the case, big trouble's what we have.
320  */
321  if (RelationGetNumberOfBlocks(index) != 0)
322  elog(ERROR, "index \"%s\" already contains data",
323  RelationGetRelationName(index));
324 
325  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
326 
327  /*
328  * Finish the build by (1) completing the sort of the spool file, (2)
329  * inserting the sorted tuples into btree pages and (3) building the upper
330  * levels. Finally, it may also be necessary to end use of parallelism.
331  */
332  _bt_leafbuild(buildstate.spool, buildstate.spool2);
333  _bt_spooldestroy(buildstate.spool);
334  if (buildstate.spool2)
335  _bt_spooldestroy(buildstate.spool2);
336  if (buildstate.btleader)
337  _bt_end_parallel(buildstate.btleader);
338 
339  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
340 
341  result->heap_tuples = reltuples;
342  result->index_tuples = buildstate.indtuples;
343 
344 #ifdef BTREE_BUILD_STATS
345  if (log_btree_build_stats)
346  {
347  ShowUsage("BTREE BUILD STATS");
348  ResetUsage();
349  }
350 #endif /* BTREE_BUILD_STATS */
351 
352  return result;
353 }
354 
355 /*
356  * Create and initialize one or two spool structures, and save them in caller's
357  * buildstate argument. May also fill in fields within indexInfo used by index
358  * builds.
359  *
360  * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
361  * routine encapsulates all aspects of managing parallelism. Caller need only
362  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
363  *
364  * Returns the total number of heap tuples scanned.
365  */
366 static double
367 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
368  IndexInfo *indexInfo)
369 {
370  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
371  SortCoordinate coordinate = NULL;
372  double reltuples = 0;
373 
374  /*
375  * We size the sort area as maintenance_work_mem rather than work_mem to
376  * speed index creation. This should be OK since a single backend can't
377  * run multiple index creations in parallel (see also: notes on
378  * parallelism and maintenance_work_mem below).
379  */
380  btspool->heap = heap;
381  btspool->index = index;
382  btspool->isunique = indexInfo->ii_Unique;
383 
384  /* Save as primary spool */
385  buildstate->spool = btspool;
386 
387  /* Report table scan phase started */
388  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
389  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
390 
391  /* Attempt to launch parallel worker scan when required */
392  if (indexInfo->ii_ParallelWorkers > 0)
393  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
394  indexInfo->ii_ParallelWorkers);
395 
396  /*
397  * If parallel build requested and at least one worker process was
398  * successfully launched, set up coordination state
399  */
400  if (buildstate->btleader)
401  {
402  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
403  coordinate->isWorker = false;
404  coordinate->nParticipants =
405  buildstate->btleader->nparticipanttuplesorts;
406  coordinate->sharedsort = buildstate->btleader->sharedsort;
407  }
408 
409  /*
410  * Begin serial/leader tuplesort.
411  *
412  * In cases where parallelism is involved, the leader receives the same
413  * share of maintenance_work_mem as a serial sort (it is generally treated
414  * in the same way as a serial sort once we return). Parallel worker
415  * Tuplesortstates will have received only a fraction of
416  * maintenance_work_mem, though.
417  *
418  * We rely on the lifetime of the Leader Tuplesortstate almost not
419  * overlapping with any worker Tuplesortstate's lifetime. There may be
420  * some small overlap, but that's okay because we rely on leader
421  * Tuplesortstate only allocating a small, fixed amount of memory here.
422  * When its tuplesort_performsort() is called (by our caller), and
423  * significant amounts of memory are likely to be used, all workers must
424  * have already freed almost all memory held by their Tuplesortstates
425  * (they are about to go away completely, too). The overall effect is
426  * that maintenance_work_mem always represents an absolute high watermark
427  * on the amount of memory used by a CREATE INDEX operation, regardless of
428  * the use of parallelism or any other factor.
429  */
430  buildstate->spool->sortstate =
431  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
432  maintenance_work_mem, coordinate,
433  false);
434 
435  /*
436  * If building a unique index, put dead tuples in a second spool to keep
437  * them out of the uniqueness check. We expect that the second spool (for
438  * dead tuples) won't get very full, so we give it only work_mem.
439  */
440  if (indexInfo->ii_Unique)
441  {
442  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
443  SortCoordinate coordinate2 = NULL;
444 
445  /* Initialize secondary spool */
446  btspool2->heap = heap;
447  btspool2->index = index;
448  btspool2->isunique = false;
449  /* Save as secondary spool */
450  buildstate->spool2 = btspool2;
451 
452  if (buildstate->btleader)
453  {
454  /*
455  * Set up non-private state that is passed to
456  * tuplesort_begin_index_btree() about the basic high level
457  * coordination of a parallel sort.
458  */
459  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
460  coordinate2->isWorker = false;
461  coordinate2->nParticipants =
462  buildstate->btleader->nparticipanttuplesorts;
463  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464  }
465 
466  /*
467  * We expect that the second one (for dead tuples) won't get very
468  * full, so we give it only work_mem
469  */
470  buildstate->spool2->sortstate =
471  tuplesort_begin_index_btree(heap, index, false, work_mem,
472  coordinate2, false);
473  }
474 
475  /* Fill spool using either serial or parallel heap scan */
476  if (!buildstate->btleader)
477  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
478  _bt_build_callback, (void *) buildstate,
479  NULL);
480  else
481  reltuples = _bt_parallel_heapscan(buildstate,
482  &indexInfo->ii_BrokenHotChain);
483 
484  /*
485  * Set the progress target for the next phase. Reset the block number
486  * values set by table_index_build_scan
487  */
488  {
489  const int progress_index[] = {
490  PROGRESS_CREATEIDX_TUPLES_TOTAL,
491  PROGRESS_SCAN_BLOCKS_TOTAL,
492  PROGRESS_SCAN_BLOCKS_DONE
493  };
494  const int64 progress_vals[] = {
495  buildstate->indtuples,
496  0, 0
497  };
498 
499  pgstat_progress_update_multi_param(3, progress_index, progress_vals);
500  }
501 
502  /* okay, all heap tuples are spooled */
503  if (buildstate->spool2 && !buildstate->havedead)
504  {
505  /* spool2 turns out to be unnecessary */
506  _bt_spooldestroy(buildstate->spool2);
507  buildstate->spool2 = NULL;
508  }
509 
510  return reltuples;
511 }
512 
513 /*
514  * clean up a spool structure and its substructures.
515  */
516 static void
517 _bt_spooldestroy(BTSpool *btspool)
518 {
519  tuplesort_end(btspool->sortstate);
520  pfree(btspool);
521 }
522 
523 /*
524  * spool an index entry into the sort file.
525  */
526 static void
527 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
528 {
529  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
530  self, values, isnull);
531 }
532 
533 /*
534  * given a spool loaded by successive calls to _bt_spool,
535  * create an entire btree.
536  */
537 static void
538 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
539 {
540  BTWriteState wstate;
541 
542 #ifdef BTREE_BUILD_STATS
543  if (log_btree_build_stats)
544  {
545  ShowUsage("BTREE BUILD (Spool) STATISTICS");
546  ResetUsage();
547  }
548 #endif /* BTREE_BUILD_STATS */
549 
550  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
551  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
552  tuplesort_performsort(btspool->sortstate);
553  if (btspool2)
554  {
555  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
556  PROGRESS_BTREE_PHASE_PERFORMSORT_2);
557  tuplesort_performsort(btspool2->sortstate);
558  }
559 
560  wstate.heap = btspool->heap;
561  wstate.index = btspool->index;
562  wstate.inskey = _bt_mkscankey(wstate.index, NULL);
563  /* _bt_mkscankey() won't set allequalimage without metapage */
564  wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
565  wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
566 
567  /* reserve the metapage */
568  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
569  wstate.btws_pages_written = 0;
570  wstate.btws_zeropage = NULL; /* until needed */
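 /*
  * (Note: BTREE_METAPAGE is block 0, so starting btws_pages_alloced at
  * BTREE_METAPAGE + 1 reserves block 0; the finished metapage is written
  * there by _bt_uppershutdown() as the final step of the build.)
  */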
571 
572  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
573  PROGRESS_BTREE_PHASE_LEAF_LOAD);
574  _bt_load(&wstate, btspool, btspool2);
575 }
576 
577 /*
578  * Per-tuple callback for table_index_build_scan
579  */
580 static void
581 _bt_build_callback(Relation index,
582  ItemPointer tid,
583  Datum *values,
584  bool *isnull,
585  bool tupleIsAlive,
586  void *state)
587 {
588  BTBuildState *buildstate = (BTBuildState *) state;
589 
590  /*
591  * insert the index tuple into the appropriate spool file for subsequent
592  * processing
593  */
594  if (tupleIsAlive || buildstate->spool2 == NULL)
595  _bt_spool(buildstate->spool, tid, values, isnull);
596  else
597  {
598  /* dead tuples are put into spool2 */
599  buildstate->havedead = true;
600  _bt_spool(buildstate->spool2, tid, values, isnull);
601  }
602 
603  buildstate->indtuples += 1;
604 }
605 
606 /*
607  * allocate workspace for a new, clean btree page, not linked to any siblings.
608  */
609 static Page
610 _bt_blnewpage(uint32 level)
611 {
612  Page page;
613  BTPageOpaque opaque;
614 
615  page = (Page) palloc(BLCKSZ);
616 
617  /* Zero the page and set up standard page header info */
618  _bt_pageinit(page, BLCKSZ);
619 
620  /* Initialize BT opaque state */
621  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
622  opaque->btpo_prev = opaque->btpo_next = P_NONE;
623  opaque->btpo_level = level;
624  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
625  opaque->btpo_cycleid = 0;
626 
627  /* Make the P_HIKEY line pointer appear allocated */
628  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
629 
630  return page;
631 }
632 
633 /*
634  * emit a completed btree page, and release the working storage.
635  */
636 static void
637 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
638 {
639  /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
640  RelationOpenSmgr(wstate->index);
641 
642  /* XLOG stuff */
643  if (wstate->btws_use_wal)
644  {
645  /* We use the XLOG_FPI record type for this */
646  log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
647  }
648 
649  /*
650  * If we have to write pages nonsequentially, fill in the space with
651  * zeroes until we come back and overwrite. This is not logically
652  * necessary on standard Unix filesystems (unwritten space will read as
653  * zeroes anyway), but it should help to avoid fragmentation. The dummy
654  * pages aren't WAL-logged though.
655  */
656  while (blkno > wstate->btws_pages_written)
657  {
658  if (!wstate->btws_zeropage)
659  wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
660  /* don't set checksum for all-zero page */
661  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
662  wstate->btws_pages_written++,
663  (char *) wstate->btws_zeropage,
664  true);
665  }
666 
667  PageSetChecksumInplace(page, blkno);
668 
669  /*
670  * Now write the page. There's no need for smgr to schedule an fsync for
671  * this write; we'll do it ourselves before ending the build.
672  */
673  if (blkno == wstate->btws_pages_written)
674  {
675  /* extending the file... */
676  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
677  (char *) page, true);
678  wstate->btws_pages_written++;
679  }
680  else
681  {
682  /* overwriting a block we zero-filled before */
683  smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
684  (char *) page, true);
685  }
686 
687  pfree(page);
688 }
689 
690 /*
691  * allocate and initialize a new BTPageState. the returned structure
692  * is suitable for immediate use by _bt_buildadd.
693  */
694 static BTPageState *
695 _bt_pagestate(BTWriteState *wstate, uint32 level)
696 {
697  BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
698 
699  /* create initial page for level */
700  state->btps_page = _bt_blnewpage(level);
701 
702  /* and assign it a page position */
703  state->btps_blkno = wstate->btws_pages_alloced++;
704 
705  state->btps_lowkey = NULL;
706  /* initialize lastoff so first item goes into P_FIRSTKEY */
707  state->btps_lastoff = P_HIKEY;
708  state->btps_lastextra = 0;
709  state->btps_level = level;
710  /* set "full" threshold based on level. See notes at head of file. */
711  if (level > 0)
712  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
713  else
714  state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
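 /*
  * (Note: with the default 8kB BLCKSZ this means an internal page counts as
  * "full" once less than 8192 * 30 / 100 = 2457 bytes remain free, and a leaf
  * page once less than 8192 * 10 / 100 = 819 bytes remain free at the default
  * leaf fillfactor of 90.)
  */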
715 
716  /* no parent level, yet */
717  state->btps_next = NULL;
718 
719  return state;
720 }
721 
722 /*
723  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
724  * P_HIKEY, overwriting P_HIKEY).
725  *
726  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
727  * rightmost page on its level is not supposed to get a high key. Now that
728  * it's clear that this page is a rightmost page, remove the unneeded empty
729  * P_HIKEY line pointer space.
730  */
731 static void
732 _bt_slideleft(Page rightmostpage)
733 {
734  OffsetNumber off;
735  OffsetNumber maxoff;
736  ItemId previi;
737 
738  maxoff = PageGetMaxOffsetNumber(rightmostpage);
739  Assert(maxoff >= P_FIRSTKEY);
740  previi = PageGetItemId(rightmostpage, P_HIKEY);
741  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
742  {
743  ItemId thisii = PageGetItemId(rightmostpage, off);
744 
745  *previi = *thisii;
746  previi = thisii;
747  }
748  ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
749 }
750 
751 /*
752  * Add an item to a page being built.
753  *
754  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
755  * raises an error directly.
756  *
757  * Note that our nbtsort.c caller does not know yet if the page will be
758  * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
759  * caller. Page that turns out to be the rightmost on its level is fixed by
760  * calling _bt_slideleft().
761  */
762 static void
763 _bt_sortaddtup(Page page,
764  Size itemsize,
765  IndexTuple itup,
766  OffsetNumber itup_off,
767  bool newfirstdataitem)
768 {
769  IndexTupleData trunctuple;
770 
771  if (newfirstdataitem)
772  {
773  trunctuple = *itup;
774  trunctuple.t_info = sizeof(IndexTupleData);
775  BTreeTupleSetNAtts(&trunctuple, 0, false);
776  itup = &trunctuple;
777  itemsize = sizeof(IndexTupleData);
778  }
779 
780  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
781  false, false) == InvalidOffsetNumber)
782  elog(ERROR, "failed to add item to the index page");
783 }
784 
785 /*----------
786  * Add an item to a disk page from the sort output (or add a posting list
787  * item formed from the sort output).
788  *
789  * We must be careful to observe the page layout conventions of nbtsearch.c:
790  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
791  * - on non-leaf pages, the key portion of the first item need not be
792  * stored, we should store only the link.
793  *
794  * A leaf page being built looks like:
795  *
796  * +----------------+---------------------------------+
797  * | PageHeaderData | linp0 linp1 linp2 ... |
798  * +-----------+----+---------------------------------+
799  * | ... linpN | |
800  * +-----------+--------------------------------------+
801  * | ^ last |
802  * | |
803  * +-------------+------------------------------------+
804  * | | itemN ... |
805  * +-------------+------------------+-----------------+
806  * | ... item3 item2 item1 | "special space" |
807  * +--------------------------------+-----------------+
808  *
809  * Contrast this with the diagram in bufpage.h; note the mismatch
810  * between linps and items. This is because we reserve linp0 as a
811  * placeholder for the pointer to the "high key" item; when we have
812  * filled up the page, we will set linp0 to point to itemN and clear
813  * linpN. On the other hand, if we find this is the last (rightmost)
814  * page, we leave the items alone and slide the linp array over. If
815  * the high key is to be truncated, offset 1 is deleted, and we insert
816  * the truncated high key at offset 1.
817  *
818  * 'last' pointer indicates the last offset added to the page.
819  *
820  * 'truncextra' is the size of the posting list in itup, if any. This
821  * information is stashed for the next call here, when we may benefit
822  * from considering the impact of truncating away the posting list on
823  * the page before deciding to finish the page off. Posting lists are
824  * often relatively large, so it is worth going to the trouble of
825  * accounting for the saving from truncating away the posting list of
826  * the tuple that becomes the high key (that may be the only way to
827  * get close to target free space on the page). Note that this is
828  * only used for the soft fillfactor-wise limit, not the critical hard
829  * limit.
830  *----------
831  */
832 static void
833 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
834  Size truncextra)
835 {
836  Page npage;
837  BlockNumber nblkno;
838  OffsetNumber last_off;
839  Size last_truncextra;
840  Size pgspc;
841  Size itupsz;
842  bool isleaf;
843 
844  /*
845  * This is a handy place to check for cancel interrupts during the btree
846  * load phase of index creation.
847  */
848  CHECK_FOR_INTERRUPTS();
849 
850  npage = state->btps_page;
851  nblkno = state->btps_blkno;
852  last_off = state->btps_lastoff;
853  last_truncextra = state->btps_lastextra;
854  state->btps_lastextra = truncextra;
855 
856  pgspc = PageGetFreeSpace(npage);
857  itupsz = IndexTupleSize(itup);
858  itupsz = MAXALIGN(itupsz);
859  /* Leaf case has slightly different rules due to suffix truncation */
860  isleaf = (state->btps_level == 0);
861 
862  /*
863  * Check whether the new item can fit on a btree page on current level at
864  * all.
865  *
866  * Every newly built index will treat heap TID as part of the keyspace,
867  * which imposes the requirement that new high keys must occasionally have
868  * a heap TID appended within _bt_truncate(). That may leave a new pivot
869  * tuple one or two MAXALIGN() quantums larger than the original
870  * firstright tuple it's derived from. v4 deals with the problem by
871  * decreasing the limit on the size of tuples inserted on the leaf level
872  * by the same small amount. Enforce the new v4+ limit on the leaf level,
873  * and the old limit on internal levels, since pivot tuples may need to
874  * make use of the reserved space. This should never fail on internal
875  * pages.
876  */
877  if (unlikely(itupsz > BTMaxItemSize(npage)))
878  _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
879  itup);
880 
881  /*
882  * Check to see if current page will fit new item, with space left over to
883  * append a heap TID during suffix truncation when page is a leaf page.
884  *
885  * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
886  * high key with heap TID when finishing off a leaf page, since we rely on
887  * _bt_check_third_page() rejecting oversized non-pivot tuples. On
888  * internal pages we can always fit 3 pivot tuples with larger internal
889  * page tuple limit (includes page high key).
890  *
891  * Most of the time, a page is only "full" in the sense that the soft
892  * fillfactor-wise limit has been exceeded. However, we must always leave
893  * at least two items plus a high key on each page before starting a new
894  * page. Disregard fillfactor and insert on "full" current page if we
895  * don't have the minimum number of items yet. (Note that we deliberately
896  * assume that suffix truncation neither enlarges nor shrinks new high key
897  * when applying soft limit, except when last tuple has a posting list.)
898  */
899  Assert(last_truncextra == 0 || isleaf);
900  if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
901  (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
902  {
903  /*
904  * Finish off the page and write it out.
905  */
906  Page opage = npage;
907  BlockNumber oblkno = nblkno;
908  ItemId ii;
909  ItemId hii;
910  IndexTuple oitup;
911 
912  /* Create new page of same level */
913  npage = _bt_blnewpage(state->btps_level);
914 
915  /* and assign it a page position */
916  nblkno = wstate->btws_pages_alloced++;
917 
918  /*
919  * We copy the last item on the page into the new page, and then
920  * rearrange the old page so that the 'last item' becomes its high key
921  * rather than a true data item. There had better be at least two
922  * items on the page already, else the page would be empty of useful
923  * data.
924  */
925  Assert(last_off > P_FIRSTKEY);
926  ii = PageGetItemId(opage, last_off);
927  oitup = (IndexTuple) PageGetItem(opage, ii);
928  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
929  !isleaf);
930 
931  /*
932  * Move 'last' into the high key position on opage. _bt_blnewpage()
933  * allocated empty space for a line pointer when opage was first
934  * created, so this is a matter of rearranging already-allocated space
935  * on page, and initializing high key line pointer. (Actually, leaf
936  * pages must also swap oitup with a truncated version of oitup, which
937  * is sometimes larger than oitup, though never by more than the space
938  * needed to append a heap TID.)
939  */
940  hii = PageGetItemId(opage, P_HIKEY);
941  *hii = *ii;
942  ItemIdSetUnused(ii); /* redundant */
943  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
944 
945  if (isleaf)
946  {
947  IndexTuple lastleft;
948  IndexTuple truncated;
949 
950  /*
951  * Truncate away any unneeded attributes from high key on leaf
952  * level. This is only done at the leaf level because downlinks
953  * in internal pages are either negative infinity items, or get
954  * their contents from copying from one level down. See also:
955  * _bt_split().
956  *
957  * We don't try to bias our choice of split point to make it more
958  * likely that _bt_truncate() can truncate away more attributes,
959  * whereas the split point used within _bt_split() is chosen much
960  * more delicately. Even still, the lastleft and firstright
961  * tuples passed to _bt_truncate() here are at least not fully
962  * equal to each other when deduplication is used, unless there is
963  * a large group of duplicates (also, unique index builds usually
964  * have few or no spool2 duplicates). When the split point is
965  * between two unequal tuples, _bt_truncate() will avoid including
966  * a heap TID in the new high key, which is the most important
967  * benefit of suffix truncation.
968  *
969  * Overwrite the old item with new truncated high key directly.
970  * oitup is already located at the physical beginning of tuple
971  * space, so this should directly reuse the existing tuple space.
972  */
973  ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
974  lastleft = (IndexTuple) PageGetItem(opage, ii);
975 
976  Assert(IndexTupleSize(oitup) > last_truncextra);
977  truncated = _bt_truncate(wstate->index, lastleft, oitup,
978  wstate->inskey);
979  if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
980  IndexTupleSize(truncated)))
981  elog(ERROR, "failed to add high key to the index page");
982  pfree(truncated);
983 
984  /* oitup should continue to point to the page's high key */
985  hii = PageGetItemId(opage, P_HIKEY);
986  oitup = (IndexTuple) PageGetItem(opage, hii);
987  }
988 
989  /*
990  * Link the old page into its parent, using its low key. If we don't
991  * have a parent, we have to create one; this adds a new btree level.
992  */
993  if (state->btps_next == NULL)
994  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
995 
996  Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
997  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
998  BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
999  P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1000  Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1001  !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1002  BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1003  _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1004  pfree(state->btps_lowkey);
1005 
1006  /*
1007  * Save a copy of the high key from the old page. It is also the low
1008  * key for the new page.
1009  */
1010  state->btps_lowkey = CopyIndexTuple(oitup);
1011 
1012  /*
1013  * Set the sibling links for both pages.
1014  */
1015  {
1016  BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1017  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1018 
1019  oopaque->btpo_next = nblkno;
1020  nopaque->btpo_prev = oblkno;
1021  nopaque->btpo_next = P_NONE; /* redundant */
1022  }
1023 
1024  /*
1025  * Write out the old page. We never need to touch it again, so we can
1026  * free the opage workspace too.
1027  */
1028  _bt_blwritepage(wstate, opage, oblkno);
1029 
1030  /*
1031  * Reset last_off to point to new page
1032  */
1033  last_off = P_FIRSTKEY;
1034  }
1035 
1036  /*
1037  * By here, either original page is still the current page, or a new page
1038  * was created that became the current page. Either way, the current page
1039  * definitely has space for new item.
1040  *
1041  * If the new item is the first for its page, it must also be the first
1042  * item on its entire level. On later same-level pages, a low key for a
1043  * page will be copied from the prior page in the code above. Generate a
1044  * minus infinity low key here instead.
1045  */
1046  if (last_off == P_HIKEY)
1047  {
1048  Assert(state->btps_lowkey == NULL);
1049  state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1050  state->btps_lowkey->t_info = sizeof(IndexTupleData);
1051  BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1052  }
1053 
1054  /*
1055  * Add the new item into the current page.
1056  */
1057  last_off = OffsetNumberNext(last_off);
1058  _bt_sortaddtup(npage, itupsz, itup, last_off,
1059  !isleaf && last_off == P_FIRSTKEY);
1060 
1061  state->btps_page = npage;
1062  state->btps_blkno = nblkno;
1063  state->btps_lastoff = last_off;
1064 }
1065 
1066 /*
1067  * Finalize pending posting list tuple, and add it to the index. Final tuple
1068  * is based on saved base tuple, and saved list of heap TIDs.
1069  *
1070  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1071  * using _bt_buildadd().
1072  */
1073 static void
1074 _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1075  BTDedupState dstate)
1076 {
1077  Assert(dstate->nitems > 0);
1078 
1079  if (dstate->nitems == 1)
1080  _bt_buildadd(wstate, state, dstate->base, 0);
1081  else
1082  {
1083  IndexTuple postingtuple;
1084  Size truncextra;
1085 
1086  /* form a tuple with a posting list */
1087  postingtuple = _bt_form_posting(dstate->base,
1088  dstate->htids,
1089  dstate->nhtids);
1090  /* Calculate posting list overhead */
1091  truncextra = IndexTupleSize(postingtuple) -
1092  BTreeTupleGetPostingOffset(postingtuple);
1093 
1094  _bt_buildadd(wstate, state, postingtuple, truncextra);
1095  pfree(postingtuple);
1096  }
1097 
1098  dstate->nmaxitems = 0;
1099  dstate->nhtids = 0;
1100  dstate->nitems = 0;
1101  dstate->phystupsize = 0;
1102 }
1103 
1104 /*
1105  * Finish writing out the completed btree.
1106  */
1107 static void
1108 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1109 {
1110  BTPageState *s;
1111  BlockNumber rootblkno = P_NONE;
1112  uint32 rootlevel = 0;
1113  Page metapage;
1114 
1115  /*
1116  * Each iteration of this loop completes one more level of the tree.
1117  */
1118  for (s = state; s != NULL; s = s->btps_next)
1119  {
1120  BlockNumber blkno;
1121  BTPageOpaque opaque;
1122 
1123  blkno = s->btps_blkno;
1124  opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1125 
1126  /*
1127  * We have to link the last page on this level to somewhere.
1128  *
1129  * If we're at the top, it's the root, so attach it to the metapage.
1130  * Otherwise, add an entry for it to its parent using its low key.
1131  * This may cause the last page of the parent level to split, but
1132  * that's not a problem -- we haven't gotten to it yet.
1133  */
1134  if (s->btps_next == NULL)
1135  {
1136  opaque->btpo_flags |= BTP_ROOT;
1137  rootblkno = blkno;
1138  rootlevel = s->btps_level;
1139  }
1140  else
1141  {
1142  Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1143  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1144  BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1145  P_LEFTMOST(opaque));
1146  Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1147  !P_LEFTMOST(opaque));
1148  BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1149  _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1150  pfree(s->btps_lowkey);
1151  s->btps_lowkey = NULL;
1152  }
1153 
1154  /*
1155  * This is the rightmost page, so the ItemId array needs to be slid
1156  * back one slot. Then we can dump out the page.
1157  */
1158  _bt_slideleft(s->btps_page);
1159  _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1160  s->btps_page = NULL; /* writepage freed the workspace */
1161  }
1162 
1163  /*
1164  * As the last step in the process, construct the metapage and make it
1165  * point to the new root (unless we had no data at all, in which case it's
1166  * set to point to "P_NONE"). This changes the index to the "valid" state
1167  * by filling in a valid magic number in the metapage.
1168  */
1169  metapage = (Page) palloc(BLCKSZ);
1170  _bt_initmetapage(metapage, rootblkno, rootlevel,
1171  wstate->inskey->allequalimage);
1172  _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1173 }
1174 
1175 /*
1176  * Read tuples in correct sort order from tuplesort, and load them into
1177  * btree leaves.
1178  */
1179 static void
1180 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1181 {
1182  BTPageState *state = NULL;
1183  bool merge = (btspool2 != NULL);
1184  IndexTuple itup,
1185  itup2 = NULL;
1186  bool load1;
1187  TupleDesc tupdes = RelationGetDescr(wstate->index);
1188  int i,
1189  keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1190  SortSupport sortKeys;
1191  int64 tuples_done = 0;
1192  bool deduplicate;
1193 
1194  deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1195  BTGetDeduplicateItems(wstate->index);
1196 
1197  if (merge)
1198  {
1199  /*
1200  * Another BTSpool for dead tuples exists. Now we have to merge
1201  * btspool and btspool2.
1202  */
1203 
1204  /* the preparation of merge */
1205  itup = tuplesort_getindextuple(btspool->sortstate, true);
1206  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1207 
1208  /* Prepare SortSupport data for each column */
1209  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1210 
1211  for (i = 0; i < keysz; i++)
1212  {
1213  SortSupport sortKey = sortKeys + i;
1214  ScanKey scanKey = wstate->inskey->scankeys + i;
1215  int16 strategy;
1216 
1217  sortKey->ssup_cxt = CurrentMemoryContext;
1218  sortKey->ssup_collation = scanKey->sk_collation;
1219  sortKey->ssup_nulls_first =
1220  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1221  sortKey->ssup_attno = scanKey->sk_attno;
1222  /* Abbreviation is not supported here */
1223  sortKey->abbreviate = false;
1224 
1225  AssertState(sortKey->ssup_attno != 0);
1226 
1227  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1228  BTGreaterStrategyNumber : BTLessStrategyNumber;
1229 
1230  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1231  }
1232 
1233  for (;;)
1234  {
1235  load1 = true; /* load BTSpool next ? */
1236  if (itup2 == NULL)
1237  {
1238  if (itup == NULL)
1239  break;
1240  }
1241  else if (itup != NULL)
1242  {
1243  int32 compare = 0;
1244 
1245  for (i = 1; i <= keysz; i++)
1246  {
1247  SortSupport entry;
1248  Datum attrDatum1,
1249  attrDatum2;
1250  bool isNull1,
1251  isNull2;
1252 
1253  entry = sortKeys + i - 1;
1254  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1255  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1256 
1257  compare = ApplySortComparator(attrDatum1, isNull1,
1258  attrDatum2, isNull2,
1259  entry);
1260  if (compare > 0)
1261  {
1262  load1 = false;
1263  break;
1264  }
1265  else if (compare < 0)
1266  break;
1267  }
1268 
1269  /*
1270  * If key values are equal, we sort on ItemPointer. This is
1271  * required for btree indexes, since heap TID is treated as an
1272  * implicit last key attribute in order to ensure that all
1273  * keys in the index are physically unique.
1274  */
1275  if (compare == 0)
1276  {
1277  compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1278  Assert(compare != 0);
1279  if (compare > 0)
1280  load1 = false;
1281  }
1282  }
1283  else
1284  load1 = false;
1285 
1286  /* When we see first tuple, create first index page */
1287  if (state == NULL)
1288  state = _bt_pagestate(wstate, 0);
1289 
1290  if (load1)
1291  {
1292  _bt_buildadd(wstate, state, itup, 0);
1293  itup = tuplesort_getindextuple(btspool->sortstate, true);
1294  }
1295  else
1296  {
1297  _bt_buildadd(wstate, state, itup2, 0);
1298  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1299  }
1300 
1301  /* Report progress */
1302  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1303  ++tuples_done);
1304  }
1305  pfree(sortKeys);
1306  }
1307  else if (deduplicate)
1308  {
1309  /* merge is unnecessary, deduplicate into posting lists */
1310  BTDedupState dstate;
1311 
1312  dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1313  dstate->deduplicate = true; /* unused */
1314  dstate->nmaxitems = 0; /* unused */
1315  dstate->maxpostingsize = 0; /* set later */
1316  /* Metadata about base tuple of current pending posting list */
1317  dstate->base = NULL;
1318  dstate->baseoff = InvalidOffsetNumber; /* unused */
1319  dstate->basetupsize = 0;
1320  /* Metadata about current pending posting list TIDs */
1321  dstate->htids = NULL;
1322  dstate->nhtids = 0;
1323  dstate->nitems = 0;
1324  dstate->phystupsize = 0; /* unused */
1325  dstate->nintervals = 0; /* unused */
1326 
1327  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1328  true)) != NULL)
1329  {
1330  /* When we see first tuple, create first index page */
1331  if (state == NULL)
1332  {
1333  state = _bt_pagestate(wstate, 0);
1334 
1335  /*
1336  * Limit size of posting list tuples to 1/10 space we want to
1337  * leave behind on the page, plus space for final item's line
1338  * pointer. This is equal to the space that we'd like to
1339  * leave behind on each leaf page when fillfactor is 90,
1340  * allowing us to get close to fillfactor% space utilization
1341  * when there happen to be a great many duplicates. (This
1342  * makes higher leaf fillfactor settings ineffective when
1343  * building indexes that have many duplicates, but packing
1344  * leaf pages full with few very large tuples doesn't seem
1345  * like a useful goal.)
1346  */
1347  dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1348  sizeof(ItemIdData);
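 /*
  * (Note: assuming 8kB pages and 8-byte MAXALIGN, this limit works out to
  * MAXALIGN_DOWN(819) - sizeof(ItemIdData) = 816 - 4 = 812 bytes per posting
  * list tuple.)
  */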
1349  Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1350  dstate->maxpostingsize <= INDEX_SIZE_MASK);
1351  dstate->htids = palloc(dstate->maxpostingsize);
1352 
1353  /* start new pending posting list with itup copy */
1354  _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1355  InvalidOffsetNumber);
1356  }
1357  else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1358  itup) > keysz &&
1359  _bt_dedup_save_htid(dstate, itup))
1360  {
1361  /*
1362  * Tuple is equal to base tuple of pending posting list. Heap
1363  * TID from itup has been saved in state.
1364  */
1365  }
1366  else
1367  {
1368  /*
1369  * Tuple is not equal to pending posting list tuple, or
1370  * _bt_dedup_save_htid() opted to not merge current item into
1371  * pending posting list.
1372  */
1373  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1374  pfree(dstate->base);
1375 
1376  /* start new pending posting list with itup copy */
1377  _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1378  InvalidOffsetNumber);
1379  }
1380 
1381  /* Report progress */
1382  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1383  ++tuples_done);
1384  }
1385 
1386  if (state)
1387  {
1388  /*
1389  * Handle the last item (there must be a last item when the
1390  * tuplesort returned one or more tuples)
1391  */
1392  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1393  pfree(dstate->base);
1394  pfree(dstate->htids);
1395  }
1396 
1397  pfree(dstate);
1398  }
1399  else
1400  {
1401  /* merging and deduplication are both unnecessary */
1402  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1403  true)) != NULL)
1404  {
1405  /* When we see first tuple, create first index page */
1406  if (state == NULL)
1407  state = _bt_pagestate(wstate, 0);
1408 
1409  _bt_buildadd(wstate, state, itup, 0);
1410 
1411  /* Report progress */
1412  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1413  ++tuples_done);
1414  }
1415  }
1416 
1417  /* Close down final pages and write the metapage */
1418  _bt_uppershutdown(wstate, state);
1419 
1420  /*
1421  * When we WAL-logged index pages, we must nonetheless fsync index files.
1422  * Since we're building outside shared buffers, a CHECKPOINT occurring
1423  * during the build has no way to flush the previously written data to
1424  * disk (indeed it won't know the index even exists). A crash later on
1425  * would replay WAL from the checkpoint, therefore it wouldn't replay our
1426  * earlier WAL entries. If we do not fsync those pages here, they might
1427  * still not be on disk when the crash occurs.
1428  */
1429  if (wstate->btws_use_wal)
1430  {
1431  RelationOpenSmgr(wstate->index);
1432  smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1433  }
1434 }
1435 
1436 /*
1437  * Create parallel context, and launch workers for leader.
1438  *
1439  * buildstate argument should be initialized (with the exception of the
1440  * tuplesort state in spools, which may later be created based on shared
1441  * state initially set up here).
1442  *
1443  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1444  *
1445  * request is the target number of parallel worker processes to launch.
1446  *
1447  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1448  * mode by passing it to _bt_end_parallel() at the very end of its index
1449  * build. If not even a single worker process can be launched, this is
1450  * never set, and caller should proceed with a serial index build.
1451  */
1452 static void
1453 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1454 {
1455  ParallelContext *pcxt;
1456  int scantuplesortstates;
1457  Snapshot snapshot;
1458  Size estbtshared;
1459  Size estsort;
1460  BTShared *btshared;
1461  Sharedsort *sharedsort;
1462  Sharedsort *sharedsort2;
1463  BTSpool *btspool = buildstate->spool;
1464  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1465  WalUsage *walusage;
1466  BufferUsage *bufferusage;
1467  bool leaderparticipates = true;
1468  int querylen;
1469 
1470 #ifdef DISABLE_LEADER_PARTICIPATION
1471  leaderparticipates = false;
1472 #endif
1473 
1474  /*
1475  * Enter parallel mode, and create context for parallel build of btree
1476  * index
1477  */
1478  EnterParallelMode();
1479  Assert(request > 0);
1480  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1481  request);
1482 
1483  scantuplesortstates = leaderparticipates ? request + 1 : request;
1484 
1485  /*
1486  * Prepare for scan of the base relation. In a normal index build, we use
1487  * SnapshotAny because we must retrieve all tuples and do our own time
1488  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1489  * concurrent build, we take a regular MVCC snapshot and index whatever's
1490  * live according to that.
1491  */
1492  if (!isconcurrent)
1493  snapshot = SnapshotAny;
1494  else
1495  snapshot = RegisterSnapshot(GetTransactionSnapshot());
1496 
1497  /*
1498  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1499  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1500  */
1501  estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1502  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1503  estsort = tuplesort_estimate_shared(scantuplesortstates);
1504  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1505 
1506  /*
1507  * Unique case requires a second spool, and so we may have to account for
1508  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1509  */
1510  if (!btspool->isunique)
1511  shm_toc_estimate_keys(&pcxt->estimator, 2);
1512  else
1513  {
1514  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1515  shm_toc_estimate_keys(&pcxt->estimator, 3);
1516  }
1517 
1518  /*
1519  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1520  * and PARALLEL_KEY_BUFFER_USAGE.
1521  *
1522  * If there are no extensions loaded that care, we could skip this. We
1523  * have no way of knowing whether anyone's looking at pgWalUsage or
1524  * pgBufferUsage, so do it unconditionally.
1525  */
1526  shm_toc_estimate_chunk(&pcxt->estimator,
1527  mul_size(sizeof(WalUsage), pcxt->nworkers));
1528  shm_toc_estimate_keys(&pcxt->estimator, 1);
1529  shm_toc_estimate_chunk(&pcxt->estimator,
1530  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1531  shm_toc_estimate_keys(&pcxt->estimator, 1);
1532 
1533  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1534  if (debug_query_string)
1535  {
1536  querylen = strlen(debug_query_string);
1537  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1538  shm_toc_estimate_keys(&pcxt->estimator, 1);
1539  }
1540  else
1541  querylen = 0; /* keep compiler quiet */
1542 
1543  /* Everyone's had a chance to ask for space, so now create the DSM */
1544  InitializeParallelDSM(pcxt);
1545 
1546  /* If no DSM segment was available, back out (do serial build) */
1547  if (pcxt->seg == NULL)
1548  {
1549  if (IsMVCCSnapshot(snapshot))
1550  UnregisterSnapshot(snapshot);
1551  DestroyParallelContext(pcxt);
1552  ExitParallelMode();
1553  return;
1554  }
1555 
1556  /* Store shared build state, for which we reserved space */
1557  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1558  /* Initialize immutable state */
1559  btshared->heaprelid = RelationGetRelid(btspool->heap);
1560  btshared->indexrelid = RelationGetRelid(btspool->index);
1561  btshared->isunique = btspool->isunique;
1562  btshared->isconcurrent = isconcurrent;
1563  btshared->scantuplesortstates = scantuplesortstates;
1564  ConditionVariableInit(&btshared->workersdonecv);
1565  SpinLockInit(&btshared->mutex);
1566  /* Initialize mutable state */
1567  btshared->nparticipantsdone = 0;
1568  btshared->reltuples = 0.0;
1569  btshared->havedead = false;
1570  btshared->indtuples = 0.0;
1571  btshared->brokenhotchain = false;
1572  table_parallelscan_initialize(btspool->heap,
1573  ParallelTableScanFromBTShared(btshared),
1574  snapshot);
1575 
1576  /*
1577  * Store shared tuplesort-private state, for which we reserved space.
1578  * Then, initialize opaque state using tuplesort routine.
1579  */
1580  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1581  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1582  pcxt->seg);
1583 
1584  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1585  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1586 
1587  /* Unique case requires a second spool, and associated shared state */
1588  if (!btspool->isunique)
1589  sharedsort2 = NULL;
1590  else
1591  {
1592  /*
1593  * Store additional shared tuplesort-private state, for which we
1594  * reserved space. Then, initialize opaque state using tuplesort
1595  * routine.
1596  */
1597  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1598  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1599  pcxt->seg);
1600 
1601  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1602  }
1603 
1604  /* Store query string for workers */
1605  if (debug_query_string)
1606  {
1607  char *sharedquery;
1608 
1609  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1610  memcpy(sharedquery, debug_query_string, querylen + 1);
1611  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1612  }
1613 
1614  /*
1615  * Allocate space for each worker's WalUsage and BufferUsage; no need to
1616  * initialize.
1617  */
1618  walusage = shm_toc_allocate(pcxt->toc,
1619  mul_size(sizeof(WalUsage), pcxt->nworkers));
1620  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1621  bufferusage = shm_toc_allocate(pcxt->toc,
1622  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1623  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1624 
1625  /* Launch workers, saving status for leader/caller */
1626  LaunchParallelWorkers(pcxt);
1627  btleader->pcxt = pcxt;
1628  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1629  if (leaderparticipates)
1630  btleader->nparticipanttuplesorts++;
1631  btleader->btshared = btshared;
1632  btleader->sharedsort = sharedsort;
1633  btleader->sharedsort2 = sharedsort2;
1634  btleader->snapshot = snapshot;
1635  btleader->walusage = walusage;
1636  btleader->bufferusage = bufferusage;
1637 
1638  /* If no workers were successfully launched, back out (do serial build) */
1639  if (pcxt->nworkers_launched == 0)
1640  {
1641  _bt_end_parallel(btleader);
1642  return;
1643  }
1644 
1645  /* Save leader state now that it's clear build will be parallel */
1646  buildstate->btleader = btleader;
1647 
1648  /* Join heap scan ourselves */
1649  if (leaderparticipates)
1650  _bt_leader_participate_as_worker(buildstate);
1651 
1652  /*
1653  * Caller needs to wait for all launched workers when we return. Make
1654  * sure that the failure-to-start case will not hang forever.
1655  */
1656  WaitForParallelWorkersToAttach(pcxt);
1657 }
1658 
1659 /*
1660  * Shut down workers, destroy parallel context, and end parallel mode.
1661  */
1662 static void
1663 _bt_end_parallel(BTLeader *btleader)
1664 {
1665  int i;
1666 
1667  /* Shutdown worker processes */
1668  WaitForParallelWorkersToFinish(btleader->pcxt);
1669 
1670  /*
1671  * Next, accumulate WAL usage. (This must wait for the workers to finish,
1672  * or we might get incomplete data.)
1673  */
1674  for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1675  InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1676 
1677  /* Free last reference to MVCC snapshot, if one was used */
1678  if (IsMVCCSnapshot(btleader->snapshot))
1679  UnregisterSnapshot(btleader->snapshot);
1680  DestroyParallelContext(btleader->pcxt);
1681  ExitParallelMode();
1682 }
1683 
1684 /*
1685  * Returns size of shared memory required to store state for a parallel
1686  * btree index build based on the snapshot its parallel scan will use.
1687  */
1688 static Size
1689 _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1690 {
1691  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1692  return add_size(BUFFERALIGN(sizeof(BTShared)),
1693  table_parallelscan_estimate(heap, snapshot));
1694 }
1695 
1696 /*
1697  * Within leader, wait for end of heap scan.
1698  *
1699  * When called, parallel heap scan started by _bt_begin_parallel() will
1700  * already be underway within worker processes (when leader participates
1701  * as a worker, we should end up here just as workers are finishing).
1702  *
1703  * Fills in fields needed for ambuild statistics, and lets caller set
1704  * field indicating that some worker encountered a broken HOT chain.
1705  *
1706  * Returns the total number of heap tuples scanned.
1707  */
1708 static double
1709 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1710 {
1711  BTShared *btshared = buildstate->btleader->btshared;
1712  int nparticipanttuplesorts;
1713  double reltuples;
1714 
1715  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1716  for (;;)
1717  {
1718  SpinLockAcquire(&btshared->mutex);
1719  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1720  {
1721  buildstate->havedead = btshared->havedead;
1722  buildstate->indtuples = btshared->indtuples;
1723  *brokenhotchain = btshared->brokenhotchain;
1724  reltuples = btshared->reltuples;
1725  SpinLockRelease(&btshared->mutex);
1726  break;
1727  }
1728  SpinLockRelease(&btshared->mutex);
1729 
1730  ConditionVariableSleep(&btshared->workersdonecv,
1731  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1732  }
1733 
1734  ConditionVariableCancelSleep();
1735 
1736  return reltuples;
1737 }
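
The loop above is a standard protocol: check a done-count under a lock, and if it is not yet complete, sleep on a condition variable that finished participants signal (see the end of _bt_parallel_scan_and_sort() below). nbtsort.c uses PostgreSQL's spinlock and ConditionVariable primitives; the standalone pthreads analogue below only illustrates the shape of that protocol, with made-up tuple counts (compile with -pthread).

/* Editor's illustrative sketch -- not part of nbtsort.c */
#include <pthread.h>
#include <stdio.h>

#define NPARTICIPANTS 3

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  workersdonecv = PTHREAD_COND_INITIALIZER;
static int    nparticipantsdone = 0;
static double reltuples = 0.0;          /* accumulated by participants */

static void *participant(void *arg)
{
    double mytuples = *(double *) arg;  /* pretend scan result */

    pthread_mutex_lock(&mutex);
    nparticipantsdone++;
    reltuples += mytuples;
    pthread_mutex_unlock(&mutex);
    pthread_cond_signal(&workersdonecv);    /* cf. ConditionVariableSignal() */
    return NULL;
}

int main(void)
{
    pthread_t tids[NPARTICIPANTS];
    double    partials[NPARTICIPANTS] = {100.0, 250.0, 75.0};

    for (int i = 0; i < NPARTICIPANTS; i++)
        pthread_create(&tids[i], NULL, participant, &partials[i]);

    /* "leader": wait until every participant has reported in */
    pthread_mutex_lock(&mutex);
    while (nparticipantsdone != NPARTICIPANTS)
        pthread_cond_wait(&workersdonecv, &mutex);  /* cf. ConditionVariableSleep() */
    pthread_mutex_unlock(&mutex);

    printf("all %d participants done, reltuples = %.0f\n",
           NPARTICIPANTS, reltuples);

    for (int i = 0; i < NPARTICIPANTS; i++)
        pthread_join(tids[i], NULL);
    return 0;
}
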
1738 
1739 /*
1740  * Within leader, participate as a parallel worker.
1741  */
1742 static void
1743 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1744 {
1745  BTLeader *btleader = buildstate->btleader;
1746  BTSpool *leaderworker;
1747  BTSpool *leaderworker2;
1748  int sortmem;
1749 
1750  /* Allocate memory and initialize private spool */
1751  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1752  leaderworker->heap = buildstate->spool->heap;
1753  leaderworker->index = buildstate->spool->index;
1754  leaderworker->isunique = buildstate->spool->isunique;
1755 
1756  /* Initialize second spool, if required */
1757  if (!btleader->btshared->isunique)
1758  leaderworker2 = NULL;
1759  else
1760  {
1761  /* Allocate memory for worker's own private secondary spool */
1762  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1763 
1764  /* Initialize worker's own secondary spool */
1765  leaderworker2->heap = leaderworker->heap;
1766  leaderworker2->index = leaderworker->index;
1767  leaderworker2->isunique = false;
1768  }
1769 
1770  /*
1771  * Might as well use a reliable figure when doling out maintenance_work_mem
1772  * (when the requested number of workers was not launched, this will be
1773  * somewhat higher than it is for the other workers).
1774  */
1775  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1776 
1777  /* Perform work common to all participants */
1778  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1779  btleader->sharedsort, btleader->sharedsort2,
1780  sortmem, true);
1781 
1782 #ifdef BTREE_BUILD_STATS
1783  if (log_btree_build_stats)
1784  {
1785  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1786  ResetUsage();
1787  }
1788 #endif /* BTREE_BUILD_STATS */
1789 }
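
For the memory arithmetic above: maintenance_work_mem (in kB) is split evenly across all participant tuplesorts, i.e. the launched workers plus the leader when it participates, and the secondary spool set up in _bt_parallel_scan_and_sort() below is further capped at work_mem. A minimal sketch of that budget calculation, with hypothetical GUC values:

/* Editor's illustrative sketch -- not part of nbtsort.c */
#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
    int maintenance_work_mem = 65536;   /* kB; assumed value */
    int work_mem = 4096;                /* kB; assumed value */
    int nworkers_launched = 2;
    int leaderparticipates = 1;

    int nparticipanttuplesorts = nworkers_launched + (leaderparticipates ? 1 : 0);
    int sortmem = maintenance_work_mem / nparticipanttuplesorts;

    printf("each of %d participants sorts with %d kB\n",
           nparticipanttuplesorts, sortmem);
    printf("a participant's spool2 (dead tuples) gets %d kB\n",
           min_int(sortmem, work_mem));
    return 0;
}
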
1790 
1791 /*
1792  * Perform work within a launched parallel process.
1793  */
1794 void
1795 _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1796 {
1797  char *sharedquery;
1798  BTSpool *btspool;
1799  BTSpool *btspool2;
1800  BTShared *btshared;
1801  Sharedsort *sharedsort;
1802  Sharedsort *sharedsort2;
1803  Relation heapRel;
1804  Relation indexRel;
1805  LOCKMODE heapLockmode;
1806  LOCKMODE indexLockmode;
1807  WalUsage *walusage;
1808  BufferUsage *bufferusage;
1809  int sortmem;
1810 
1811 #ifdef BTREE_BUILD_STATS
1812  if (log_btree_build_stats)
1813  ResetUsage();
1814 #endif /* BTREE_BUILD_STATS */
1815 
1816  /* Set debug_query_string for individual workers first */
1817  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1818  debug_query_string = sharedquery;
1819 
1820  /* Report the query string from leader */
1821  pgstat_report_activity(STATE_RUNNING, debug_query_string);
1822 
1823  /* Look up nbtree shared state */
1824  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1825 
1826  /* Open relations using lock modes known to be obtained by index.c */
1827  if (!btshared->isconcurrent)
1828  {
1829  heapLockmode = ShareLock;
1830  indexLockmode = AccessExclusiveLock;
1831  }
1832  else
1833  {
1834  heapLockmode = ShareUpdateExclusiveLock;
1835  indexLockmode = RowExclusiveLock;
1836  }
1837 
1838  /* Open relations within worker */
1839  heapRel = table_open(btshared->heaprelid, heapLockmode);
1840  indexRel = index_open(btshared->indexrelid, indexLockmode);
1841 
1842  /* Initialize worker's own spool */
1843  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1844  btspool->heap = heapRel;
1845  btspool->index = indexRel;
1846  btspool->isunique = btshared->isunique;
1847 
1848  /* Look up shared state private to tuplesort.c */
1849  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1850  tuplesort_attach_shared(sharedsort, seg);
1851  if (!btshared->isunique)
1852  {
1853  btspool2 = NULL;
1854  sharedsort2 = NULL;
1855  }
1856  else
1857  {
1858  /* Allocate memory for worker's own private secondary spool */
1859  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1860 
1861  /* Initialize worker's own secondary spool */
1862  btspool2->heap = btspool->heap;
1863  btspool2->index = btspool->index;
1864  btspool2->isunique = false;
1865  /* Look up shared state private to tuplesort.c */
1866  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1867  tuplesort_attach_shared(sharedsort2, seg);
1868  }
1869 
1870  /* Prepare to track buffer usage during parallel execution */
1871  InstrStartParallelQuery();
1872 
1873  /* Perform sorting of spool, and possibly a spool2 */
1874  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1875  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1876  sharedsort2, sortmem, false);
1877 
1878  /* Report WAL/buffer usage during parallel execution */
1879  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1880  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1881  InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1882  &walusage[ParallelWorkerNumber]);
1883 
1884 #ifdef BTREE_BUILD_STATS
1885  if (log_btree_build_stats)
1886  {
1887  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1888  ResetUsage();
1889  }
1890 #endif /* BTREE_BUILD_STATS */
1891 
1892  index_close(indexRel, indexLockmode);
1893  table_close(heapRel, heapLockmode);
1894 }
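
One detail of the usage reporting above: each worker writes only the array element indexed by its own ParallelWorkerNumber, so no locking is needed; the leader sums those slots in _bt_end_parallel() only after WaitForParallelWorkersToFinish(). The sequential stand-in below sketches that slot-per-worker accounting with a made-up usage struct and numbers.

/* Editor's illustrative sketch -- not part of nbtsort.c */
#include <stdio.h>

typedef struct { long wal_records; long shared_blks_read; } sketch_usage;

int main(void)
{
    enum { NWORKERS = 3 };
    sketch_usage usage[NWORKERS] = {{0}};

    /* each "worker" fills in exactly its own slot */
    for (int worker_number = 0; worker_number < NWORKERS; worker_number++)
    {
        usage[worker_number].wal_records = 1000 + worker_number;
        usage[worker_number].shared_blks_read = 50 * (worker_number + 1);
    }

    /* "leader": accumulate only after all workers are known to be finished */
    sketch_usage total = {0};
    for (int i = 0; i < NWORKERS; i++)
    {
        total.wal_records += usage[i].wal_records;
        total.shared_blks_read += usage[i].shared_blks_read;
    }

    printf("totals: %ld WAL records, %ld shared blocks read\n",
           total.wal_records, total.shared_blks_read);
    return 0;
}
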
1895 
1896 /*
1897  * Perform a worker's portion of a parallel sort.
1898  *
1899  * This generates a tuplesort for the passed btspool, and a second tuplesort
1900  * state if a second btspool is needed (i.e. for unique index builds). All
1901  * other spool fields should already be set when this is called.
1902  *
1903  * sortmem is the amount of working memory to use within each worker,
1904  * expressed in KBs.
1905  *
1906  * When this returns, workers are done, and need only release resources.
1907  */
1908 static void
1909 _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1910  BTShared *btshared, Sharedsort *sharedsort,
1911  Sharedsort *sharedsort2, int sortmem, bool progress)
1912 {
1913  SortCoordinate coordinate;
1914  BTBuildState buildstate;
1915  TableScanDesc scan;
1916  double reltuples;
1917  IndexInfo *indexInfo;
1918 
1919  /* Initialize local tuplesort coordination state */
1920  coordinate = palloc0(sizeof(SortCoordinateData));
1921  coordinate->isWorker = true;
1922  coordinate->nParticipants = -1;
1923  coordinate->sharedsort = sharedsort;
1924 
1925  /* Begin "partial" tuplesort */
1926  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1927  btspool->index,
1928  btspool->isunique,
1929  sortmem, coordinate,
1930  false);
1931 
1932  /*
1933  * Just as with the serial case, there may be a second spool. If so, a
1934  * second, dedicated spool2 partial tuplesort is required.
1935  */
1936  if (btspool2)
1937  {
1938  SortCoordinate coordinate2;
1939 
1940  /*
1941  * We expect that the second one (for dead tuples) won't get very
1942  * full, so we give it only work_mem (unless sortmem is less for the
1943  * worker). Worker processes are generally permitted to allocate
1944  * work_mem independently.
1945  */
1946  coordinate2 = palloc0(sizeof(SortCoordinateData));
1947  coordinate2->isWorker = true;
1948  coordinate2->nParticipants = -1;
1949  coordinate2->sharedsort = sharedsort2;
1950  btspool2->sortstate =
1951  tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1952  Min(sortmem, work_mem), coordinate2,
1953  false);
1954  }
1955 
1956  /* Fill in buildstate for _bt_build_callback() */
1957  buildstate.isunique = btshared->isunique;
1958  buildstate.havedead = false;
1959  buildstate.heap = btspool->heap;
1960  buildstate.spool = btspool;
1961  buildstate.spool2 = btspool2;
1962  buildstate.indtuples = 0;
1963  buildstate.btleader = NULL;
1964 
1965  /* Join parallel scan */
1966  indexInfo = BuildIndexInfo(btspool->index);
1967  indexInfo->ii_Concurrent = btshared->isconcurrent;
1968  scan = table_beginscan_parallel(btspool->heap,
1969  ParallelTableScanFromBTShared(btshared));
1970  reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1971  true, progress, _bt_build_callback,
1972  (void *) &buildstate, scan);
1973 
1974  /*
1975  * Execute this worker's part of the sort.
1976  *
1977  * Unlike the leader and serial cases, we cannot avoid calling
1978  * tuplesort_performsort() for spool2 if it ends up containing no dead
1979  * tuples (this is disallowed for workers by tuplesort).
1980  */
1981  tuplesort_performsort(btspool->sortstate);
1982  if (btspool2)
1983  tuplesort_performsort(btspool2->sortstate);
1984 
1985  /*
1986  * Done. Record ambuild statistics, and whether we encountered a broken
1987  * HOT chain.
1988  */
1989  SpinLockAcquire(&btshared->mutex);
1990  btshared->nparticipantsdone++;
1991  btshared->reltuples += reltuples;
1992  if (buildstate.havedead)
1993  btshared->havedead = true;
1994  btshared->indtuples += buildstate.indtuples;
1995  if (indexInfo->ii_BrokenHotChain)
1996  btshared->brokenhotchain = true;
1997  SpinLockRelease(&btshared->mutex);
1998 
1999  /* Notify leader */
2000  ConditionVariableSignal(&btshared->workersdonecv);
2001 
2002  /* We can end tuplesorts immediately */
2003  tuplesort_end(btspool->sortstate);
2004  if (btspool2)
2005  tuplesort_end(btspool2->sortstate);
2006 }