PostgreSQL Source Code  git master
nbtsort.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  * Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level. We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it to its
12  * parent level (starting a new parent level if necessary). When
13  * done, we write out each final page on each level, adding it to
14  * its parent level. When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree). The steady-state load
20  * factor for btrees is usually estimated at 70%. We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%. This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * Formerly the index pages being built were kept in shared buffers, but
27  * that is of no value (since other backends have no interest in them yet)
28  * and it created locking problems for CHECKPOINT, because the upper-level
29  * pages were held exclusive-locked for long periods. Now we just build
30  * the pages in local memory and smgrwrite or smgrextend them as we finish
31  * them. They will need to be re-read into shared buffers on first use after
32  * the build finishes.
33  *
34  * This code isn't concerned about the FSM at all. The caller is responsible
35  * for initializing that.
36  *
37  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
38  * Portions Copyright (c) 1994, Regents of the University of California
39  *
40  * IDENTIFICATION
41  * src/backend/access/nbtree/nbtsort.c
42  *
43  *-------------------------------------------------------------------------
44  */
45 
46 #include "postgres.h"
47 
48 #include "access/nbtree.h"
49 #include "access/parallel.h"
50 #include "access/relscan.h"
51 #include "access/table.h"
52 #include "access/xact.h"
53 #include "access/xlog.h"
54 #include "access/xloginsert.h"
55 #include "catalog/index.h"
56 #include "commands/progress.h"
57 #include "executor/instrument.h"
58 #include "miscadmin.h"
59 #include "pgstat.h"
60 #include "storage/smgr.h"
61 #include "tcop/tcopprot.h" /* pgrminclude ignore */
62 #include "utils/rel.h"
63 #include "utils/sortsupport.h"
64 #include "utils/tuplesort.h"
65 
66 
67 /* Magic numbers for parallel state sharing */
68 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
69 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
70 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
71 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
72 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
73 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
74 
75 /*
76  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
77  * parallel index builds. This may be useful as a debugging aid.
78 #undef DISABLE_LEADER_PARTICIPATION
79  */
80 
81 /*
82  * Status record for spooling/sorting phase. (Note we may have two of
83  * these due to the special requirements for uniqueness-checking with
84  * dead tuples.)
85  */
86 typedef struct BTSpool
87 {
88  Tuplesortstate *sortstate; /* state data for tuplesort.c */
91  bool isunique;
94 
95 /*
96  * Status for index builds performed in parallel. This is allocated in a
97  * dynamic shared memory segment. Note that there is a separate tuplesort TOC
98  * entry, private to tuplesort.c but allocated by this module on its behalf.
99  */
100 typedef struct BTShared
101 {
102  /*
103  * These fields are not modified during the sort. They primarily exist
104  * for the benefit of worker processes that need to create BTSpool state
105  * corresponding to that used by the leader.
106  */
109  bool isunique;
113 
114  /*
115  * workersdonecv is used to monitor the progress of workers. All parallel
116  * participants must indicate that they are done before leader can use
117  * mutable state that workers maintain during scan (and before leader can
118  * proceed to tuplesort_performsort()).
119  */
121 
122  /*
123  * mutex protects all fields before heapdesc.
124  *
125  * These fields contain status information of interest to B-Tree index
126  * builds that must work just the same when an index is built in parallel.
127  */
129 
130  /*
131  * Mutable state that is maintained by workers, and reported back to
132  * leader at end of parallel scan.
133  *
134  * nparticipantsdone is number of worker processes finished.
135  *
136  * reltuples is the total number of input heap tuples.
137  *
138  * havedead indicates if RECENTLY_DEAD tuples were encountered during
139  * build.
140  *
141  * indtuples is the total number of tuples that made it into the index.
142  *
143  * brokenhotchain indicates if any worker detected a broken HOT chain
144  * during build.
145  */
147  double reltuples;
148  bool havedead;
149  double indtuples;
151 
152  /*
153  * ParallelTableScanDescData data follows. Can't directly embed here, as
154  * implementations of the parallel table scan desc interface might need
155  * stronger alignment.
156  */
158 
159 /*
160  * Return pointer to a BTShared's parallel table scan.
161  *
162  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
163  * MAXALIGN.
164  */
165 #define ParallelTableScanFromBTShared(shared) \
166  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
167 
168 /*
169  * Status for leader in parallel index build.
170  */
171 typedef struct BTLeader
172 {
173  /* parallel context itself */
175 
176  /*
177  * nparticipanttuplesorts is the exact number of worker processes
178  * successfully launched, plus one leader process if it participates as a
179  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
180  * participating as a worker).
181  */
183 
184  /*
185  * Leader process convenience pointers to shared state (leader avoids TOC
186  * lookups).
187  *
188  * btshared is the shared state for entire build. sharedsort is the
189  * shared, tuplesort-managed state passed to each process tuplesort.
190  * sharedsort2 is the corresponding btspool2 shared state, used only when
191  * building unique indexes. snapshot is the snapshot used by the scan iff
192  * an MVCC snapshot is required.
193  */
201 
202 /*
203  * Working state for btbuild and its callback.
204  *
205  * When parallel CREATE INDEX is used, there is a BTBuildState for each
206  * participant.
207  */
208 typedef struct BTBuildState
209 {
210  bool isunique;
212  bool havedead;
215 
216  /*
217  * spool2 is needed only when the index is a unique index. Dead tuples are
218  * put into spool2 instead of spool in order to avoid uniqueness check.
219  */
221  double indtuples;
222 
223  /*
224  * btleader is only present when a parallel index build is performed, and
225  * only in the leader process. (Actually, only the leader has a
226  * BTBuildState. Workers have their own spool and spool2, though.)
227  */
230 
231 /*
232  * Status record for a btree page being built. We have one of these
233  * for each active tree level.
234  */
235 typedef struct BTPageState
236 {
237  Page btps_page; /* workspace for page building */
238  BlockNumber btps_blkno; /* block # to write this page at */
239  IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
240  OffsetNumber btps_lastoff; /* last item offset loaded */
241  Size btps_lastextra; /* last item's extra posting list space */
242  uint32 btps_level; /* tree level (0 = leaf) */
243  Size btps_full; /* "full" if less than this much free space */
244  struct BTPageState *btps_next; /* link to parent level, if any */
246 
247 /*
248  * Overall status record for index writing phase.
249  */
250 typedef struct BTWriteState
251 {
254  BTScanInsert inskey; /* generic insertion scankey */
255  bool btws_use_wal; /* dump pages to WAL? */
256  BlockNumber btws_pages_alloced; /* # pages allocated */
257  BlockNumber btws_pages_written; /* # pages written out */
258  Page btws_zeropage; /* workspace for filling zeroes */
260 
261 
262 static double _bt_spools_heapscan(Relation heap, Relation index,
263  BTBuildState *buildstate, IndexInfo *indexInfo);
264 static void _bt_spooldestroy(BTSpool *btspool);
265 static void _bt_spool(BTSpool *btspool, ItemPointer self,
266  Datum *values, bool *isnull);
267 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
269  bool *isnull, bool tupleIsAlive, void *state);
270 static Page _bt_blnewpage(uint32 level);
271 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
272 static void _bt_slideleft(Page rightmostpage);
273 static void _bt_sortaddtup(Page page, Size itemsize,
274  IndexTuple itup, OffsetNumber itup_off,
275  bool newfirstdataitem);
276 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
277  IndexTuple itup, Size truncextra);
278 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
280  BTDedupState dstate);
281 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
282 static void _bt_load(BTWriteState *wstate,
283  BTSpool *btspool, BTSpool *btspool2);
284 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
285  int request);
286 static void _bt_end_parallel(BTLeader *btleader);
287 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
288 static double _bt_parallel_heapscan(BTBuildState *buildstate,
289  bool *brokenhotchain);
290 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
291 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
292  BTShared *btshared, Sharedsort *sharedsort,
293  Sharedsort *sharedsort2, int sortmem,
294  bool progress);
295 
296 
297 /*
298  * btbuild() -- build a new btree index.
299  */
302 {
303  IndexBuildResult *result;
304  BTBuildState buildstate;
305  double reltuples;
306 
307 #ifdef BTREE_BUILD_STATS
309  ResetUsage();
310 #endif /* BTREE_BUILD_STATS */
311 
312  buildstate.isunique = indexInfo->ii_Unique;
313  buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
314  buildstate.havedead = false;
315  buildstate.heap = heap;
316  buildstate.spool = NULL;
317  buildstate.spool2 = NULL;
318  buildstate.indtuples = 0;
319  buildstate.btleader = NULL;
320 
321  /*
322  * We expect to be called exactly once for any index relation. If that's
323  * not the case, big trouble's what we have.
324  */
326  elog(ERROR, "index \"%s\" already contains data",
328 
329  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
330 
331  /*
332  * Finish the build by (1) completing the sort of the spool file, (2)
333  * inserting the sorted tuples into btree pages and (3) building the upper
334  * levels. Finally, it may also be necessary to end use of parallelism.
335  */
336  _bt_leafbuild(buildstate.spool, buildstate.spool2);
337  _bt_spooldestroy(buildstate.spool);
338  if (buildstate.spool2)
339  _bt_spooldestroy(buildstate.spool2);
340  if (buildstate.btleader)
341  _bt_end_parallel(buildstate.btleader);
342 
343  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
344 
345  result->heap_tuples = reltuples;
346  result->index_tuples = buildstate.indtuples;
347 
348 #ifdef BTREE_BUILD_STATS
350  {
351  ShowUsage("BTREE BUILD STATS");
352  ResetUsage();
353  }
354 #endif /* BTREE_BUILD_STATS */
355 
356  return result;
357 }
358 
359 /*
360  * Create and initialize one or two spool structures, and save them in caller's
361  * buildstate argument. May also fill-in fields within indexInfo used by index
362  * builds.
363  *
364  * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
365  * routine encapsulates all aspects of managing parallelism. Caller need only
366  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
367  *
368  * Returns the total number of heap tuples scanned.
369  */
370 static double
372  IndexInfo *indexInfo)
373 {
374  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
375  SortCoordinate coordinate = NULL;
376  double reltuples = 0;
377 
378  /*
379  * We size the sort area as maintenance_work_mem rather than work_mem to
380  * speed index creation. This should be OK since a single backend can't
381  * run multiple index creations in parallel (see also: notes on
382  * parallelism and maintenance_work_mem below).
383  */
384  btspool->heap = heap;
385  btspool->index = index;
386  btspool->isunique = indexInfo->ii_Unique;
387  btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
388 
389  /* Save as primary spool */
390  buildstate->spool = btspool;
391 
392  /* Report table scan phase started */
395 
396  /* Attempt to launch parallel worker scan when required */
397  if (indexInfo->ii_ParallelWorkers > 0)
398  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
399  indexInfo->ii_ParallelWorkers);
400 
401  /*
402  * If parallel build requested and at least one worker process was
403  * successfully launched, set up coordination state
404  */
405  if (buildstate->btleader)
406  {
407  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
408  coordinate->isWorker = false;
409  coordinate->nParticipants =
410  buildstate->btleader->nparticipanttuplesorts;
411  coordinate->sharedsort = buildstate->btleader->sharedsort;
412  }
413 
414  /*
415  * Begin serial/leader tuplesort.
416  *
417  * In cases where parallelism is involved, the leader receives the same
418  * share of maintenance_work_mem as a serial sort (it is generally treated
419  * in the same way as a serial sort once we return). Parallel worker
420  * Tuplesortstates will have received only a fraction of
421  * maintenance_work_mem, though.
422  *
423  * We rely on the lifetime of the Leader Tuplesortstate almost not
424  * overlapping with any worker Tuplesortstate's lifetime. There may be
425  * some small overlap, but that's okay because we rely on leader
426  * Tuplesortstate only allocating a small, fixed amount of memory here.
427  * When its tuplesort_performsort() is called (by our caller), and
428  * significant amounts of memory are likely to be used, all workers must
429  * have already freed almost all memory held by their Tuplesortstates
430  * (they are about to go away completely, too). The overall effect is
431  * that maintenance_work_mem always represents an absolute high watermark
432  * on the amount of memory used by a CREATE INDEX operation, regardless of
433  * the use of parallelism or any other factor.
434  */
435  buildstate->spool->sortstate =
436  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
437  buildstate->nulls_not_distinct,
438  maintenance_work_mem, coordinate,
440 
441  /*
442  * If building a unique index, put dead tuples in a second spool to keep
443  * them out of the uniqueness check. We expect that the second spool (for
444  * dead tuples) won't get very full, so we give it only work_mem.
445  */
446  if (indexInfo->ii_Unique)
447  {
448  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
449  SortCoordinate coordinate2 = NULL;
450 
451  /* Initialize secondary spool */
452  btspool2->heap = heap;
453  btspool2->index = index;
454  btspool2->isunique = false;
455  /* Save as secondary spool */
456  buildstate->spool2 = btspool2;
457 
458  if (buildstate->btleader)
459  {
460  /*
461  * Set up non-private state that is passed to
462  * tuplesort_begin_index_btree() about the basic high level
463  * coordination of a parallel sort.
464  */
465  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
466  coordinate2->isWorker = false;
467  coordinate2->nParticipants =
468  buildstate->btleader->nparticipanttuplesorts;
469  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
470  }
471 
472  /*
473  * We expect that the second one (for dead tuples) won't get very
474  * full, so we give it only work_mem
475  */
476  buildstate->spool2->sortstate =
477  tuplesort_begin_index_btree(heap, index, false, false, work_mem,
478  coordinate2, TUPLESORT_NONE);
479  }
480 
481  /* Fill spool using either serial or parallel heap scan */
482  if (!buildstate->btleader)
483  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
484  _bt_build_callback, (void *) buildstate,
485  NULL);
486  else
487  reltuples = _bt_parallel_heapscan(buildstate,
488  &indexInfo->ii_BrokenHotChain);
489 
490  /*
491  * Set the progress target for the next phase. Reset the block number
492  * values set by table_index_build_scan
493  */
494  {
495  const int progress_index[] = {
499  };
500  const int64 progress_vals[] = {
501  buildstate->indtuples,
502  0, 0
503  };
504 
505  pgstat_progress_update_multi_param(3, progress_index, progress_vals);
506  }
507 
508  /* okay, all heap tuples are spooled */
509  if (buildstate->spool2 && !buildstate->havedead)
510  {
511  /* spool2 turns out to be unnecessary */
512  _bt_spooldestroy(buildstate->spool2);
513  buildstate->spool2 = NULL;
514  }
515 
516  return reltuples;
517 }
518 
519 /*
520  * clean up a spool structure and its substructures.
521  */
522 static void
524 {
525  tuplesort_end(btspool->sortstate);
526  pfree(btspool);
527 }
528 
529 /*
530  * spool an index entry into the sort file.
531  */
532 static void
533 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
534 {
535  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
536  self, values, isnull);
537 }
538 
539 /*
540  * given a spool loaded by successive calls to _bt_spool,
541  * create an entire btree.
542  */
543 static void
544 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
545 {
546  BTWriteState wstate;
547 
548 #ifdef BTREE_BUILD_STATS
550  {
551  ShowUsage("BTREE BUILD (Spool) STATISTICS");
552  ResetUsage();
553  }
554 #endif /* BTREE_BUILD_STATS */
555 
556  /* Execute the sort */
560  if (btspool2)
561  {
564  tuplesort_performsort(btspool2->sortstate);
565  }
566 
567  wstate.heap = btspool->heap;
568  wstate.index = btspool->index;
569  wstate.inskey = _bt_mkscankey(wstate.index, NULL);
570  /* _bt_mkscankey() won't set allequalimage without metapage */
571  wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
572  wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
573 
574  /* reserve the metapage */
575  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
576  wstate.btws_pages_written = 0;
577  wstate.btws_zeropage = NULL; /* until needed */
578 
581  _bt_load(&wstate, btspool, btspool2);
582 }
583 
584 /*
585  * Per-tuple callback for table_index_build_scan
586  */
587 static void
589  ItemPointer tid,
590  Datum *values,
591  bool *isnull,
592  bool tupleIsAlive,
593  void *state)
594 {
595  BTBuildState *buildstate = (BTBuildState *) state;
596 
597  /*
598  * insert the index tuple into the appropriate spool file for subsequent
599  * processing
600  */
601  if (tupleIsAlive || buildstate->spool2 == NULL)
602  _bt_spool(buildstate->spool, tid, values, isnull);
603  else
604  {
605  /* dead tuples are put into spool2 */
606  buildstate->havedead = true;
607  _bt_spool(buildstate->spool2, tid, values, isnull);
608  }
609 
610  buildstate->indtuples += 1;
611 }
612 
613 /*
614  * allocate workspace for a new, clean btree page, not linked to any siblings.
615  */
616 static Page
618 {
619  Page page;
620  BTPageOpaque opaque;
621 
622  page = (Page) palloc(BLCKSZ);
623 
624  /* Zero the page and set up standard page header info */
625  _bt_pageinit(page, BLCKSZ);
626 
627  /* Initialize BT opaque state */
628  opaque = BTPageGetOpaque(page);
629  opaque->btpo_prev = opaque->btpo_next = P_NONE;
630  opaque->btpo_level = level;
631  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
632  opaque->btpo_cycleid = 0;
633 
634  /* Make the P_HIKEY line pointer appear allocated */
635  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
636 
637  return page;
638 }
639 
640 /*
641  * emit a completed btree page, and release the working storage.
642  */
643 static void
645 {
646  /* XLOG stuff */
647  if (wstate->btws_use_wal)
648  {
649  /* We use the XLOG_FPI record type for this */
650  log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
651  }
652 
653  /*
654  * If we have to write pages nonsequentially, fill in the space with
655  * zeroes until we come back and overwrite. This is not logically
656  * necessary on standard Unix filesystems (unwritten space will read as
657  * zeroes anyway), but it should help to avoid fragmentation. The dummy
658  * pages aren't WAL-logged though.
659  */
660  while (blkno > wstate->btws_pages_written)
661  {
662  if (!wstate->btws_zeropage)
663  wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
664  /* don't set checksum for all-zero page */
666  wstate->btws_pages_written++,
667  (char *) wstate->btws_zeropage,
668  true);
669  }
670 
671  PageSetChecksumInplace(page, blkno);
672 
673  /*
674  * Now write the page. There's no need for smgr to schedule an fsync for
675  * this write; we'll do it ourselves before ending the build.
676  */
677  if (blkno == wstate->btws_pages_written)
678  {
679  /* extending the file... */
680  smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
681  (char *) page, true);
682  wstate->btws_pages_written++;
683  }
684  else
685  {
686  /* overwriting a block we zero-filled before */
687  smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
688  (char *) page, true);
689  }
690 
691  pfree(page);
692 }
693 
694 /*
695  * allocate and initialize a new BTPageState. the returned structure
696  * is suitable for immediate use by _bt_buildadd.
697  */
698 static BTPageState *
700 {
702 
703  /* create initial page for level */
704  state->btps_page = _bt_blnewpage(level);
705 
706  /* and assign it a page position */
707  state->btps_blkno = wstate->btws_pages_alloced++;
708 
709  state->btps_lowkey = NULL;
710  /* initialize lastoff so first item goes into P_FIRSTKEY */
711  state->btps_lastoff = P_HIKEY;
712  state->btps_lastextra = 0;
713  state->btps_level = level;
714  /* set "full" threshold based on level. See notes at head of file. */
715  if (level > 0)
716  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
717  else
718  state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
719 
720  /* no parent level, yet */
721  state->btps_next = NULL;
722 
723  return state;
724 }
725 
726 /*
727  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
728  * P_HIKEY, overwriting P_HIKEY).
729  *
730  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
731  * rightmost page on its level is not supposed to get a high key. Now that
732  * it's clear that this page is a rightmost page, remove the unneeded empty
733  * P_HIKEY line pointer space.
734  */
735 static void
736 _bt_slideleft(Page rightmostpage)
737 {
738  OffsetNumber off;
739  OffsetNumber maxoff;
740  ItemId previi;
741 
742  maxoff = PageGetMaxOffsetNumber(rightmostpage);
743  Assert(maxoff >= P_FIRSTKEY);
744  previi = PageGetItemId(rightmostpage, P_HIKEY);
745  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
746  {
747  ItemId thisii = PageGetItemId(rightmostpage, off);
748 
749  *previi = *thisii;
750  previi = thisii;
751  }
752  ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
753 }
754 
755 /*
756  * Add an item to a page being built.
757  *
758  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
759  * raises an error directly.
760  *
761  * Note that our nbtsort.c caller does not know yet if the page will be
762  * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
763  * caller. Page that turns out to be the rightmost on its level is fixed by
764  * calling _bt_slideleft().
765  */
766 static void
768  Size itemsize,
769  IndexTuple itup,
770  OffsetNumber itup_off,
771  bool newfirstdataitem)
772 {
773  IndexTupleData trunctuple;
774 
775  if (newfirstdataitem)
776  {
777  trunctuple = *itup;
778  trunctuple.t_info = sizeof(IndexTupleData);
779  BTreeTupleSetNAtts(&trunctuple, 0, false);
780  itup = &trunctuple;
781  itemsize = sizeof(IndexTupleData);
782  }
783 
784  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
785  false, false) == InvalidOffsetNumber)
786  elog(ERROR, "failed to add item to the index page");
787 }
788 
789 /*----------
790  * Add an item to a disk page from the sort output (or add a posting list
791  * item formed from the sort output).
792  *
793  * We must be careful to observe the page layout conventions of nbtsearch.c:
794  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
795  * - on non-leaf pages, the key portion of the first item need not be
796  * stored, we should store only the link.
797  *
798  * A leaf page being built looks like:
799  *
800  * +----------------+---------------------------------+
801  * | PageHeaderData | linp0 linp1 linp2 ... |
802  * +-----------+----+---------------------------------+
803  * | ... linpN | |
804  * +-----------+--------------------------------------+
805  * | ^ last |
806  * | |
807  * +-------------+------------------------------------+
808  * | | itemN ... |
809  * +-------------+------------------+-----------------+
810  * | ... item3 item2 item1 | "special space" |
811  * +--------------------------------+-----------------+
812  *
813  * Contrast this with the diagram in bufpage.h; note the mismatch
814  * between linps and items. This is because we reserve linp0 as a
815  * placeholder for the pointer to the "high key" item; when we have
816  * filled up the page, we will set linp0 to point to itemN and clear
817  * linpN. On the other hand, if we find this is the last (rightmost)
818  * page, we leave the items alone and slide the linp array over. If
819  * the high key is to be truncated, offset 1 is deleted, and we insert
820  * the truncated high key at offset 1.
821  *
822  * 'last' pointer indicates the last offset added to the page.
823  *
824  * 'truncextra' is the size of the posting list in itup, if any. This
825  * information is stashed for the next call here, when we may benefit
826  * from considering the impact of truncating away the posting list on
827  * the page before deciding to finish the page off. Posting lists are
828  * often relatively large, so it is worth going to the trouble of
829  * accounting for the saving from truncating away the posting list of
830  * the tuple that becomes the high key (that may be the only way to
831  * get close to target free space on the page). Note that this is
832  * only used for the soft fillfactor-wise limit, not the critical hard
833  * limit.
834  *----------
835  */
836 static void
838  Size truncextra)
839 {
840  Page npage;
841  BlockNumber nblkno;
842  OffsetNumber last_off;
843  Size last_truncextra;
844  Size pgspc;
845  Size itupsz;
846  bool isleaf;
847 
848  /*
849  * This is a handy place to check for cancel interrupts during the btree
850  * load phase of index creation.
851  */
853 
854  npage = state->btps_page;
855  nblkno = state->btps_blkno;
856  last_off = state->btps_lastoff;
857  last_truncextra = state->btps_lastextra;
858  state->btps_lastextra = truncextra;
859 
860  pgspc = PageGetFreeSpace(npage);
861  itupsz = IndexTupleSize(itup);
862  itupsz = MAXALIGN(itupsz);
863  /* Leaf case has slightly different rules due to suffix truncation */
864  isleaf = (state->btps_level == 0);
865 
866  /*
867  * Check whether the new item can fit on a btree page on current level at
868  * all.
869  *
870  * Every newly built index will treat heap TID as part of the keyspace,
871  * which imposes the requirement that new high keys must occasionally have
872  * a heap TID appended within _bt_truncate(). That may leave a new pivot
873  * tuple one or two MAXALIGN() quantums larger than the original
874  * firstright tuple it's derived from. v4 deals with the problem by
875  * decreasing the limit on the size of tuples inserted on the leaf level
876  * by the same small amount. Enforce the new v4+ limit on the leaf level,
877  * and the old limit on internal levels, since pivot tuples may need to
878  * make use of the reserved space. This should never fail on internal
879  * pages.
880  */
881  if (unlikely(itupsz > BTMaxItemSize(npage)))
882  _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
883  itup);
884 
885  /*
886  * Check to see if current page will fit new item, with space left over to
887  * append a heap TID during suffix truncation when page is a leaf page.
888  *
889  * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
890  * high key with heap TID when finishing off a leaf page, since we rely on
891  * _bt_check_third_page() rejecting oversized non-pivot tuples. On
892  * internal pages we can always fit 3 pivot tuples with larger internal
893  * page tuple limit (includes page high key).
894  *
895  * Most of the time, a page is only "full" in the sense that the soft
896  * fillfactor-wise limit has been exceeded. However, we must always leave
897  * at least two items plus a high key on each page before starting a new
898  * page. Disregard fillfactor and insert on "full" current page if we
899  * don't have the minimum number of items yet. (Note that we deliberately
900  * assume that suffix truncation neither enlarges nor shrinks new high key
901  * when applying soft limit, except when last tuple has a posting list.)
902  */
903  Assert(last_truncextra == 0 || isleaf);
904  if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
905  (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
906  {
907  /*
908  * Finish off the page and write it out.
909  */
910  Page opage = npage;
911  BlockNumber oblkno = nblkno;
912  ItemId ii;
913  ItemId hii;
914  IndexTuple oitup;
915 
916  /* Create new page of same level */
917  npage = _bt_blnewpage(state->btps_level);
918 
919  /* and assign it a page position */
920  nblkno = wstate->btws_pages_alloced++;
921 
922  /*
923  * We copy the last item on the page into the new page, and then
924  * rearrange the old page so that the 'last item' becomes its high key
925  * rather than a true data item. There had better be at least two
926  * items on the page already, else the page would be empty of useful
927  * data.
928  */
929  Assert(last_off > P_FIRSTKEY);
930  ii = PageGetItemId(opage, last_off);
931  oitup = (IndexTuple) PageGetItem(opage, ii);
932  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
933  !isleaf);
934 
935  /*
936  * Move 'last' into the high key position on opage. _bt_blnewpage()
937  * allocated empty space for a line pointer when opage was first
938  * created, so this is a matter of rearranging already-allocated space
939  * on page, and initializing high key line pointer. (Actually, leaf
940  * pages must also swap oitup with a truncated version of oitup, which
941  * is sometimes larger than oitup, though never by more than the space
942  * needed to append a heap TID.)
943  */
944  hii = PageGetItemId(opage, P_HIKEY);
945  *hii = *ii;
946  ItemIdSetUnused(ii); /* redundant */
947  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
948 
949  if (isleaf)
950  {
951  IndexTuple lastleft;
952  IndexTuple truncated;
953 
954  /*
955  * Truncate away any unneeded attributes from high key on leaf
956  * level. This is only done at the leaf level because downlinks
957  * in internal pages are either negative infinity items, or get
958  * their contents from copying from one level down. See also:
959  * _bt_split().
960  *
961  * We don't try to bias our choice of split point to make it more
962  * likely that _bt_truncate() can truncate away more attributes,
963  * whereas the split point used within _bt_split() is chosen much
964  * more delicately. Even still, the lastleft and firstright
965  * tuples passed to _bt_truncate() here are at least not fully
966  * equal to each other when deduplication is used, unless there is
967  * a large group of duplicates (also, unique index builds usually
968  * have few or no spool2 duplicates). When the split point is
969  * between two unequal tuples, _bt_truncate() will avoid including
970  * a heap TID in the new high key, which is the most important
971  * benefit of suffix truncation.
972  *
973  * Overwrite the old item with new truncated high key directly.
974  * oitup is already located at the physical beginning of tuple
975  * space, so this should directly reuse the existing tuple space.
976  */
977  ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
978  lastleft = (IndexTuple) PageGetItem(opage, ii);
979 
980  Assert(IndexTupleSize(oitup) > last_truncextra);
981  truncated = _bt_truncate(wstate->index, lastleft, oitup,
982  wstate->inskey);
983  if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
984  IndexTupleSize(truncated)))
985  elog(ERROR, "failed to add high key to the index page");
986  pfree(truncated);
987 
988  /* oitup should continue to point to the page's high key */
989  hii = PageGetItemId(opage, P_HIKEY);
990  oitup = (IndexTuple) PageGetItem(opage, hii);
991  }
992 
993  /*
994  * Link the old page into its parent, using its low key. If we don't
995  * have a parent, we have to create one; this adds a new btree level.
996  */
997  if (state->btps_next == NULL)
998  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
999 
1000  Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
1002  BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
1003  P_LEFTMOST(BTPageGetOpaque(opage)));
1004  Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1005  !P_LEFTMOST(BTPageGetOpaque(opage)));
1006  BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1007  _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1008  pfree(state->btps_lowkey);
1009 
1010  /*
1011  * Save a copy of the high key from the old page. It is also the low
1012  * key for the new page.
1013  */
1014  state->btps_lowkey = CopyIndexTuple(oitup);
1015 
1016  /*
1017  * Set the sibling links for both pages.
1018  */
1019  {
1020  BTPageOpaque oopaque = BTPageGetOpaque(opage);
1021  BTPageOpaque nopaque = BTPageGetOpaque(npage);
1022 
1023  oopaque->btpo_next = nblkno;
1024  nopaque->btpo_prev = oblkno;
1025  nopaque->btpo_next = P_NONE; /* redundant */
1026  }
1027 
1028  /*
1029  * Write out the old page. We never need to touch it again, so we can
1030  * free the opage workspace too.
1031  */
1032  _bt_blwritepage(wstate, opage, oblkno);
1033 
1034  /*
1035  * Reset last_off to point to new page
1036  */
1037  last_off = P_FIRSTKEY;
1038  }
1039 
1040  /*
1041  * By here, either original page is still the current page, or a new page
1042  * was created that became the current page. Either way, the current page
1043  * definitely has space for new item.
1044  *
1045  * If the new item is the first for its page, it must also be the first
1046  * item on its entire level. On later same-level pages, a low key for a
1047  * page will be copied from the prior page in the code above. Generate a
1048  * minus infinity low key here instead.
1049  */
1050  if (last_off == P_HIKEY)
1051  {
1052  Assert(state->btps_lowkey == NULL);
1053  state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1054  state->btps_lowkey->t_info = sizeof(IndexTupleData);
1055  BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1056  }
1057 
1058  /*
1059  * Add the new item into the current page.
1060  */
1061  last_off = OffsetNumberNext(last_off);
1062  _bt_sortaddtup(npage, itupsz, itup, last_off,
1063  !isleaf && last_off == P_FIRSTKEY);
1064 
1065  state->btps_page = npage;
1066  state->btps_blkno = nblkno;
1067  state->btps_lastoff = last_off;
1068 }
1069 
1070 /*
1071  * Finalize pending posting list tuple, and add it to the index. Final tuple
1072  * is based on saved base tuple, and saved list of heap TIDs.
1073  *
1074  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1075  * using _bt_buildadd().
1076  */
1077 static void
1079  BTDedupState dstate)
1080 {
1081  Assert(dstate->nitems > 0);
1082 
1083  if (dstate->nitems == 1)
1084  _bt_buildadd(wstate, state, dstate->base, 0);
1085  else
1086  {
1087  IndexTuple postingtuple;
1088  Size truncextra;
1089 
1090  /* form a tuple with a posting list */
1091  postingtuple = _bt_form_posting(dstate->base,
1092  dstate->htids,
1093  dstate->nhtids);
1094  /* Calculate posting list overhead */
1095  truncextra = IndexTupleSize(postingtuple) -
1096  BTreeTupleGetPostingOffset(postingtuple);
1097 
1098  _bt_buildadd(wstate, state, postingtuple, truncextra);
1099  pfree(postingtuple);
1100  }
1101 
1102  dstate->nmaxitems = 0;
1103  dstate->nhtids = 0;
1104  dstate->nitems = 0;
1105  dstate->phystupsize = 0;
1106 }
1107 
1108 /*
1109  * Finish writing out the completed btree.
1110  */
1111 static void
1113 {
1114  BTPageState *s;
1115  BlockNumber rootblkno = P_NONE;
1116  uint32 rootlevel = 0;
1117  Page metapage;
1118 
1119  /*
1120  * Each iteration of this loop completes one more level of the tree.
1121  */
1122  for (s = state; s != NULL; s = s->btps_next)
1123  {
1124  BlockNumber blkno;
1125  BTPageOpaque opaque;
1126 
1127  blkno = s->btps_blkno;
1128  opaque = BTPageGetOpaque(s->btps_page);
1129 
1130  /*
1131  * We have to link the last page on this level to somewhere.
1132  *
1133  * If we're at the top, it's the root, so attach it to the metapage.
1134  * Otherwise, add an entry for it to its parent using its low key.
1135  * This may cause the last page of the parent level to split, but
1136  * that's not a problem -- we haven't gotten to it yet.
1137  */
1138  if (s->btps_next == NULL)
1139  {
1140  opaque->btpo_flags |= BTP_ROOT;
1141  rootblkno = blkno;
1142  rootlevel = s->btps_level;
1143  }
1144  else
1145  {
1146  Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1148  BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1149  P_LEFTMOST(opaque));
1150  Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1151  !P_LEFTMOST(opaque));
1153  _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1154  pfree(s->btps_lowkey);
1155  s->btps_lowkey = NULL;
1156  }
1157 
1158  /*
1159  * This is the rightmost page, so the ItemId array needs to be slid
1160  * back one slot. Then we can dump out the page.
1161  */
1163  _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1164  s->btps_page = NULL; /* writepage freed the workspace */
1165  }
1166 
1167  /*
1168  * As the last step in the process, construct the metapage and make it
1169  * point to the new root (unless we had no data at all, in which case it's
1170  * set to point to "P_NONE"). This changes the index to the "valid" state
1171  * by filling in a valid magic number in the metapage.
1172  */
1173  metapage = (Page) palloc(BLCKSZ);
1174  _bt_initmetapage(metapage, rootblkno, rootlevel,
1175  wstate->inskey->allequalimage);
1176  _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1177 }
1178 
1179 /*
1180  * Read tuples in correct sort order from tuplesort, and load them into
1181  * btree leaves.
1182  */
1183 static void
1184 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1185 {
1186  BTPageState *state = NULL;
1187  bool merge = (btspool2 != NULL);
1188  IndexTuple itup,
1189  itup2 = NULL;
1190  bool load1;
1191  TupleDesc tupdes = RelationGetDescr(wstate->index);
1192  int i,
1194  SortSupport sortKeys;
1195  int64 tuples_done = 0;
1196  bool deduplicate;
1197 
1198  deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1199  BTGetDeduplicateItems(wstate->index);
1200 
1201  if (merge)
1202  {
1203  /*
1204  * Another BTSpool for dead tuples exists. Now we have to merge
1205  * btspool and btspool2.
1206  */
1207 
1208  /* the preparation of merge */
1209  itup = tuplesort_getindextuple(btspool->sortstate, true);
1210  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1211 
1212  /* Prepare SortSupport data for each column */
1213  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1214 
1215  for (i = 0; i < keysz; i++)
1216  {
1217  SortSupport sortKey = sortKeys + i;
1218  ScanKey scanKey = wstate->inskey->scankeys + i;
1219  int16 strategy;
1220 
1221  sortKey->ssup_cxt = CurrentMemoryContext;
1222  sortKey->ssup_collation = scanKey->sk_collation;
1223  sortKey->ssup_nulls_first =
1224  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1225  sortKey->ssup_attno = scanKey->sk_attno;
1226  /* Abbreviation is not supported here */
1227  sortKey->abbreviate = false;
1228 
1229  Assert(sortKey->ssup_attno != 0);
1230 
1231  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1233 
1234  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1235  }
1236 
1237  for (;;)
1238  {
1239  load1 = true; /* load BTSpool next ? */
1240  if (itup2 == NULL)
1241  {
1242  if (itup == NULL)
1243  break;
1244  }
1245  else if (itup != NULL)
1246  {
1247  int32 compare = 0;
1248 
1249  for (i = 1; i <= keysz; i++)
1250  {
1251  SortSupport entry;
1252  Datum attrDatum1,
1253  attrDatum2;
1254  bool isNull1,
1255  isNull2;
1256 
1257  entry = sortKeys + i - 1;
1258  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1259  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1260 
1261  compare = ApplySortComparator(attrDatum1, isNull1,
1262  attrDatum2, isNull2,
1263  entry);
1264  if (compare > 0)
1265  {
1266  load1 = false;
1267  break;
1268  }
1269  else if (compare < 0)
1270  break;
1271  }
1272 
1273  /*
1274  * If key values are equal, we sort on ItemPointer. This is
1275  * required for btree indexes, since heap TID is treated as an
1276  * implicit last key attribute in order to ensure that all
1277  * keys in the index are physically unique.
1278  */
1279  if (compare == 0)
1280  {
1281  compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1282  Assert(compare != 0);
1283  if (compare > 0)
1284  load1 = false;
1285  }
1286  }
1287  else
1288  load1 = false;
1289 
1290  /* When we see first tuple, create first index page */
1291  if (state == NULL)
1292  state = _bt_pagestate(wstate, 0);
1293 
1294  if (load1)
1295  {
1296  _bt_buildadd(wstate, state, itup, 0);
1297  itup = tuplesort_getindextuple(btspool->sortstate, true);
1298  }
1299  else
1300  {
1301  _bt_buildadd(wstate, state, itup2, 0);
1302  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1303  }
1304 
1305  /* Report progress */
1307  ++tuples_done);
1308  }
1309  pfree(sortKeys);
1310  }
1311  else if (deduplicate)
1312  {
1313  /* merge is unnecessary, deduplicate into posting lists */
1314  BTDedupState dstate;
1315 
1316  dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1317  dstate->deduplicate = true; /* unused */
1318  dstate->nmaxitems = 0; /* unused */
1319  dstate->maxpostingsize = 0; /* set later */
1320  /* Metadata about base tuple of current pending posting list */
1321  dstate->base = NULL;
1322  dstate->baseoff = InvalidOffsetNumber; /* unused */
1323  dstate->basetupsize = 0;
1324  /* Metadata about current pending posting list TIDs */
1325  dstate->htids = NULL;
1326  dstate->nhtids = 0;
1327  dstate->nitems = 0;
1328  dstate->phystupsize = 0; /* unused */
1329  dstate->nintervals = 0; /* unused */
1330 
1331  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1332  true)) != NULL)
1333  {
1334  /* When we see first tuple, create first index page */
1335  if (state == NULL)
1336  {
1337  state = _bt_pagestate(wstate, 0);
1338 
1339  /*
1340  * Limit size of posting list tuples to 1/10 space we want to
1341  * leave behind on the page, plus space for final item's line
1342  * pointer. This is equal to the space that we'd like to
1343  * leave behind on each leaf page when fillfactor is 90,
1344  * allowing us to get close to fillfactor% space utilization
1345  * when there happen to be a great many duplicates. (This
1346  * makes higher leaf fillfactor settings ineffective when
1347  * building indexes that have many duplicates, but packing
1348  * leaf pages full with few very large tuples doesn't seem
1349  * like a useful goal.)
1350  */
1351  dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1352  sizeof(ItemIdData);
1353  Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1354  dstate->maxpostingsize <= INDEX_SIZE_MASK);
1355  dstate->htids = palloc(dstate->maxpostingsize);
1356 
1357  /* start new pending posting list with itup copy */
1360  }
1361  else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1362  itup) > keysz &&
1363  _bt_dedup_save_htid(dstate, itup))
1364  {
1365  /*
1366  * Tuple is equal to base tuple of pending posting list. Heap
1367  * TID from itup has been saved in state.
1368  */
1369  }
1370  else
1371  {
1372  /*
1373  * Tuple is not equal to pending posting list tuple, or
1374  * _bt_dedup_save_htid() opted to not merge current item into
1375  * pending posting list.
1376  */
1377  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1378  pfree(dstate->base);
1379 
1380  /* start new pending posting list with itup copy */
1383  }
1384 
1385  /* Report progress */
1387  ++tuples_done);
1388  }
1389 
1390  if (state)
1391  {
1392  /*
1393  * Handle the last item (there must be a last item when the
1394  * tuplesort returned one or more tuples)
1395  */
1396  _bt_sort_dedup_finish_pending(wstate, state, dstate);
1397  pfree(dstate->base);
1398  pfree(dstate->htids);
1399  }
1400 
1401  pfree(dstate);
1402  }
1403  else
1404  {
1405  /* merging and deduplication are both unnecessary */
1406  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1407  true)) != NULL)
1408  {
1409  /* When we see first tuple, create first index page */
1410  if (state == NULL)
1411  state = _bt_pagestate(wstate, 0);
1412 
1413  _bt_buildadd(wstate, state, itup, 0);
1414 
1415  /* Report progress */
1417  ++tuples_done);
1418  }
1419  }
1420 
1421  /* Close down final pages and write the metapage */
1422  _bt_uppershutdown(wstate, state);
1423 
1424  /*
1425  * When we WAL-logged index pages, we must nonetheless fsync index files.
1426  * Since we're building outside shared buffers, a CHECKPOINT occurring
1427  * during the build has no way to flush the previously written data to
1428  * disk (indeed it won't know the index even exists). A crash later on
1429  * would replay WAL from the checkpoint, therefore it wouldn't replay our
1430  * earlier WAL entries. If we do not fsync those pages here, they might
1431  * still not be on disk when the crash occurs.
1432  */
1433  if (wstate->btws_use_wal)
1435 }
1436 
1437 /*
1438  * Create parallel context, and launch workers for leader.
1439  *
1440  * buildstate argument should be initialized (with the exception of the
1441  * tuplesort state in spools, which may later be created based on shared
1442  * state initially set up here).
1443  *
1444  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1445  *
1446  * request is the target number of parallel worker processes to launch.
1447  *
1448  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1449  * mode by passing it to _bt_end_parallel() at the very end of its index
1450  * build. If not even a single worker process can be launched, this is
1451  * never set, and caller should proceed with a serial index build.
1452  */
1453 static void
1454 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1455 {
1456  ParallelContext *pcxt;
1457  int scantuplesortstates;
1458  Snapshot snapshot;
1459  Size estbtshared;
1460  Size estsort;
1461  BTShared *btshared;
1462  Sharedsort *sharedsort;
1463  Sharedsort *sharedsort2;
1464  BTSpool *btspool = buildstate->spool;
1465  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1466  WalUsage *walusage;
1467  BufferUsage *bufferusage;
1468  bool leaderparticipates = true;
1469  int querylen;
1470 
1471 #ifdef DISABLE_LEADER_PARTICIPATION
1472  leaderparticipates = false;
1473 #endif
1474 
1475  /*
1476  * Enter parallel mode, and create context for parallel build of btree
1477  * index
1478  */
1480  Assert(request > 0);
1481  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1482  request);
1483 
1484  scantuplesortstates = leaderparticipates ? request + 1 : request;
1485 
1486  /*
1487  * Prepare for scan of the base relation. In a normal index build, we use
1488  * SnapshotAny because we must retrieve all tuples and do our own time
1489  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1490  * concurrent build, we take a regular MVCC snapshot and index whatever's
1491  * live according to that.
1492  */
1493  if (!isconcurrent)
1494  snapshot = SnapshotAny;
1495  else
1497 
1498  /*
1499  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1500  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1501  */
1502  estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1503  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1504  estsort = tuplesort_estimate_shared(scantuplesortstates);
1505  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1506 
1507  /*
1508  * Unique case requires a second spool, and so we may have to account for
1509  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1510  */
1511  if (!btspool->isunique)
1512  shm_toc_estimate_keys(&pcxt->estimator, 2);
1513  else
1514  {
1515  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1516  shm_toc_estimate_keys(&pcxt->estimator, 3);
1517  }
1518 
1519  /*
1520  * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1521  * and PARALLEL_KEY_BUFFER_USAGE.
1522  *
1523  * If there are no extensions loaded that care, we could skip this. We
1524  * have no way of knowing whether anyone's looking at pgWalUsage or
1525  * pgBufferUsage, so do it unconditionally.
1526  */
1528  mul_size(sizeof(WalUsage), pcxt->nworkers));
1529  shm_toc_estimate_keys(&pcxt->estimator, 1);
1531  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1532  shm_toc_estimate_keys(&pcxt->estimator, 1);
1533 
1534  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1535  if (debug_query_string)
1536  {
1537  querylen = strlen(debug_query_string);
1538  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1539  shm_toc_estimate_keys(&pcxt->estimator, 1);
1540  }
1541  else
1542  querylen = 0; /* keep compiler quiet */
1543 
1544  /* Everyone's had a chance to ask for space, so now create the DSM */
1545  InitializeParallelDSM(pcxt);
1546 
1547  /* If no DSM segment was available, back out (do serial build) */
1548  if (pcxt->seg == NULL)
1549  {
1550  if (IsMVCCSnapshot(snapshot))
1551  UnregisterSnapshot(snapshot);
1552  DestroyParallelContext(pcxt);
1553  ExitParallelMode();
1554  return;
1555  }
1556 
1557  /* Store shared build state, for which we reserved space */
1558  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1559  /* Initialize immutable state */
1560  btshared->heaprelid = RelationGetRelid(btspool->heap);
1561  btshared->indexrelid = RelationGetRelid(btspool->index);
1562  btshared->isunique = btspool->isunique;
1563  btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1564  btshared->isconcurrent = isconcurrent;
1565  btshared->scantuplesortstates = scantuplesortstates;
1567  SpinLockInit(&btshared->mutex);
1568  /* Initialize mutable state */
1569  btshared->nparticipantsdone = 0;
1570  btshared->reltuples = 0.0;
1571  btshared->havedead = false;
1572  btshared->indtuples = 0.0;
1573  btshared->brokenhotchain = false;
1576  snapshot);
1577 
1578  /*
1579  * Store shared tuplesort-private state, for which we reserved space.
1580  * Then, initialize opaque state using tuplesort routine.
1581  */
1582  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1583  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1584  pcxt->seg);
1585 
1586  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1587  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1588 
1589  /* Unique case requires a second spool, and associated shared state */
1590  if (!btspool->isunique)
1591  sharedsort2 = NULL;
1592  else
1593  {
1594  /*
1595  * Store additional shared tuplesort-private state, for which we
1596  * reserved space. Then, initialize opaque state using tuplesort
1597  * routine.
1598  */
1599  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1600  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1601  pcxt->seg);
1602 
1603  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1604  }
1605 
1606  /* Store query string for workers */
1607  if (debug_query_string)
1608  {
1609  char *sharedquery;
1610 
1611  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1612  memcpy(sharedquery, debug_query_string, querylen + 1);
1613  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1614  }
1615 
1616  /*
1617  * Allocate space for each worker's WalUsage and BufferUsage; no need to
1618  * initialize.
1619  */
1620  walusage = shm_toc_allocate(pcxt->toc,
1621  mul_size(sizeof(WalUsage), pcxt->nworkers));
1622  shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1623  bufferusage = shm_toc_allocate(pcxt->toc,
1624  mul_size(sizeof(BufferUsage), pcxt->nworkers));
1625  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1626 
1627  /* Launch workers, saving status for leader/caller */
1628  LaunchParallelWorkers(pcxt);
1629  btleader->pcxt = pcxt;
1630  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1631  if (leaderparticipates)
1632  btleader->nparticipanttuplesorts++;
1633  btleader->btshared = btshared;
1634  btleader->sharedsort = sharedsort;
1635  btleader->sharedsort2 = sharedsort2;
1636  btleader->snapshot = snapshot;
1637  btleader->walusage = walusage;
1638  btleader->bufferusage = bufferusage;
1639 
1640  /* If no workers were successfully launched, back out (do serial build) */
1641  if (pcxt->nworkers_launched == 0)
1642  {
1643  _bt_end_parallel(btleader);
1644  return;
1645  }
1646 
1647  /* Save leader state now that it's clear build will be parallel */
1648  buildstate->btleader = btleader;
1649 
1650  /* Join heap scan ourselves */
1651  if (leaderparticipates)
1653 
1654  /*
1655  * Caller needs to wait for all launched workers when we return. Make
1656  * sure that the failure-to-start case will not hang forever.
1657  */
1659 }
1660 
1661 /*
1662  * Shut down workers, destroy parallel context, and end parallel mode.
1663  */
1664 static void
1666 {
1667  int i;
1668 
1669  /* Shutdown worker processes */
1671 
1672  /*
1673  * Next, accumulate WAL usage. (This must wait for the workers to finish,
1674  * or we might get incomplete data.)
1675  */
1676  for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1677  InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1678 
1679  /* Free last reference to MVCC snapshot, if one was used */
1680  if (IsMVCCSnapshot(btleader->snapshot))
1681  UnregisterSnapshot(btleader->snapshot);
1682  DestroyParallelContext(btleader->pcxt);
1683  ExitParallelMode();
1684 }
1685 
1686 /*
1687  * Returns size of shared memory required to store state for a parallel
1688  * btree index build based on the snapshot its parallel scan will use.
1689  */
1690 static Size
1692 {
1693  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1694  return add_size(BUFFERALIGN(sizeof(BTShared)),
1695  table_parallelscan_estimate(heap, snapshot));
1696 }
1697 
1698 /*
1699  * Within leader, wait for end of heap scan.
1700  *
1701  * When called, parallel heap scan started by _bt_begin_parallel() will
1702  * already be underway within worker processes (when leader participates
1703  * as a worker, we should end up here just as workers are finishing).
1704  *
1705  * Fills in fields needed for ambuild statistics, and lets caller set
1706  * field indicating that some worker encountered a broken HOT chain.
1707  *
1708  * Returns the total number of heap tuples scanned.
1709  */
1710 static double
1711 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1712 {
1713  BTShared *btshared = buildstate->btleader->btshared;
1714  int nparticipanttuplesorts;
1715  double reltuples;
1716 
1717  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1718  for (;;)
1719  {
1720  SpinLockAcquire(&btshared->mutex);
1721  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1722  {
1723  buildstate->havedead = btshared->havedead;
1724  buildstate->indtuples = btshared->indtuples;
1725  *brokenhotchain = btshared->brokenhotchain;
1726  reltuples = btshared->reltuples;
1727  SpinLockRelease(&btshared->mutex);
1728  break;
1729  }
1730  SpinLockRelease(&btshared->mutex);
1731 
1734  }
1735 
1737 
1738  return reltuples;
1739 }
1740 
1741 /*
1742  * Within leader, participate as a parallel worker.
1743  */
1744 static void
1746 {
1747  BTLeader *btleader = buildstate->btleader;
1748  BTSpool *leaderworker;
1749  BTSpool *leaderworker2;
1750  int sortmem;
1751 
1752  /* Allocate memory and initialize private spool */
1753  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1754  leaderworker->heap = buildstate->spool->heap;
1755  leaderworker->index = buildstate->spool->index;
1756  leaderworker->isunique = buildstate->spool->isunique;
1757  leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1758 
1759  /* Initialize second spool, if required */
1760  if (!btleader->btshared->isunique)
1761  leaderworker2 = NULL;
1762  else
1763  {
1764  /* Allocate memory for worker's own private secondary spool */
1765  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1766 
1767  /* Initialize worker's own secondary spool */
1768  leaderworker2->heap = leaderworker->heap;
1769  leaderworker2->index = leaderworker->index;
1770  leaderworker2->isunique = false;
1771  }
1772 
1773  /*
1774  * Might as well use reliable figure when doling out maintenance_work_mem
1775  * (when requested number of workers were not launched, this will be
1776  * somewhat higher than it is for other workers).
1777  */
1778  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1779 
1780  /* Perform work common to all participants */
1781  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1782  btleader->sharedsort, btleader->sharedsort2,
1783  sortmem, true);
1784 
1785 #ifdef BTREE_BUILD_STATS
1787  {
1788  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1789  ResetUsage();
1790  }
1791 #endif /* BTREE_BUILD_STATS */
1792 }
1793 
1794 /*
1795  * Perform work within a launched parallel process.
1796  */
1797 void
1799 {
1800  char *sharedquery;
1801  BTSpool *btspool;
1802  BTSpool *btspool2;
1803  BTShared *btshared;
1804  Sharedsort *sharedsort;
1805  Sharedsort *sharedsort2;
1806  Relation heapRel;
1807  Relation indexRel;
1808  LOCKMODE heapLockmode;
1809  LOCKMODE indexLockmode;
1810  WalUsage *walusage;
1811  BufferUsage *bufferusage;
1812  int sortmem;
1813 
1814 #ifdef BTREE_BUILD_STATS
1816  ResetUsage();
1817 #endif /* BTREE_BUILD_STATS */
1818 
1819  /*
1820  * The only possible status flag that can be set to the parallel worker is
1821  * PROC_IN_SAFE_IC.
1822  */
1823  Assert((MyProc->statusFlags == 0) ||
1825 
1826  /* Set debug_query_string for individual workers first */
1827  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1828  debug_query_string = sharedquery;
1829 
1830  /* Report the query string from leader */
1832 
1833  /* Look up nbtree shared state */
1834  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1835 
1836  /* Open relations using lock modes known to be obtained by index.c */
1837  if (!btshared->isconcurrent)
1838  {
1839  heapLockmode = ShareLock;
1840  indexLockmode = AccessExclusiveLock;
1841  }
1842  else
1843  {
1844  heapLockmode = ShareUpdateExclusiveLock;
1845  indexLockmode = RowExclusiveLock;
1846  }
1847 
1848  /* Open relations within worker */
1849  heapRel = table_open(btshared->heaprelid, heapLockmode);
1850  indexRel = index_open(btshared->indexrelid, indexLockmode);
1851 
1852  /* Initialize worker's own spool */
1853  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1854  btspool->heap = heapRel;
1855  btspool->index = indexRel;
1856  btspool->isunique = btshared->isunique;
1857  btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1858 
1859  /* Look up shared state private to tuplesort.c */
1860  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1861  tuplesort_attach_shared(sharedsort, seg);
1862  if (!btshared->isunique)
1863  {
1864  btspool2 = NULL;
1865  sharedsort2 = NULL;
1866  }
1867  else
1868  {
1869  /* Allocate memory for worker's own private secondary spool */
1870  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1871 
1872  /* Initialize worker's own secondary spool */
1873  btspool2->heap = btspool->heap;
1874  btspool2->index = btspool->index;
1875  btspool2->isunique = false;
1876  /* Look up shared state private to tuplesort.c */
1877  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1878  tuplesort_attach_shared(sharedsort2, seg);
1879  }
1880 
1881  /* Prepare to track buffer usage during parallel execution */
1883 
1884  /* Perform sorting of spool, and possibly a spool2 */
1885  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1886  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1887  sharedsort2, sortmem, false);
1888 
1889  /* Report WAL/buffer usage during parallel execution */
1890  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1891  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1893  &walusage[ParallelWorkerNumber]);
1894 
1895 #ifdef BTREE_BUILD_STATS
1897  {
1898  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1899  ResetUsage();
1900  }
1901 #endif /* BTREE_BUILD_STATS */
1902 
1903  index_close(indexRel, indexLockmode);
1904  table_close(heapRel, heapLockmode);
1905 }
1906 
1907 /*
1908  * Perform a worker's portion of a parallel sort.
1909  *
1910  * This generates a tuplesort for passed btspool, and a second tuplesort
1911  * state if a second btspool is need (i.e. for unique index builds). All
1912  * other spool fields should already be set when this is called.
1913  *
1914  * sortmem is the amount of working memory to use within each worker,
1915  * expressed in KBs.
1916  *
1917  * When this returns, workers are done, and need only release resources.
1918  */
1919 static void
1921  BTShared *btshared, Sharedsort *sharedsort,
1922  Sharedsort *sharedsort2, int sortmem, bool progress)
1923 {
1924  SortCoordinate coordinate;
1925  BTBuildState buildstate;
1926  TableScanDesc scan;
1927  double reltuples;
1928  IndexInfo *indexInfo;
1929 
1930  /* Initialize local tuplesort coordination state */
1931  coordinate = palloc0(sizeof(SortCoordinateData));
1932  coordinate->isWorker = true;
1933  coordinate->nParticipants = -1;
1934  coordinate->sharedsort = sharedsort;
1935 
1936  /* Begin "partial" tuplesort */
1937  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1938  btspool->index,
1939  btspool->isunique,
1940  btspool->nulls_not_distinct,
1941  sortmem, coordinate,
1942  TUPLESORT_NONE);
1943 
1944  /*
1945  * Just as with serial case, there may be a second spool. If so, a
1946  * second, dedicated spool2 partial tuplesort is required.
1947  */
1948  if (btspool2)
1949  {
1950  SortCoordinate coordinate2;
1951 
1952  /*
1953  * We expect that the second one (for dead tuples) won't get very
1954  * full, so we give it only work_mem (unless sortmem is less for
1955  * worker). Worker processes are generally permitted to allocate
1956  * work_mem independently.
1957  */
1958  coordinate2 = palloc0(sizeof(SortCoordinateData));
1959  coordinate2->isWorker = true;
1960  coordinate2->nParticipants = -1;
1961  coordinate2->sharedsort = sharedsort2;
1962  btspool2->sortstate =
1963  tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1964  Min(sortmem, work_mem), coordinate2,
1965  false);
1966  }
1967 
1968  /* Fill in buildstate for _bt_build_callback() */
1969  buildstate.isunique = btshared->isunique;
1970  buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1971  buildstate.havedead = false;
1972  buildstate.heap = btspool->heap;
1973  buildstate.spool = btspool;
1974  buildstate.spool2 = btspool2;
1975  buildstate.indtuples = 0;
1976  buildstate.btleader = NULL;
1977 
1978  /* Join parallel scan */
1979  indexInfo = BuildIndexInfo(btspool->index);
1980  indexInfo->ii_Concurrent = btshared->isconcurrent;
1981  scan = table_beginscan_parallel(btspool->heap,
1982  ParallelTableScanFromBTShared(btshared));
1983  reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1985  (void *) &buildstate, scan);
1986 
1987  /* Execute this worker's part of the sort */
1988  if (progress)
1991  tuplesort_performsort(btspool->sortstate);
1992  if (btspool2)
1993  {
1994  if (progress)
1997  tuplesort_performsort(btspool2->sortstate);
1998  }
1999 
2000  /*
2001  * Done. Record ambuild statistics, and whether we encountered a broken
2002  * HOT chain.
2003  */
2004  SpinLockAcquire(&btshared->mutex);
2005  btshared->nparticipantsdone++;
2006  btshared->reltuples += reltuples;
2007  if (buildstate.havedead)
2008  btshared->havedead = true;
2009  btshared->indtuples += buildstate.indtuples;
2010  if (indexInfo->ii_BrokenHotChain)
2011  btshared->brokenhotchain = true;
2012  SpinLockRelease(&btshared->mutex);
2013 
2014  /* Notify leader */
2016 
2017  /* We can end tuplesorts immediately */
2018  tuplesort_end(btspool->sortstate);
2019  if (btspool2)
2020  tuplesort_end(btspool2->sortstate);
2021 }
int ParallelWorkerNumber
Definition: parallel.c:113
void InitializeParallelDSM(ParallelContext *pcxt)
Definition: parallel.c:203
void WaitForParallelWorkersToFinish(ParallelContext *pcxt)
Definition: parallel.c:773
void LaunchParallelWorkers(ParallelContext *pcxt)
Definition: parallel.c:550
void DestroyParallelContext(ParallelContext *pcxt)
Definition: parallel.c:927
ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)
Definition: parallel.c:165
void WaitForParallelWorkersToAttach(ParallelContext *pcxt)
Definition: parallel.c:670
void pgstat_progress_update_param(int index, int64 val)
void pgstat_progress_update_multi_param(int nparam, const int *index, const int64 *val)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
uint32 BlockNumber
Definition: block.h:31
static Datum values[MAXATTR]
Definition: bootstrap.c:156
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:156
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1539
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
PageHeaderData * PageHeader
Definition: bufpage.h:170
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:468
unsigned int uint32
Definition: c.h:442
#define MAXALIGN_DOWN(LEN)
Definition: c.h:759
#define Min(x, y)
Definition: c.h:937
signed short int16
Definition: c.h:429
#define MAXALIGN(LEN)
Definition: c.h:747
signed int int32
Definition: c.h:430
#define BUFFERALIGN(LEN)
Definition: c.h:749
#define unlikely(x)
Definition: c.h:295
size_t Size
Definition: c.h:541
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
void ConditionVariableCancelSleep(void)
#define ERROR
Definition: elog.h:35
static int compare(const void *arg1, const void *arg2)
Definition: geqo_pool.c:145
int maintenance_work_mem
Definition: globals.c:127
int work_mem
Definition: globals.c:125
bool log_btree_build_stats
Definition: guc_tables.c:472
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2420
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:158
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:132
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:218
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct ItemIdData ItemIdData
#define ItemIdSetUnused(itemId)
Definition: itemid.h:128
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
struct IndexTupleData IndexTupleData
#define INDEX_SIZE_MASK
Definition: itup.h:65
Assert(fmt[strlen(fmt) - 1] !='\n')
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
void pfree(void *pointer)
Definition: mcxt.c:1306
void * palloc0(Size size)
Definition: mcxt.c:1230
MemoryContext CurrentMemoryContext
Definition: mcxt.c:124
void * palloc(Size size)
Definition: mcxt.c:1199
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1141
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:69
#define BTGetDeduplicateItems(relation)
Definition: nbtree.h:1102
#define BTGetTargetPageFreeSpace(relation)
Definition: nbtree.h:1100
#define BTP_LEAF
Definition: nbtree.h:76
#define P_HIKEY
Definition: nbtree.h:368
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1115
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1116
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define BTP_ROOT
Definition: nbtree.h:77
static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
Definition: nbtree.h:556
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1113
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1114
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:523
#define P_NONE
Definition: nbtree.h:212
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1084
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define SK_BT_DESC
Definition: nbtree.h:1083
BTDedupStateData * BTDedupState
Definition: nbtree.h:892
#define P_FIRSTKEY
Definition: nbtree.h:369
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:589
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:571
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:73
#define ParallelTableScanFromBTShared(shared)
Definition: nbtsort.c:165
static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem)
Definition: nbtsort.c:767
static void _bt_slideleft(Page rightmostpage)
Definition: nbtsort.c:736
static BTPageState * _bt_pagestate(BTWriteState *wstate, uint32 level)
Definition: nbtsort.c:699
static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:1184
static void _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
Definition: nbtsort.c:644
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1665
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:301
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:70
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1920
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
Definition: nbtsort.c:1691
struct BTPageState BTPageState
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate)
Definition: nbtsort.c:1078
struct BTSpool BTSpool
static double _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
Definition: nbtsort.c:1711
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:544
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:68
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
Definition: nbtsort.c:1454
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra)
Definition: nbtsort.c:837
struct BTBuildState BTBuildState
void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
Definition: nbtsort.c:1798
struct BTLeader BTLeader
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: nbtsort.c:588
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:371
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:523
static Page _bt_blnewpage(uint32 level)
Definition: nbtsort.c:617
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Definition: nbtsort.c:1112
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:69
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:71
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:72
static void _bt_leader_participate_as_worker(BTBuildState *buildstate)
Definition: nbtsort.c:1745
struct BTWriteState BTWriteState
static void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
Definition: nbtsort.c:533
struct BTShared BTShared
void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
Definition: nbtutils.c:2638
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:2205
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2424
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:2696
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b)
Definition: pairingheap.c:79
int progress
Definition: pgbench.c:270
const char * debug_query_string
Definition: postgres.c:82
void ShowUsage(const char *title)
Definition: postgres.c:4856
void ResetUsage(void)
Definition: postgres.c:4849
uintptr_t Datum
Definition: postgres.h:412
unsigned int Oid
Definition: postgres_ext.h:31
#define PROC_IN_SAFE_IC
Definition: proc.h:58
#define PROGRESS_CREATEIDX_TUPLES_TOTAL
Definition: progress.h:84
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:120
#define PROGRESS_CREATEIDX_TUPLES_DONE
Definition: progress.h:85
#define PROGRESS_CREATEIDX_SUBPHASE
Definition: progress.h:83
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:119
#define RelationGetRelid(relation)
Definition: rel.h:501
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:569
#define RelationGetDescr(relation)
Definition: rel.h:527
#define RelationGetRelationName(relation)
Definition: rel.h:535
#define RelationNeedsWAL(relation)
Definition: rel.h:626
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:520
@ MAIN_FORKNUM
Definition: relpath.h:50
int slock_t
Definition: s_lock.h:754
void shm_toc_insert(shm_toc *toc, uint64 key, void *address)
Definition: shm_toc.c:171
void * shm_toc_allocate(shm_toc *toc, Size nbytes)
Definition: shm_toc.c:88
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
#define shm_toc_estimate_chunk(e, sz)
Definition: shm_toc.h:51
#define shm_toc_estimate_keys(e, cnt)
Definition: shm_toc.h:53
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:554
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:493
void smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:691
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:251
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:871
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:829
#define SnapshotAny
Definition: snapmgr.h:67
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:96
void PrepareSortSupportFromIndexRel(Relation indexRel, int16 strategy, SortSupport ssup)
Definition: sortsupport.c:162
struct SortSupportData * SortSupport
Definition: sortsupport.h:58
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:200
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
PGPROC * MyProc
Definition: proc.c:68
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define BTLessStrategyNumber
Definition: stratnum.h:29
bool isunique
Definition: nbtsort.c:210
BTSpool * spool
Definition: nbtsort.c:214
BTLeader * btleader
Definition: nbtsort.c:228
bool nulls_not_distinct
Definition: nbtsort.c:211
bool havedead
Definition: nbtsort.c:212
Relation heap
Definition: nbtsort.c:213
BTSpool * spool2
Definition: nbtsort.c:220
double indtuples
Definition: nbtsort.c:221
Size maxpostingsize
Definition: nbtree.h:869
ItemPointer htids
Definition: nbtree.h:877
bool deduplicate
Definition: nbtree.h:867
OffsetNumber baseoff
Definition: nbtree.h:873
Size basetupsize
Definition: nbtree.h:874
IndexTuple base
Definition: nbtree.h:872
Size phystupsize
Definition: nbtree.h:880
ParallelContext * pcxt
Definition: nbtsort.c:174
BTShared * btshared
Definition: nbtsort.c:194
Sharedsort * sharedsort
Definition: nbtsort.c:195
Sharedsort * sharedsort2
Definition: nbtsort.c:196
int nparticipanttuplesorts
Definition: nbtsort.c:182
BufferUsage * bufferusage
Definition: nbtsort.c:199
Snapshot snapshot
Definition: nbtsort.c:197
WalUsage * walusage
Definition: nbtsort.c:198
BlockNumber btpo_next
Definition: nbtree.h:65
BlockNumber btpo_prev
Definition: nbtree.h:64
uint16 btpo_flags
Definition: nbtree.h:67
uint32 btpo_level
Definition: nbtree.h:66
BTCycleId btpo_cycleid
Definition: nbtree.h:68
Page btps_page
Definition: nbtsort.c:237
IndexTuple btps_lowkey
Definition: nbtsort.c:239
Size btps_full
Definition: nbtsort.c:243
OffsetNumber btps_lastoff
Definition: nbtsort.c:240
Size btps_lastextra
Definition: nbtsort.c:241
BlockNumber btps_blkno
Definition: nbtsort.c:238
struct BTPageState * btps_next
Definition: nbtsort.c:244
uint32 btps_level
Definition: nbtsort.c:242
bool allequalimage
Definition: nbtree.h:786
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:792
slock_t mutex
Definition: nbtsort.c:128
bool isconcurrent
Definition: nbtsort.c:111
double indtuples
Definition: nbtsort.c:149
double reltuples
Definition: nbtsort.c:147
Oid heaprelid
Definition: nbtsort.c:107
bool brokenhotchain
Definition: nbtsort.c:150
bool isunique
Definition: nbtsort.c:109
int nparticipantsdone
Definition: nbtsort.c:146
ConditionVariable workersdonecv
Definition: nbtsort.c:120
int scantuplesortstates
Definition: nbtsort.c:112
Oid indexrelid
Definition: nbtsort.c:108
bool havedead
Definition: nbtsort.c:148
bool nulls_not_distinct
Definition: nbtsort.c:110
bool isunique
Definition: nbtsort.c:91
bool nulls_not_distinct
Definition: nbtsort.c:92
Relation heap
Definition: nbtsort.c:89
Relation index
Definition: nbtsort.c:90
Tuplesortstate * sortstate
Definition: nbtsort.c:88
Page btws_zeropage
Definition: nbtsort.c:258
Relation heap
Definition: nbtsort.c:252
Relation index
Definition: nbtsort.c:253
BlockNumber btws_pages_alloced
Definition: nbtsort.c:256
BlockNumber btws_pages_written
Definition: nbtsort.c:257
BTScanInsert inskey
Definition: nbtsort.c:254
bool btws_use_wal
Definition: nbtsort.c:255
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
bool ii_Unique
Definition: execnodes.h:190
bool ii_BrokenHotChain
Definition: execnodes.h:196
bool ii_NullsNotDistinct
Definition: execnodes.h:191
int ii_ParallelWorkers
Definition: execnodes.h:197
bool ii_Concurrent
Definition: execnodes.h:195
ItemPointerData t_tid
Definition: itup.h:37
unsigned short t_info
Definition: itup.h:49
uint8 statusFlags
Definition: proc.h:233
dsm_segment * seg
Definition: parallel.h:43
shm_toc_estimator estimator
Definition: parallel.h:42
shm_toc * toc
Definition: parallel.h:45
int nworkers_launched
Definition: parallel.h:38
RelFileLocator rd_locator
Definition: rel.h:56
int sk_flags
Definition: skey.h:66
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67
Sharedsort * sharedsort
Definition: tuplesort.h:57
AttrNumber ssup_attno
Definition: sortsupport.h:81
bool ssup_nulls_first
Definition: sortsupport.h:75
MemoryContext ssup_cxt
Definition: sortsupport.h:66
Definition: type.h:95
Definition: regguts.h:318
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
Definition: tableam.c:175
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
Definition: tableam.c:140
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
Definition: tableam.c:155
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1748
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1385
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2976
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2955
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:972
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2999
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:60
#define TUPLESORT_NONE
Definition: tuplesort.h:92
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
@ WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN
Definition: wait_event.h:115
void ExitParallelMode(void)
Definition: xact.c:1048
void EnterParallelMode(void)
Definition: xact.c:1035
XLogRecPtr log_newpage(RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blkno, Page page, bool page_std)
Definition: xloginsert.c:1097