1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  * Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level. We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it to its
12  * parent level (starting a new parent level if necessary). When
13  * done, we write out each final page on each level, adding it to
14  * its parent level. When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree). The steady-state load
20  * factor for btrees is usually estimated at 70%. We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%. This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * Formerly the index pages being built were kept in shared buffers, but
27  * that is of no value (since other backends have no interest in them yet)
28  * and it created locking problems for CHECKPOINT, because the upper-level
29  * pages were held exclusive-locked for long periods. Now we just build
30  * the pages in local memory and smgrwrite or smgrextend them as we finish
31  * them. They will need to be re-read into shared buffers on first use after
32  * the build finishes.
33  *
34  * Since the index will never be used unless it is completely built,
35  * from a crash-recovery point of view there is no need to WAL-log the
36  * steps of the build. After completing the index build, we can just sync
37  * the whole file to disk using smgrimmedsync() before exiting this module.
38  * This can be seen to be sufficient for crash recovery by considering that
39  * it's effectively equivalent to what would happen if a CHECKPOINT occurred
40  * just after the index build. However, it is clearly not sufficient if the
41  * DBA is using the WAL log for PITR or replication purposes, since another
42  * machine would not be able to reconstruct the index from WAL. Therefore,
43  * we log the completed index pages to WAL if and only if WAL archiving is
44  * active.
45  *
46  * This code isn't concerned about the FSM at all. The caller is responsible
47  * for initializing that.
48  *
49  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
50  * Portions Copyright (c) 1994, Regents of the University of California
51  *
52  * IDENTIFICATION
53  * src/backend/access/nbtree/nbtsort.c
54  *
55  *-------------------------------------------------------------------------
56  */
57 
58 #include "postgres.h"
59 
60 #include "access/nbtree.h"
61 #include "access/parallel.h"
62 #include "access/relscan.h"
63 #include "access/table.h"
64 #include "access/tableam.h"
65 #include "access/xact.h"
66 #include "access/xlog.h"
67 #include "access/xloginsert.h"
68 #include "catalog/index.h"
69 #include "commands/progress.h"
70 #include "miscadmin.h"
71 #include "pgstat.h"
72 #include "storage/smgr.h"
73 #include "tcop/tcopprot.h" /* pgrminclude ignore */
74 #include "utils/rel.h"
75 #include "utils/sortsupport.h"
76 #include "utils/tuplesort.h"
77 
78 
79 /* Magic numbers for parallel state sharing */
80 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
81 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
82 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
83 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
84 
85 /*
86  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
87  * parallel index builds. This may be useful as a debugging aid.
88 #undef DISABLE_LEADER_PARTICIPATION
89  */
90 
91 /*
92  * Status record for spooling/sorting phase. (Note we may have two of
93  * these due to the special requirements for uniqueness-checking with
94  * dead tuples.)
95  */
96 typedef struct BTSpool
97 {
98  Tuplesortstate *sortstate; /* state data for tuplesort.c */
 99  Relation heap;
100  Relation index;
101  bool isunique;
102 } BTSpool;
103 
104 /*
105  * Status for index builds performed in parallel. This is allocated in a
106  * dynamic shared memory segment. Note that there is a separate tuplesort TOC
107  * entry, private to tuplesort.c but allocated by this module on its behalf.
108  */
109 typedef struct BTShared
110 {
111  /*
112  * These fields are not modified during the sort. They primarily exist
113  * for the benefit of worker processes that need to create BTSpool state
114  * corresponding to that used by the leader.
115  */
116  Oid heaprelid;
117  Oid indexrelid;
118  bool isunique;
119  bool isconcurrent;
120  int scantuplesortstates;
121 
122  /*
123  * workersdonecv is used to monitor the progress of workers. All parallel
124  * participants must indicate that they are done before leader can use
125  * mutable state that workers maintain during scan (and before leader can
126  * proceed to tuplesort_performsort()).
127  */
128  ConditionVariable workersdonecv;
129 
130  /*
131  * mutex protects all fields before heapdesc.
132  *
133  * These fields contain status information of interest to B-Tree index
134  * builds that must work just the same when an index is built in parallel.
135  */
136  slock_t mutex;
137 
138  /*
139  * Mutable state that is maintained by workers, and reported back to
140  * leader at end of parallel scan.
141  *
142  * nparticipantsdone is number of worker processes finished.
143  *
144  * reltuples is the total number of input heap tuples.
145  *
146  * havedead indicates if RECENTLY_DEAD tuples were encountered during
147  * build.
148  *
149  * indtuples is the total number of tuples that made it into the index.
150  *
151  * brokenhotchain indicates if any worker detected a broken HOT chain
152  * during build.
153  */
154  int nparticipantsdone;
155  double reltuples;
156  bool havedead;
157  double indtuples;
158  bool brokenhotchain;
159 
160  /*
161  * ParallelTableScanDescData data follows. Can't directly embed here, as
162  * implementations of the parallel table scan desc interface might need
163  * stronger alignment.
164  */
165 } BTShared;
166 
167 /*
168  * Return pointer to a BTShared's parallel table scan.
169  *
170  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
171  * MAXALIGN.
172  */
173 #define ParallelTableScanFromBTShared(shared) \
174  (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
175 
176 /*
177  * Status for leader in parallel index build.
178  */
179 typedef struct BTLeader
180 {
181  /* parallel context itself */
182  ParallelContext *pcxt;
183 
184  /*
185  * nparticipanttuplesorts is the exact number of worker processes
186  * successfully launched, plus one leader process if it participates as a
187  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
188  * participating as a worker).
189  */
190  int nparticipanttuplesorts;
191 
192  /*
193  * Leader process convenience pointers to shared state (leader avoids TOC
194  * lookups).
195  *
196  * btshared is the shared state for entire build. sharedsort is the
197  * shared, tuplesort-managed state passed to each process tuplesort.
198  * sharedsort2 is the corresponding btspool2 shared state, used only when
199  * building unique indexes. snapshot is the snapshot used by the scan iff
200  * an MVCC snapshot is required.
201  */
202  BTShared *btshared;
203  Sharedsort *sharedsort;
204  Sharedsort *sharedsort2;
205  Snapshot snapshot;
206 } BTLeader;
207 
208 /*
209  * Working state for btbuild and its callback.
210  *
211  * When parallel CREATE INDEX is used, there is a BTBuildState for each
212  * participant.
213  */
214 typedef struct BTBuildState
215 {
216  bool isunique;
217  bool havedead;
218  Relation heap;
219  BTSpool *spool;
220 
221  /*
222  * spool2 is needed only when the index is a unique index. Dead tuples are
223  * put into spool2 instead of spool in order to avoid uniqueness check.
224  */
225  BTSpool *spool2;
226  double indtuples;
227 
228  /*
229  * btleader is only present when a parallel index build is performed, and
230  * only in the leader process. (Actually, only the leader has a
231  * BTBuildState. Workers have their own spool and spool2, though.)
232  */
233  BTLeader *btleader;
234 } BTBuildState;
235 
236 /*
237  * Status record for a btree page being built. We have one of these
238  * for each active tree level.
239  *
240  * The reason we need to store a copy of the minimum key is that we'll
241  * need to propagate it to the parent node when this page is linked
242  * into its parent. However, if the page is not a leaf page, the first
243  * entry on the page doesn't need to contain a key, so we will not have
244  * stored the key itself on the page. (You might think we could skip
245  * copying the minimum key on leaf pages, but actually we must have a
246  * writable copy anyway because we'll poke the page's address into it
247  * before passing it up to the parent...)
248  */
249 typedef struct BTPageState
250 {
251  Page btps_page; /* workspace for page building */
252  BlockNumber btps_blkno; /* block # to write this page at */
253  IndexTuple btps_minkey; /* copy of minimum key (first item) on page */
254  OffsetNumber btps_lastoff; /* last item offset loaded */
255  uint32 btps_level; /* tree level (0 = leaf) */
256  Size btps_full; /* "full" if less than this much free space */
257  struct BTPageState *btps_next; /* link to parent level, if any */
258 } BTPageState;
259 
260 /*
261  * Overall status record for index writing phase.
262  */
263 typedef struct BTWriteState
264 {
265  Relation heap;
266  Relation index;
267  BTScanInsert inskey; /* generic insertion scankey */
268  bool btws_use_wal; /* dump pages to WAL? */
269  BlockNumber btws_pages_alloced; /* # pages allocated */
270  BlockNumber btws_pages_written; /* # pages written out */
271  Page btws_zeropage; /* workspace for filling zeroes */
272 } BTWriteState;
273 
274 
275 static double _bt_spools_heapscan(Relation heap, Relation index,
276  BTBuildState *buildstate, IndexInfo *indexInfo);
277 static void _bt_spooldestroy(BTSpool *btspool);
278 static void _bt_spool(BTSpool *btspool, ItemPointer self,
279  Datum *values, bool *isnull);
280 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
281 static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
282  bool *isnull, bool tupleIsAlive, void *state);
283 static Page _bt_blnewpage(uint32 level);
284 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
285 static void _bt_slideleft(Page page);
286 static void _bt_sortaddtup(Page page, Size itemsize,
287  IndexTuple itup, OffsetNumber itup_off);
288 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
289  IndexTuple itup);
290 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
291 static void _bt_load(BTWriteState *wstate,
292  BTSpool *btspool, BTSpool *btspool2);
293 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
294  int request);
295 static void _bt_end_parallel(BTLeader *btleader);
296 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
297 static double _bt_parallel_heapscan(BTBuildState *buildstate,
298  bool *brokenhotchain);
299 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
300 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
301  BTShared *btshared, Sharedsort *sharedsort,
302  Sharedsort *sharedsort2, int sortmem,
303  bool progress);
304 
305 
306 /*
307  * btbuild() -- build a new btree index.
308  */
309 IndexBuildResult *
310 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
311 {
312  IndexBuildResult *result;
313  BTBuildState buildstate;
314  double reltuples;
315 
316 #ifdef BTREE_BUILD_STATS
317  if (log_btree_build_stats)
318  ResetUsage();
319 #endif /* BTREE_BUILD_STATS */
320 
321  buildstate.isunique = indexInfo->ii_Unique;
322  buildstate.havedead = false;
323  buildstate.heap = heap;
324  buildstate.spool = NULL;
325  buildstate.spool2 = NULL;
326  buildstate.indtuples = 0;
327  buildstate.btleader = NULL;
328 
329  /*
330  * We expect to be called exactly once for any index relation. If that's
331  * not the case, big trouble's what we have.
332  */
333  if (RelationGetNumberOfBlocks(index) != 0)
334  elog(ERROR, "index \"%s\" already contains data",
335  RelationGetRelationName(index));
336 
337  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
338 
339  /*
340  * Finish the build by (1) completing the sort of the spool file, (2)
341  * inserting the sorted tuples into btree pages and (3) building the upper
342  * levels. Finally, it may also be necessary to end use of parallelism.
343  */
344  _bt_leafbuild(buildstate.spool, buildstate.spool2);
345  _bt_spooldestroy(buildstate.spool);
346  if (buildstate.spool2)
347  _bt_spooldestroy(buildstate.spool2);
348  if (buildstate.btleader)
349  _bt_end_parallel(buildstate.btleader);
350 
351  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
352 
353  result->heap_tuples = reltuples;
354  result->index_tuples = buildstate.indtuples;
355 
356 #ifdef BTREE_BUILD_STATS
357  if (log_btree_build_stats)
358  {
359  ShowUsage("BTREE BUILD STATS");
360  ResetUsage();
361  }
362 #endif /* BTREE_BUILD_STATS */
363 
364  return result;
365 }
366 
367 /*
368  * Create and initialize one or two spool structures, and save them in caller's
369  * buildstate argument. May also fill-in fields within indexInfo used by index
370  * builds.
371  *
372  * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
373  * routine encapsulates all aspects of managing parallelism. Caller need only
374  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
375  *
376  * Returns the total number of heap tuples scanned.
377  */
378 static double
379 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
380  IndexInfo *indexInfo)
381 {
382  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
383  SortCoordinate coordinate = NULL;
384  double reltuples = 0;
385 
386  /*
387  * We size the sort area as maintenance_work_mem rather than work_mem to
388  * speed index creation. This should be OK since a single backend can't
389  * run multiple index creations in parallel (see also: notes on
390  * parallelism and maintenance_work_mem below).
391  */
392  btspool->heap = heap;
393  btspool->index = index;
394  btspool->isunique = indexInfo->ii_Unique;
395 
396  /* Save as primary spool */
397  buildstate->spool = btspool;
398 
399  /* Report table scan phase started */
400  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
401  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
402 
403  /* Attempt to launch parallel worker scan when required */
404  if (indexInfo->ii_ParallelWorkers > 0)
405  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
406  indexInfo->ii_ParallelWorkers);
407 
408  /*
409  * If parallel build requested and at least one worker process was
410  * successfully launched, set up coordination state
411  */
412  if (buildstate->btleader)
413  {
414  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
415  coordinate->isWorker = false;
416  coordinate->nParticipants =
417  buildstate->btleader->nparticipanttuplesorts;
418  coordinate->sharedsort = buildstate->btleader->sharedsort;
419  }
420 
421  /*
422  * Begin serial/leader tuplesort.
423  *
424  * In cases where parallelism is involved, the leader receives the same
425  * share of maintenance_work_mem as a serial sort (it is generally treated
426  * in the same way as a serial sort once we return). Parallel worker
427  * Tuplesortstates will have received only a fraction of
428  * maintenance_work_mem, though.
429  *
430  * We rely on the lifetime of the Leader Tuplesortstate almost not
431  * overlapping with any worker Tuplesortstate's lifetime. There may be
432  * some small overlap, but that's okay because we rely on leader
433  * Tuplesortstate only allocating a small, fixed amount of memory here.
434  * When its tuplesort_performsort() is called (by our caller), and
435  * significant amounts of memory are likely to be used, all workers must
436  * have already freed almost all memory held by their Tuplesortstates
437  * (they are about to go away completely, too). The overall effect is
438  * that maintenance_work_mem always represents an absolute high watermark
439  * on the amount of memory used by a CREATE INDEX operation, regardless of
440  * the use of parallelism or any other factor.
441  */
442  buildstate->spool->sortstate =
443  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
444  maintenance_work_mem, coordinate,
445  false);
446 
447  /*
448  * If building a unique index, put dead tuples in a second spool to keep
449  * them out of the uniqueness check. We expect that the second spool (for
450  * dead tuples) won't get very full, so we give it only work_mem.
451  */
452  if (indexInfo->ii_Unique)
453  {
454  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
455  SortCoordinate coordinate2 = NULL;
456 
457  /* Initialize secondary spool */
458  btspool2->heap = heap;
459  btspool2->index = index;
460  btspool2->isunique = false;
461  /* Save as secondary spool */
462  buildstate->spool2 = btspool2;
463 
464  if (buildstate->btleader)
465  {
466  /*
467  * Set up non-private state that is passed to
468  * tuplesort_begin_index_btree() about the basic high level
469  * coordination of a parallel sort.
470  */
471  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
472  coordinate2->isWorker = false;
473  coordinate2->nParticipants =
474  buildstate->btleader->nparticipanttuplesorts;
475  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
476  }
477 
478  /*
479  * We expect that the second one (for dead tuples) won't get very
480  * full, so we give it only work_mem
481  */
482  buildstate->spool2->sortstate =
483  tuplesort_begin_index_btree(heap, index, false, work_mem,
484  coordinate2, false);
485  }
486 
487  /* Fill spool using either serial or parallel heap scan */
488  if (!buildstate->btleader)
489  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
490  _bt_build_callback, (void *) buildstate,
491  NULL);
492  else
493  reltuples = _bt_parallel_heapscan(buildstate,
494  &indexInfo->ii_BrokenHotChain);
495 
496  /*
497  * Set the progress target for the next phase. Reset the block number
498  * values set by table_index_build_scan
499  */
500  {
501  const int index[] = {
502  PROGRESS_CREATEIDX_TUPLES_TOTAL,
503  PROGRESS_SCAN_BLOCKS_TOTAL,
504  PROGRESS_SCAN_BLOCKS_DONE
505  };
506  const int64 val[] = {
507  buildstate->indtuples,
508  0, 0
509  };
510 
511  pgstat_progress_update_multi_param(3, index, val);
512  }
513 
514  /* okay, all heap tuples are spooled */
515  if (buildstate->spool2 && !buildstate->havedead)
516  {
517  /* spool2 turns out to be unnecessary */
518  _bt_spooldestroy(buildstate->spool2);
519  buildstate->spool2 = NULL;
520  }
521 
522  return reltuples;
523 }
524 
525 /*
526  * clean up a spool structure and its substructures.
527  */
528 static void
529 _bt_spooldestroy(BTSpool *btspool)
530 {
531  tuplesort_end(btspool->sortstate);
532  pfree(btspool);
533 }
534 
535 /*
536  * spool an index entry into the sort file.
537  */
538 static void
539 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
540 {
541  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
542  self, values, isnull);
543 }
544 
545 /*
546  * given a spool loaded by successive calls to _bt_spool,
547  * create an entire btree.
548  */
549 static void
550 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
551 {
552  BTWriteState wstate;
553 
554 #ifdef BTREE_BUILD_STATS
555  if (log_btree_build_stats)
556  {
557  ShowUsage("BTREE BUILD (Spool) STATISTICS");
558  ResetUsage();
559  }
560 #endif /* BTREE_BUILD_STATS */
561 
562  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
563  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
564  tuplesort_performsort(btspool->sortstate);
565  if (btspool2)
566  {
567  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
568  PROGRESS_BTREE_PHASE_PERFORMSORT_2);
569  tuplesort_performsort(btspool2->sortstate);
570  }
571 
572  wstate.heap = btspool->heap;
573  wstate.index = btspool->index;
574  wstate.inskey = _bt_mkscankey(wstate.index, NULL);
575 
576  /*
577  * We need to log index creation in WAL iff WAL archiving/streaming is
578  * enabled UNLESS the index isn't WAL-logged anyway.
579  */
580  wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
581 
582  /* reserve the metapage */
583  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
584  wstate.btws_pages_written = 0;
585  wstate.btws_zeropage = NULL; /* until needed */
586 
587  pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
588  PROGRESS_BTREE_PHASE_LEAF_LOAD);
589  _bt_load(&wstate, btspool, btspool2);
590 }
591 
592 /*
593  * Per-tuple callback for table_index_build_scan
594  */
595 static void
596 _bt_build_callback(Relation index,
597  HeapTuple htup,
598  Datum *values,
599  bool *isnull,
600  bool tupleIsAlive,
601  void *state)
602 {
603  BTBuildState *buildstate = (BTBuildState *) state;
604 
605  /*
606  * insert the index tuple into the appropriate spool file for subsequent
607  * processing
608  */
609  if (tupleIsAlive || buildstate->spool2 == NULL)
610  _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
611  else
612  {
613  /* dead tuples are put into spool2 */
614  buildstate->havedead = true;
615  _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
616  }
617 
618  buildstate->indtuples += 1;
619 }
620 
621 /*
622  * allocate workspace for a new, clean btree page, not linked to any siblings.
623  */
624 static Page
625 _bt_blnewpage(uint32 level)
626 {
627  Page page;
628  BTPageOpaque opaque;
629 
630  page = (Page) palloc(BLCKSZ);
631 
632  /* Zero the page and set up standard page header info */
633  _bt_pageinit(page, BLCKSZ);
634 
635  /* Initialize BT opaque state */
636  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
637  opaque->btpo_prev = opaque->btpo_next = P_NONE;
638  opaque->btpo.level = level;
639  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
640  opaque->btpo_cycleid = 0;
641 
642  /* Make the P_HIKEY line pointer appear allocated */
643  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
644 
645  return page;
646 }
647 
648 /*
649  * emit a completed btree page, and release the working storage.
650  */
651 static void
652 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
653 {
654  /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
655  RelationOpenSmgr(wstate->index);
656 
657  /* XLOG stuff */
658  if (wstate->btws_use_wal)
659  {
660  /* We use the heap NEWPAGE record type for this */
661  log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
662  }
663 
664  /*
665  * If we have to write pages nonsequentially, fill in the space with
666  * zeroes until we come back and overwrite. This is not logically
667  * necessary on standard Unix filesystems (unwritten space will read as
668  * zeroes anyway), but it should help to avoid fragmentation. The dummy
669  * pages aren't WAL-logged though.
670  */
671  while (blkno > wstate->btws_pages_written)
672  {
673  if (!wstate->btws_zeropage)
674  wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
675  /* don't set checksum for all-zero page */
676  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
677  wstate->btws_pages_written++,
678  (char *) wstate->btws_zeropage,
679  true);
680  }
681 
682  PageSetChecksumInplace(page, blkno);
683 
684  /*
685  * Now write the page. There's no need for smgr to schedule an fsync for
686  * this write; we'll do it ourselves before ending the build.
687  */
688  if (blkno == wstate->btws_pages_written)
689  {
690  /* extending the file... */
691  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
692  (char *) page, true);
693  wstate->btws_pages_written++;
694  }
695  else
696  {
697  /* overwriting a block we zero-filled before */
698  smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
699  (char *) page, true);
700  }
701 
702  pfree(page);
703 }
704 
705 /*
706  * allocate and initialize a new BTPageState. the returned structure
707  * is suitable for immediate use by _bt_buildadd.
708  */
709 static BTPageState *
710 _bt_pagestate(BTWriteState *wstate, uint32 level)
711 {
712  BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
713 
714  /* create initial page for level */
715  state->btps_page = _bt_blnewpage(level);
716 
717  /* and assign it a page position */
718  state->btps_blkno = wstate->btws_pages_alloced++;
719 
720  state->btps_minkey = NULL;
721  /* initialize lastoff so first item goes into P_FIRSTKEY */
722  state->btps_lastoff = P_HIKEY;
723  state->btps_level = level;
724  /* set "full" threshold based on level. See notes at head of file. */
725  if (level > 0)
726  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
727  else
728  state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
729 
730  /* no parent level, yet */
731  state->btps_next = NULL;
732 
733  return state;
734 }
735 
736 /*
737  * slide an array of ItemIds back one slot (from P_FIRSTKEY to
738  * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
739  * that we have built an ItemId array in what has turned out to be a
740  * P_RIGHTMOST page.
741  */
742 static void
743 _bt_slideleft(Page page)
744 {
745  OffsetNumber off;
746  OffsetNumber maxoff;
747  ItemId previi;
748  ItemId thisii;
749 
750  if (!PageIsEmpty(page))
751  {
752  maxoff = PageGetMaxOffsetNumber(page);
753  previi = PageGetItemId(page, P_HIKEY);
754  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
755  {
756  thisii = PageGetItemId(page, off);
757  *previi = *thisii;
758  previi = thisii;
759  }
760  ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
761  }
762 }
763 
764 /*
765  * Add an item to a page being built.
766  *
767  * The main difference between this routine and a bare PageAddItem call
768  * is that this code knows that the leftmost data item on a non-leaf
769  * btree page doesn't need to have a key. Therefore, it strips such
770  * items down to just the item header.
771  *
772  * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
773  * that because it assumes that P_RIGHTMOST() will return the correct
774  * answer for the page. Here, we don't know yet if the page will be
775  * rightmost. Offset P_FIRSTKEY is always the first data key.
776  */
777 static void
778 _bt_sortaddtup(Page page,
779  Size itemsize,
780  IndexTuple itup,
781  OffsetNumber itup_off)
782 {
783  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
784  IndexTupleData trunctuple;
785 
786  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
787  {
788  trunctuple = *itup;
789  trunctuple.t_info = sizeof(IndexTupleData);
790  /* Deliberately zero INDEX_ALT_TID_MASK bits */
791  BTreeTupleSetNAtts(&trunctuple, 0);
792  itup = &trunctuple;
793  itemsize = sizeof(IndexTupleData);
794  }
795 
796  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
797  false, false) == InvalidOffsetNumber)
798  elog(ERROR, "failed to add item to the index page");
799 }
800 
801 /*----------
802  * Add an item to a disk page from the sort output.
803  *
804  * We must be careful to observe the page layout conventions of nbtsearch.c:
805  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
806  * - on non-leaf pages, the key portion of the first item need not be
807  * stored, we should store only the link.
808  *
809  * A leaf page being built looks like:
810  *
811  * +----------------+---------------------------------+
812  * | PageHeaderData | linp0 linp1 linp2 ... |
813  * +-----------+----+---------------------------------+
814  * | ... linpN | |
815  * +-----------+--------------------------------------+
816  * | ^ last |
817  * | |
818  * +-------------+------------------------------------+
819  * | | itemN ... |
820  * +-------------+------------------+-----------------+
821  * | ... item3 item2 item1 | "special space" |
822  * +--------------------------------+-----------------+
823  *
824  * Contrast this with the diagram in bufpage.h; note the mismatch
825  * between linps and items. This is because we reserve linp0 as a
826  * placeholder for the pointer to the "high key" item; when we have
827  * filled up the page, we will set linp0 to point to itemN and clear
828  * linpN. On the other hand, if we find this is the last (rightmost)
829  * page, we leave the items alone and slide the linp array over. If
830  * the high key is to be truncated, offset 1 is deleted, and we insert
831  * the truncated high key at offset 1.
832  *
833  * 'last' pointer indicates the last offset added to the page.
834  *----------
835  */
836 static void
837 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
838 {
839  Page npage;
840  BlockNumber nblkno;
841  OffsetNumber last_off;
842  Size pgspc;
843  Size itupsz;
844  bool isleaf;
845 
846  /*
847  * This is a handy place to check for cancel interrupts during the btree
848  * load phase of index creation.
849  */
850  CHECK_FOR_INTERRUPTS();
851 
852  npage = state->btps_page;
853  nblkno = state->btps_blkno;
854  last_off = state->btps_lastoff;
855 
856  pgspc = PageGetFreeSpace(npage);
857  itupsz = IndexTupleSize(itup);
858  itupsz = MAXALIGN(itupsz);
859  /* Leaf case has slightly different rules due to suffix truncation */
860  isleaf = (state->btps_level == 0);
861 
862  /*
863  * Check whether the new item can fit on a btree page on current level at
864  * all.
865  *
866  * Every newly built index will treat heap TID as part of the keyspace,
867  * which imposes the requirement that new high keys must occasionally have
868  * a heap TID appended within _bt_truncate(). That may leave a new pivot
869  * tuple one or two MAXALIGN() quantums larger than the original first
870  * right tuple it's derived from. v4 deals with the problem by decreasing
871  * the limit on the size of tuples inserted on the leaf level by the same
872  * small amount. Enforce the new v4+ limit on the leaf level, and the old
873  * limit on internal levels, since pivot tuples may need to make use of
874  * the reserved space. This should never fail on internal pages.
875  */
876  if (unlikely(itupsz > BTMaxItemSize(npage)))
877  _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
878  itup);
879 
880  /*
881  * Check to see if current page will fit new item, with space left over to
882  * append a heap TID during suffix truncation when page is a leaf page.
883  *
884  * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
885  * high key with heap TID when finishing off a leaf page, since we rely on
886  * _bt_check_third_page() rejecting oversized non-pivot tuples. On
887  * internal pages we can always fit 3 pivot tuples with larger internal
888  * page tuple limit (includes page high key).
889  *
890  * Most of the time, a page is only "full" in the sense that the soft
891  * fillfactor-wise limit has been exceeded. However, we must always leave
892  * at least two items plus a high key on each page before starting a new
893  * page. Disregard fillfactor and insert on "full" current page if we
894  * don't have the minimum number of items yet. (Note that we deliberately
895  * assume that suffix truncation neither enlarges nor shrinks new high key
896  * when applying soft limit.)
897  */
898  if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
899  (pgspc < state->btps_full && last_off > P_FIRSTKEY))
900  {
901  /*
902  * Finish off the page and write it out.
903  */
904  Page opage = npage;
905  BlockNumber oblkno = nblkno;
906  ItemId ii;
907  ItemId hii;
908  IndexTuple oitup;
909 
910  /* Create new page of same level */
911  npage = _bt_blnewpage(state->btps_level);
912 
913  /* and assign it a page position */
914  nblkno = wstate->btws_pages_alloced++;
915 
916  /*
917  * We copy the last item on the page into the new page, and then
918  * rearrange the old page so that the 'last item' becomes its high key
919  * rather than a true data item. There had better be at least two
920  * items on the page already, else the page would be empty of useful
921  * data.
922  */
923  Assert(last_off > P_FIRSTKEY);
924  ii = PageGetItemId(opage, last_off);
925  oitup = (IndexTuple) PageGetItem(opage, ii);
926  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
927 
928  /*
929  * Move 'last' into the high key position on opage. _bt_blnewpage()
930  * allocated empty space for a line pointer when opage was first
931  * created, so this is a matter of rearranging already-allocated space
932  * on page, and initializing high key line pointer. (Actually, leaf
933  * pages must also swap oitup with a truncated version of oitup, which
934  * is sometimes larger than oitup, though never by more than the space
935  * needed to append a heap TID.)
936  */
937  hii = PageGetItemId(opage, P_HIKEY);
938  *hii = *ii;
939  ItemIdSetUnused(ii); /* redundant */
940  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
941 
942  if (isleaf)
943  {
944  IndexTuple lastleft;
945  IndexTuple truncated;
946  Size truncsz;
947 
948  /*
949  * Truncate away any unneeded attributes from high key on leaf
950  * level. This is only done at the leaf level because downlinks
951  * in internal pages are either negative infinity items, or get
952  * their contents from copying from one level down. See also:
953  * _bt_split().
954  *
955  * We don't try to bias our choice of split point to make it more
956  * likely that _bt_truncate() can truncate away more attributes,
957  * whereas the split point passed to _bt_split() is chosen much
958  * more delicately. Suffix truncation is mostly useful because it
959  * improves space utilization for workloads with random
960  * insertions. It doesn't seem worthwhile to add logic for
961  * choosing a split point here for a benefit that is bound to be
962  * much smaller.
963  *
964  * Since the truncated tuple is often smaller than the original
965  * tuple, it cannot just be copied in place (besides, we want to
966  * actually save space on the leaf page). We delete the original
967  * high key, and add our own truncated high key at the same
968  * offset.
969  *
970  * Note that the page layout won't be changed very much. oitup is
971  * already located at the physical beginning of tuple space, so we
972  * only shift the line pointer array back and forth, and overwrite
973  * the tuple space previously occupied by oitup. This is fairly
974  * cheap.
975  */
976  ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
977  lastleft = (IndexTuple) PageGetItem(opage, ii);
978 
979  truncated = _bt_truncate(wstate->index, lastleft, oitup,
980  wstate->inskey);
981  truncsz = IndexTupleSize(truncated);
982  PageIndexTupleDelete(opage, P_HIKEY);
983  _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
984  pfree(truncated);
985 
986  /* oitup should continue to point to the page's high key */
987  hii = PageGetItemId(opage, P_HIKEY);
988  oitup = (IndexTuple) PageGetItem(opage, hii);
989  }
990 
991  /*
992  * Link the old page into its parent, using its minimum key. If we
993  * don't have a parent, we have to create one; this adds a new btree
994  * level.
995  */
996  if (state->btps_next == NULL)
997  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
998 
999  Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <=
1000  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1001  BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) ||
1002  P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1003  Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
1004  !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1005  BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
1006  _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
1007  pfree(state->btps_minkey);
1008 
1009  /*
1010  * Save a copy of the high key from the old page. It is also used as
1011  * the minimum key for the new page.
1012  */
1013  state->btps_minkey = CopyIndexTuple(oitup);
1014 
1015  /*
1016  * Set the sibling links for both pages.
1017  */
1018  {
1019  BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1020  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1021 
1022  oopaque->btpo_next = nblkno;
1023  nopaque->btpo_prev = oblkno;
1024  nopaque->btpo_next = P_NONE; /* redundant */
1025  }
1026 
1027  /*
1028  * Write out the old page. We never need to touch it again, so we can
1029  * free the opage workspace too.
1030  */
1031  _bt_blwritepage(wstate, opage, oblkno);
1032 
1033  /*
1034  * Reset last_off to point to new page
1035  */
1036  last_off = P_FIRSTKEY;
1037  }
1038 
1039  /*
1040  * By here, either original page is still the current page, or a new page
1041  * was created that became the current page. Either way, the current page
1042  * definitely has space for new item.
1043  *
1044  * If the new item is the first for its page, stash a copy for later. Note
1045  * this will only happen for the first item on a level; on later pages,
1046  * the first item for a page is copied from the prior page in the code
1047  * above. The minimum key for an entire level is nothing more than a
1048  * minus infinity (downlink only) pivot tuple placeholder.
1049  */
1050  if (last_off == P_HIKEY)
1051  {
1052  Assert(state->btps_minkey == NULL);
1053  state->btps_minkey = CopyIndexTuple(itup);
1054  /* _bt_sortaddtup() will perform full truncation later */
1055  BTreeTupleSetNAtts(state->btps_minkey, 0);
1056  }
1057 
1058  /*
1059  * Add the new item into the current page.
1060  */
1061  last_off = OffsetNumberNext(last_off);
1062  _bt_sortaddtup(npage, itupsz, itup, last_off);
1063 
1064  state->btps_page = npage;
1065  state->btps_blkno = nblkno;
1066  state->btps_lastoff = last_off;
1067 }
1068 
1069 /*
1070  * Finish writing out the completed btree.
1071  */
1072 static void
1073 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1074 {
1075  BTPageState *s;
1076  BlockNumber rootblkno = P_NONE;
1077  uint32 rootlevel = 0;
1078  Page metapage;
1079 
1080  /*
1081  * Each iteration of this loop completes one more level of the tree.
1082  */
1083  for (s = state; s != NULL; s = s->btps_next)
1084  {
1085  BlockNumber blkno;
1086  BTPageOpaque opaque;
1087 
1088  blkno = s->btps_blkno;
1089  opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1090 
1091  /*
1092  * We have to link the last page on this level to somewhere.
1093  *
1094  * If we're at the top, it's the root, so attach it to the metapage.
1095  * Otherwise, add an entry for it to its parent using its minimum key.
1096  * This may cause the last page of the parent level to split, but
1097  * that's not a problem -- we haven't gotten to it yet.
1098  */
1099  if (s->btps_next == NULL)
1100  {
1101  opaque->btpo_flags |= BTP_ROOT;
1102  rootblkno = blkno;
1103  rootlevel = s->btps_level;
1104  }
1105  else
1106  {
1107  Assert((BTreeTupleGetNAtts(s->btps_minkey, wstate->index) <=
1108  IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1109  BTreeTupleGetNAtts(s->btps_minkey, wstate->index) > 0) ||
1110  P_LEFTMOST(opaque));
1111  Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
1112  !P_LEFTMOST(opaque));
1113  BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
1114  _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
1115  pfree(s->btps_minkey);
1116  s->btps_minkey = NULL;
1117  }
1118 
1119  /*
1120  * This is the rightmost page, so the ItemId array needs to be slid
1121  * back one slot. Then we can dump out the page.
1122  */
1123  _bt_slideleft(s->btps_page);
1124  _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1125  s->btps_page = NULL; /* writepage freed the workspace */
1126  }
1127 
1128  /*
1129  * As the last step in the process, construct the metapage and make it
1130  * point to the new root (unless we had no data at all, in which case it's
1131  * set to point to "P_NONE"). This changes the index to the "valid" state
1132  * by filling in a valid magic number in the metapage.
1133  */
1134  metapage = (Page) palloc(BLCKSZ);
1135  _bt_initmetapage(metapage, rootblkno, rootlevel);
1136  _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1137 }
1138 
1139 /*
1140  * Read tuples in correct sort order from tuplesort, and load them into
1141  * btree leaves.
1142  */
1143 static void
1144 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1145 {
1146  BTPageState *state = NULL;
1147  bool merge = (btspool2 != NULL);
1148  IndexTuple itup,
1149  itup2 = NULL;
1150  bool load1;
1151  TupleDesc tupdes = RelationGetDescr(wstate->index);
1152  int i,
1153  keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1154  SortSupport sortKeys;
1155  int64 tuples_done = 0;
1156 
1157  if (merge)
1158  {
1159  /*
1160  * Another BTSpool for dead tuples exists. Now we have to merge
1161  * btspool and btspool2.
1162  */
1163 
1164  /* the preparation of merge */
1165  itup = tuplesort_getindextuple(btspool->sortstate, true);
1166  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1167 
1168  /* Prepare SortSupport data for each column */
1169  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1170 
1171  for (i = 0; i < keysz; i++)
1172  {
1173  SortSupport sortKey = sortKeys + i;
1174  ScanKey scanKey = wstate->inskey->scankeys + i;
1175  int16 strategy;
1176 
1177  sortKey->ssup_cxt = CurrentMemoryContext;
1178  sortKey->ssup_collation = scanKey->sk_collation;
1179  sortKey->ssup_nulls_first =
1180  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1181  sortKey->ssup_attno = scanKey->sk_attno;
1182  /* Abbreviation is not supported here */
1183  sortKey->abbreviate = false;
1184 
1185  AssertState(sortKey->ssup_attno != 0);
1186 
1187  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1188  BTGreaterStrategyNumber : BTLessStrategyNumber;
1189 
1190  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1191  }
1192 
1193  for (;;)
1194  {
1195  load1 = true; /* load BTSpool next ? */
1196  if (itup2 == NULL)
1197  {
1198  if (itup == NULL)
1199  break;
1200  }
1201  else if (itup != NULL)
1202  {
1203  int32 compare = 0;
1204 
1205  for (i = 1; i <= keysz; i++)
1206  {
1207  SortSupport entry;
1208  Datum attrDatum1,
1209  attrDatum2;
1210  bool isNull1,
1211  isNull2;
1212 
1213  entry = sortKeys + i - 1;
1214  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1215  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1216 
1217  compare = ApplySortComparator(attrDatum1, isNull1,
1218  attrDatum2, isNull2,
1219  entry);
1220  if (compare > 0)
1221  {
1222  load1 = false;
1223  break;
1224  }
1225  else if (compare < 0)
1226  break;
1227  }
1228 
1229  /*
1230  * If key values are equal, we sort on ItemPointer. This is
1231  * required for btree indexes, since heap TID is treated as an
1232  * implicit last key attribute in order to ensure that all
1233  * keys in the index are physically unique.
1234  */
1235  if (compare == 0)
1236  {
1237  compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1238  Assert(compare != 0);
1239  if (compare > 0)
1240  load1 = false;
1241  }
1242  }
1243  else
1244  load1 = false;
1245 
1246  /* When we see first tuple, create first index page */
1247  if (state == NULL)
1248  state = _bt_pagestate(wstate, 0);
1249 
1250  if (load1)
1251  {
1252  _bt_buildadd(wstate, state, itup);
1253  itup = tuplesort_getindextuple(btspool->sortstate, true);
1254  }
1255  else
1256  {
1257  _bt_buildadd(wstate, state, itup2);
1258  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1259  }
1260 
1261  /* Report progress */
1262  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1263  ++tuples_done);
1264  }
1265  pfree(sortKeys);
1266  }
1267  else
1268  {
1269  /* merge is unnecessary */
1270  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1271  true)) != NULL)
1272  {
1273  /* When we see first tuple, create first index page */
1274  if (state == NULL)
1275  state = _bt_pagestate(wstate, 0);
1276 
1277  _bt_buildadd(wstate, state, itup);
1278 
1279  /* Report progress */
1280  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1281  ++tuples_done);
1282  }
1283  }
1284 
1285  /* Close down final pages and write the metapage */
1286  _bt_uppershutdown(wstate, state);
1287 
1288  /*
1289  * If the index is WAL-logged, we must fsync it down to disk before it's
1290  * safe to commit the transaction. (For a non-WAL-logged index we don't
1291  * care since the index will be uninteresting after a crash anyway.)
1292  *
1293  * It's obvious that we must do this when not WAL-logging the build. It's
1294  * less obvious that we have to do it even if we did WAL-log the index
1295  * pages. The reason is that since we're building outside shared buffers,
1296  * a CHECKPOINT occurring during the build has no way to flush the
1297  * previously written data to disk (indeed it won't know the index even
1298  * exists). A crash later on would replay WAL from the checkpoint,
1299  * therefore it wouldn't replay our earlier WAL entries. If we do not
1300  * fsync those pages here, they might still not be on disk when the crash
1301  * occurs.
1302  */
1303  if (RelationNeedsWAL(wstate->index))
1304  {
1305  RelationOpenSmgr(wstate->index);
1306  smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1307  }
1308 }
1309 
1310 /*
1311  * Create parallel context, and launch workers for leader.
1312  *
1313  * buildstate argument should be initialized (with the exception of the
1314  * tuplesort state in spools, which may later be created based on shared
1315  * state initially set up here).
1316  *
1317  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1318  *
1319  * request is the target number of parallel worker processes to launch.
1320  *
1321  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1322  * mode by passing it to _bt_end_parallel() at the very end of its index
1323  * build. If not even a single worker process can be launched, this is
1324  * never set, and caller should proceed with a serial index build.
1325  */
1326 static void
1327 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1328 {
1329  ParallelContext *pcxt;
1330  int scantuplesortstates;
1331  Snapshot snapshot;
1332  Size estbtshared;
1333  Size estsort;
1334  BTShared *btshared;
1335  Sharedsort *sharedsort;
1336  Sharedsort *sharedsort2;
1337  BTSpool *btspool = buildstate->spool;
1338  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1339  bool leaderparticipates = true;
1340  char *sharedquery;
1341  int querylen;
1342 
1343 #ifdef DISABLE_LEADER_PARTICIPATION
1344  leaderparticipates = false;
1345 #endif
1346 
1347  /*
1348  * Enter parallel mode, and create context for parallel build of btree
1349  * index
1350  */
1351  EnterParallelMode();
1352  Assert(request > 0);
1353  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1354  request);
1355  scantuplesortstates = leaderparticipates ? request + 1 : request;
1356 
1357  /*
1358  * Prepare for scan of the base relation. In a normal index build, we use
1359  * SnapshotAny because we must retrieve all tuples and do our own time
1360  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1361  * concurrent build, we take a regular MVCC snapshot and index whatever's
1362  * live according to that.
1363  */
1364  if (!isconcurrent)
1365  snapshot = SnapshotAny;
1366  else
1367  snapshot = RegisterSnapshot(GetTransactionSnapshot());
1368 
1369  /*
1370  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1371  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1372  */
1373  estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1374  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1375  estsort = tuplesort_estimate_shared(scantuplesortstates);
1376  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1377 
1378  /*
1379  * Unique case requires a second spool, and so we may have to account for
1380  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1381  */
1382  if (!btspool->isunique)
1383  shm_toc_estimate_keys(&pcxt->estimator, 2);
1384  else
1385  {
1386  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1387  shm_toc_estimate_keys(&pcxt->estimator, 3);
1388  }
1389 
1390  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1391  querylen = strlen(debug_query_string);
1392  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1393  shm_toc_estimate_keys(&pcxt->estimator, 1);
1394 
1395  /* Everyone's had a chance to ask for space, so now create the DSM */
1396  InitializeParallelDSM(pcxt);
1397 
1398  /* Store shared build state, for which we reserved space */
1399  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1400  /* Initialize immutable state */
1401  btshared->heaprelid = RelationGetRelid(btspool->heap);
1402  btshared->indexrelid = RelationGetRelid(btspool->index);
1403  btshared->isunique = btspool->isunique;
1404  btshared->isconcurrent = isconcurrent;
1405  btshared->scantuplesortstates = scantuplesortstates;
1406  ConditionVariableInit(&btshared->workersdonecv);
1407  SpinLockInit(&btshared->mutex);
1408  /* Initialize mutable state */
1409  btshared->nparticipantsdone = 0;
1410  btshared->reltuples = 0.0;
1411  btshared->havedead = false;
1412  btshared->indtuples = 0.0;
1413  btshared->brokenhotchain = false;
1414  table_parallelscan_initialize(btspool->heap,
1415  ParallelTableScanFromBTShared(btshared),
1416  snapshot);
1417 
1418  /*
1419  * Store shared tuplesort-private state, for which we reserved space.
1420  * Then, initialize opaque state using tuplesort routine.
1421  */
1422  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1423  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1424  pcxt->seg);
1425 
1426  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1427  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1428 
1429  /* Unique case requires a second spool, and associated shared state */
1430  if (!btspool->isunique)
1431  sharedsort2 = NULL;
1432  else
1433  {
1434  /*
1435  * Store additional shared tuplesort-private state, for which we
1436  * reserved space. Then, initialize opaque state using tuplesort
1437  * routine.
1438  */
1439  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1440  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1441  pcxt->seg);
1442 
1443  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1444  }
1445 
1446  /* Store query string for workers */
1447  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1448  memcpy(sharedquery, debug_query_string, querylen + 1);
1449  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1450 
1451  /* Launch workers, saving status for leader/caller */
1452  LaunchParallelWorkers(pcxt);
1453  btleader->pcxt = pcxt;
1454  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1455  if (leaderparticipates)
1456  btleader->nparticipanttuplesorts++;
1457  btleader->btshared = btshared;
1458  btleader->sharedsort = sharedsort;
1459  btleader->sharedsort2 = sharedsort2;
1460  btleader->snapshot = snapshot;
1461 
1462  /* If no workers were successfully launched, back out (do serial build) */
1463  if (pcxt->nworkers_launched == 0)
1464  {
1465  _bt_end_parallel(btleader);
1466  return;
1467  }
1468 
1469  /* Save leader state now that it's clear build will be parallel */
1470  buildstate->btleader = btleader;
1471 
1472  /* Join heap scan ourselves */
1473  if (leaderparticipates)
1474  _bt_leader_participate_as_worker(buildstate);
1475 
1476  /*
1477  * Caller needs to wait for all launched workers when we return. Make
1478  * sure that the failure-to-start case will not hang forever.
1479  */
1480  WaitForParallelWorkersToAttach(pcxt);
1481 }
1482 
1483 /*
1484  * Shut down workers, destroy parallel context, and end parallel mode.
1485  */
1486 static void
1487 _bt_end_parallel(BTLeader *btleader)
1488 {
1489  /* Shutdown worker processes */
1490  WaitForParallelWorkersToFinish(btleader->pcxt);
1491  /* Free last reference to MVCC snapshot, if one was used */
1492  if (IsMVCCSnapshot(btleader->snapshot))
1493  UnregisterSnapshot(btleader->snapshot);
1494  DestroyParallelContext(btleader->pcxt);
1495  ExitParallelMode();
1496 }
1497 
1498 /*
1499  * Returns size of shared memory required to store state for a parallel
1500  * btree index build based on the snapshot its parallel scan will use.
1501  */
1502 static Size
1503 _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1504 {
1505  /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1506  return add_size(BUFFERALIGN(sizeof(BTShared)),
1507  table_parallelscan_estimate(heap, snapshot));
1508 }
1509 
1510 /*
1511  * Within leader, wait for end of heap scan.
1512  *
1513  * When called, parallel heap scan started by _bt_begin_parallel() will
1514  * already be underway within worker processes (when leader participates
1515  * as a worker, we should end up here just as workers are finishing).
1516  *
1517  * Fills in fields needed for ambuild statistics, and lets caller set
1518  * field indicating that some worker encountered a broken HOT chain.
1519  *
1520  * Returns the total number of heap tuples scanned.
1521  */
1522 static double
1523 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1524 {
1525  BTShared *btshared = buildstate->btleader->btshared;
1526  int nparticipanttuplesorts;
1527  double reltuples;
1528 
1529  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1530  for (;;)
1531  {
1532  SpinLockAcquire(&btshared->mutex);
1533  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1534  {
1535  buildstate->havedead = btshared->havedead;
1536  buildstate->indtuples = btshared->indtuples;
1537  *brokenhotchain = btshared->brokenhotchain;
1538  reltuples = btshared->reltuples;
1539  SpinLockRelease(&btshared->mutex);
1540  break;
1541  }
1542  SpinLockRelease(&btshared->mutex);
1543 
1544  ConditionVariableSleep(&btshared->workersdonecv,
1545  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1546  }
1547 
1548  ConditionVariableCancelSleep();
1549 
1550  return reltuples;
1551 }
1552 
1553 /*
1554  * Within leader, participate as a parallel worker.
1555  */
1556 static void
1557 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1558 {
1559  BTLeader *btleader = buildstate->btleader;
1560  BTSpool *leaderworker;
1561  BTSpool *leaderworker2;
1562  int sortmem;
1563 
1564  /* Allocate memory and initialize private spool */
1565  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1566  leaderworker->heap = buildstate->spool->heap;
1567  leaderworker->index = buildstate->spool->index;
1568  leaderworker->isunique = buildstate->spool->isunique;
1569 
1570  /* Initialize second spool, if required */
1571  if (!btleader->btshared->isunique)
1572  leaderworker2 = NULL;
1573  else
1574  {
1575  /* Allocate memory for worker's own private secondary spool */
1576  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1577 
1578  /* Initialize worker's own secondary spool */
1579  leaderworker2->heap = leaderworker->heap;
1580  leaderworker2->index = leaderworker->index;
1581  leaderworker2->isunique = false;
1582  }
1583 
1584  /*
1585  * Might as well use reliable figure when doling out maintenance_work_mem
1586  * (when requested number of workers were not launched, this will be
1587  * somewhat higher than it is for other workers).
1588  */
1589  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1590 
1591  /* Perform work common to all participants */
1592  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1593  btleader->sharedsort, btleader->sharedsort2,
1594  sortmem, true);
1595 
1596 #ifdef BTREE_BUILD_STATS
1597  if (log_btree_build_stats)
1598  {
1599  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1600  ResetUsage();
1601  }
1602 #endif /* BTREE_BUILD_STATS */
1603 }
1604 
1605 /*
1606  * Perform work within a launched parallel process.
1607  */
1608 void
1609 _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1610 {
1611  char *sharedquery;
1612  BTSpool *btspool;
1613  BTSpool *btspool2;
1614  BTShared *btshared;
1615  Sharedsort *sharedsort;
1616  Sharedsort *sharedsort2;
1617  Relation heapRel;
1618  Relation indexRel;
1619  LOCKMODE heapLockmode;
1620  LOCKMODE indexLockmode;
1621  int sortmem;
1622 
1623 #ifdef BTREE_BUILD_STATS
1624  if (log_btree_build_stats)
1625  ResetUsage();
1626 #endif /* BTREE_BUILD_STATS */
1627 
1628  /* Set debug_query_string for individual workers first */
1629  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
1630  debug_query_string = sharedquery;
1631 
1632  /* Report the query string from leader */
1633  pgstat_report_activity(STATE_RUNNING, debug_query_string);
1634 
1635  /* Look up nbtree shared state */
1636  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1637 
1638  /* Open relations using lock modes known to be obtained by index.c */
1639  if (!btshared->isconcurrent)
1640  {
1641  heapLockmode = ShareLock;
1642  indexLockmode = AccessExclusiveLock;
1643  }
1644  else
1645  {
1646  heapLockmode = ShareUpdateExclusiveLock;
1647  indexLockmode = RowExclusiveLock;
1648  }
1649 
1650  /* Open relations within worker */
1651  heapRel = table_open(btshared->heaprelid, heapLockmode);
1652  indexRel = index_open(btshared->indexrelid, indexLockmode);
1653 
1654  /* Initialize worker's own spool */
1655  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1656  btspool->heap = heapRel;
1657  btspool->index = indexRel;
1658  btspool->isunique = btshared->isunique;
1659 
1660  /* Look up shared state private to tuplesort.c */
1661  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1662  tuplesort_attach_shared(sharedsort, seg);
1663  if (!btshared->isunique)
1664  {
1665  btspool2 = NULL;
1666  sharedsort2 = NULL;
1667  }
1668  else
1669  {
1670  /* Allocate memory for worker's own private secondary spool */
1671  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1672 
1673  /* Initialize worker's own secondary spool */
1674  btspool2->heap = btspool->heap;
1675  btspool2->index = btspool->index;
1676  btspool2->isunique = false;
1677  /* Look up shared state private to tuplesort.c */
1678  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1679  tuplesort_attach_shared(sharedsort2, seg);
1680  }
1681 
1682  /* Perform sorting of spool, and possibly a spool2 */
1683  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1684  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1685  sharedsort2, sortmem, false);
1686 
1687 #ifdef BTREE_BUILD_STATS
1688  if (log_btree_build_stats)
1689  {
1690  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1691  ResetUsage();
1692  }
1693 #endif /* BTREE_BUILD_STATS */
1694 
1695  index_close(indexRel, indexLockmode);
1696  table_close(heapRel, heapLockmode);
1697 }
1698 
1699 /*
1700  * Perform a worker's portion of a parallel sort.
1701  *
1702  * This generates a tuplesort for passed btspool, and a second tuplesort
1703  * state if a second btspool is need (i.e. for unique index builds). All
1704  * other spool fields should already be set when this is called.
1705  *
1706  * sortmem is the amount of working memory to use within each worker,
1707  * expressed in KBs.
1708  *
1709  * When this returns, workers are done, and need only release resources.
1710  */
1711 static void
1712 _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1713  BTShared *btshared, Sharedsort *sharedsort,
1714  Sharedsort *sharedsort2, int sortmem, bool progress)
1715 {
1716  SortCoordinate coordinate;
1717  BTBuildState buildstate;
1718  TableScanDesc scan;
1719  double reltuples;
1720  IndexInfo *indexInfo;
1721 
1722  /* Initialize local tuplesort coordination state */
1723  coordinate = palloc0(sizeof(SortCoordinateData));
1724  coordinate->isWorker = true;
1725  coordinate->nParticipants = -1;
1726  coordinate->sharedsort = sharedsort;
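 /*
  * isWorker = true with nParticipants = -1 identifies this as a worker-side
  * coordinate; the leader's merge tuplesort is the one created with the
  * real participant count.
  */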
1727 
1728  /* Begin "partial" tuplesort */
1729  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1730  btspool->index,
1731  btspool->isunique,
1732  sortmem, coordinate,
1733  false);
1734 
1735  /*
1736  * Just as in the serial case, there may be a second spool. If so, a
1737  * second, dedicated spool2 partial tuplesort is required.
1738  */
1739  if (btspool2)
1740  {
1741  SortCoordinate coordinate2;
1742 
1743  /*
1744  * We expect that the second one (for dead tuples) won't get very
1745  * full, so we give it only work_mem (unless sortmem is less for the
1746  * worker). Worker processes are generally permitted to allocate
1747  * work_mem independently.
1748  */
1749  coordinate2 = palloc0(sizeof(SortCoordinateData));
1750  coordinate2->isWorker = true;
1751  coordinate2->nParticipants = -1;
1752  coordinate2->sharedsort = sharedsort2;
1753  btspool2->sortstate =
1754  tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1755  Min(sortmem, work_mem), coordinate2,
1756  false);
1757  }
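 /*
  * Illustration of the Min() above, under assumed settings: if work_mem is
  * at its 4096 kB default and sortmem came out to 21845 kB, spool2's sort
  * runs with 4096 kB; if sortmem were only 2048 kB, spool2 would get 2048.
  */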
1758 
1759  /* Fill in buildstate for _bt_build_callback() */
1760  buildstate.isunique = btshared->isunique;
1761  buildstate.havedead = false;
1762  buildstate.heap = btspool->heap;
1763  buildstate.spool = btspool;
1764  buildstate.spool2 = btspool2;
1765  buildstate.indtuples = 0;
1766  buildstate.btleader = NULL;
1767 
1768  /* Join parallel scan */
1769  indexInfo = BuildIndexInfo(btspool->index);
1770  indexInfo->ii_Concurrent = btshared->isconcurrent;
1771  scan = table_beginscan_parallel(btspool->heap,
1772  ParallelTableScanFromBTShared(btshared));
1773  reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1774  true, progress, _bt_build_callback,
1775  (void *) &buildstate, scan);
1776 
1777  /*
1778  * Execute this worker's part of the sort.
1779  *
1780  * Unlike the leader and serial cases, we cannot avoid calling
1781  * tuplesort_performsort() for spool2 if it ends up containing no dead
1782  * tuples (this is disallowed for workers by tuplesort).
1783  */
1784  tuplesort_performsort(btspool->sortstate);
1785  if (btspool2)
1786  tuplesort_performsort(btspool2->sortstate);
1787 
1788  /*
1789  * Done. Record ambuild statistics, and whether we encountered a broken
1790  * HOT chain.
1791  */
1792  SpinLockAcquire(&btshared->mutex);
1793  btshared->nparticipantsdone++;
1794  btshared->reltuples += reltuples;
1795  if (buildstate.havedead)
1796  btshared->havedead = true;
1797  btshared->indtuples += buildstate.indtuples;
1798  if (indexInfo->ii_BrokenHotChain)
1799  btshared->brokenhotchain = true;
1800  SpinLockRelease(&btshared->mutex);
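 /*
  * The leader's _bt_parallel_heapscan() sleeps on workersdonecv and
  * re-checks nparticipantsdone under this same mutex until every
  * participant has finished, so the updates above are what eventually let
  * it return.
  */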
1801 
1802  /* Notify leader */
1803  ConditionVariableSignal(&btshared->workersdonecv);
1804 
1805  /* We can end tuplesorts immediately */
1806  tuplesort_end(btspool->sortstate);
1807  if (btspool2)
1808  tuplesort_end(btspool2->sortstate);
1809 }
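/*
 * For reference, a minimal sketch (not the actual _bt_begin_parallel()
 * code) of how a leader arranges for the entry point above to run in
 * workers, assuming the standard parallel-context API; TOC estimation and
 * population steps are elided:
 *
 *   EnterParallelMode();
 *   pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
 *                                request);
 *   ... estimate and insert TOC entries for BTShared, the Sharedsort
 *   ... state(s), and the query text, then:
 *   InitializeParallelDSM(pcxt);
 *   LaunchParallelWorkers(pcxt);
 *   WaitForParallelWorkersToAttach(pcxt);
 */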