1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  * Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level. We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it to its
12  * parent level (starting a new parent level if necessary). When
13  * done, we write out each final page on each level, adding it to
14  * its parent level. When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * This code is moderately slow (~10% slower) compared to the regular
18  * btree (insertion) build code on sorted or well-clustered data. On
19  * random data, however, the insertion build code is unusable -- the
20  * difference on a 60MB heap is a factor of 15 because the random
21  * probes into the btree thrash the buffer pool. (NOTE: the above
22  * "10%" estimate is probably obsolete, since it refers to an old and
23  * not very good external sort implementation that used to exist in
24  * this module. tuplesort.c is almost certainly faster.)
25  *
26  * It is not wise to pack the pages entirely full, since then *any*
27  * insertion would cause a split (and not only of the leaf page; the need
28  * for a split would cascade right up the tree). The steady-state load
29  * factor for btrees is usually estimated at 70%. We choose to pack leaf
30  * pages to the user-controllable fill factor (default 90%) while upper pages
31  * are always packed to 70%. This gives us reasonable density (there aren't
32  * many upper pages if the keys are reasonable-size) without risking a lot of
33  * cascading splits during early insertions.
34  *
35  * Formerly the index pages being built were kept in shared buffers, but
36  * that is of no value (since other backends have no interest in them yet)
37  * and it created locking problems for CHECKPOINT, because the upper-level
38  * pages were held exclusive-locked for long periods. Now we just build
39  * the pages in local memory and smgrwrite or smgrextend them as we finish
40  * them. They will need to be re-read into shared buffers on first use after
41  * the build finishes.
42  *
43  * Since the index will never be used unless it is completely built,
44  * from a crash-recovery point of view there is no need to WAL-log the
45  * steps of the build. After completing the index build, we can just sync
46  * the whole file to disk using smgrimmedsync() before exiting this module.
47  * This can be seen to be sufficient for crash recovery by considering that
48  * it's effectively equivalent to what would happen if a CHECKPOINT occurred
49  * just after the index build. However, it is clearly not sufficient if the
50  * DBA is using the WAL log for PITR or replication purposes, since another
51  * machine would not be able to reconstruct the index from WAL. Therefore,
52  * we log the completed index pages to WAL if and only if WAL archiving is
53  * active.
54  *
55  * This code isn't concerned about the FSM at all. The caller is responsible
56  * for initializing that.
57  *
58  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
59  * Portions Copyright (c) 1994, Regents of the University of California
60  *
61  * IDENTIFICATION
62  * src/backend/access/nbtree/nbtsort.c
63  *
64  *-------------------------------------------------------------------------
65  */
66 
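/*
 * Overview of the flow through this module: btbuild() creates spool(s) via
 * _bt_spools_heapscan(), which may launch a parallel heap scan;
 * _bt_leafbuild() then performs the sort(s) and hands off to _bt_load(),
 * which feeds tuples to _bt_buildadd() and finally _bt_uppershutdown() to
 * finish the upper levels and write the metapage.
 */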
67 #include "postgres.h"
68 
69 #include "access/nbtree.h"
70 #include "access/parallel.h"
71 #include "access/relscan.h"
72 #include "access/xact.h"
73 #include "access/xlog.h"
74 #include "access/xloginsert.h"
75 #include "catalog/index.h"
76 #include "miscadmin.h"
77 #include "pgstat.h"
78 #include "storage/smgr.h"
79 #include "tcop/tcopprot.h" /* pgrminclude ignore */
80 #include "utils/rel.h"
81 #include "utils/sortsupport.h"
82 #include "utils/tuplesort.h"
83 
84 
85 /* Magic numbers for parallel state sharing */
86 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
87 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
88 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
89 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
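/*
 * The keys above are arbitrary but must be distinct.  _bt_begin_parallel()
 * stores one shared-memory chunk under each key with shm_toc_insert(), and
 * worker processes locate them again with shm_toc_lookup() in
 * _bt_parallel_build_main().
 */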
90 
91 /*
92  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
93  * parallel index builds. This may be useful as a debugging aid.
94 #undef DISABLE_LEADER_PARTICIPATION
95  */
96 
97 /*
98  * Status record for spooling/sorting phase. (Note we may have two of
99  * these due to the special requirements for uniqueness-checking with
100  * dead tuples.)
101  */
102 typedef struct BTSpool
103 {
104  Tuplesortstate *sortstate; /* state data for tuplesort.c */
105  Relation heap;
106  Relation index;
107  bool isunique;
108 } BTSpool;
109 
110 /*
111  * Status for index builds performed in parallel. This is allocated in a
112  * dynamic shared memory segment. Note that there is a separate tuplesort TOC
113  * entry, private to tuplesort.c but allocated by this module on its behalf.
114  */
115 typedef struct BTShared
116 {
117  /*
118  * These fields are not modified during the sort. They primarily exist
119  * for the benefit of worker processes that need to create BTSpool state
120  * corresponding to that used by the leader.
121  */
122  Oid heaprelid;
123  Oid indexrelid;
124  bool isunique;
125  bool isconcurrent;
126  int scantuplesortstates;
127 
128  /*
129  * workersdonecv is used to monitor the progress of workers. All parallel
130  * participants must indicate that they are done before leader can use
131  * mutable state that workers maintain during scan (and before leader can
132  * proceed to tuplesort_performsort()).
133  */
134  ConditionVariable workersdonecv;
135
136  /*
137  * mutex protects all fields before heapdesc.
138  *
139  * These fields contain status information of interest to B-Tree index
140  * builds that must work just the same when an index is built in parallel.
141  */
142  slock_t mutex;
143
144  /*
145  * Mutable state that is maintained by workers, and reported back to
146  * leader at end of parallel scan.
147  *
148  * nparticipantsdone is number of worker processes finished.
149  *
150  * reltuples is the total number of input heap tuples.
151  *
152  * havedead indicates if RECENTLY_DEAD tuples were encountered during
153  * build.
154  *
155  * indtuples is the total number of tuples that made it into the index.
156  *
157  * brokenhotchain indicates if any worker detected a broken HOT chain
158  * during build.
159  */
160  int nparticipantsdone;
161  double reltuples;
162  bool havedead;
163  double indtuples;
164  bool brokenhotchain;
165
166  /*
167  * This variable-sized field must come last.
168  *
169  * See _bt_parallel_estimate_shared().
170  */
171  ParallelHeapScanDescData heapdesc;
172 } BTShared;
173 
174 /*
175  * Status for leader in parallel index build.
176  */
177 typedef struct BTLeader
178 {
179  /* parallel context itself */
180  ParallelContext *pcxt;
181
182  /*
183  * nparticipanttuplesorts is the exact number of worker processes
184  * successfully launched, plus one leader process if it participates as a
185  * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
186  * participating as a worker).
187  */
188  int nparticipanttuplesorts;
189
190  /*
191  * Leader process convenience pointers to shared state (leader avoids TOC
192  * lookups).
193  *
194  * btshared is the shared state for entire build. sharedsort is the
195  * shared, tuplesort-managed state passed to each process tuplesort.
196  * sharedsort2 is the corresponding btspool2 shared state, used only when
197  * building unique indexes. snapshot is the snapshot used by the scan iff
198  * an MVCC snapshot is required.
199  */
200  BTShared *btshared;
201  Sharedsort *sharedsort;
202  Sharedsort *sharedsort2;
203  Snapshot snapshot;
204 } BTLeader;
205 
206 /*
207  * Working state for btbuild and its callback.
208  *
209  * When parallel CREATE INDEX is used, there is a BTBuildState for each
210  * participant.
211  */
212 typedef struct BTBuildState
213 {
214  bool isunique;
215  bool havedead;
216  Relation heap;
217  BTSpool *spool;
218
219  /*
220  * spool2 is needed only when the index is a unique index. Dead tuples are
221  * put into spool2 instead of spool in order to avoid the uniqueness check.
222  */
223  BTSpool *spool2;
224  double indtuples;
225 
226  /*
227  * btleader is only present when a parallel index build is performed, and
228  * only in the leader process. (Actually, only the leader has a
229  * BTBuildState. Workers have their own spool and spool2, though.)
230  */
231  BTLeader *btleader;
232 } BTBuildState;
233 
234 /*
235  * Status record for a btree page being built. We have one of these
236  * for each active tree level.
237  *
238  * The reason we need to store a copy of the minimum key is that we'll
239  * need to propagate it to the parent node when this page is linked
240  * into its parent. However, if the page is not a leaf page, the first
241  * entry on the page doesn't need to contain a key, so we will not have
242  * stored the key itself on the page. (You might think we could skip
243  * copying the minimum key on leaf pages, but actually we must have a
244  * writable copy anyway because we'll poke the page's address into it
245  * before passing it up to the parent...)
246  */
247 typedef struct BTPageState
248 {
249  Page btps_page; /* workspace for page building */
250  BlockNumber btps_blkno; /* block # to write this page at */
251  IndexTuple btps_minkey; /* copy of minimum key (first item) on page */
252  OffsetNumber btps_lastoff; /* last item offset loaded */
253  uint32 btps_level; /* tree level (0 = leaf) */
254  Size btps_full; /* "full" if less than this much free space */
255  struct BTPageState *btps_next; /* link to parent level, if any */
256 } BTPageState;
257 
258 /*
259  * Overall status record for index writing phase.
260  */
261 typedef struct BTWriteState
262 {
263  Relation heap;
264  Relation index;
265  bool btws_use_wal; /* dump pages to WAL? */
266  BlockNumber btws_pages_alloced; /* # pages allocated */
267  BlockNumber btws_pages_written; /* # pages written out */
268  Page btws_zeropage; /* workspace for filling zeroes */
269 } BTWriteState;
270 
271 
272 static double _bt_spools_heapscan(Relation heap, Relation index,
273  BTBuildState *buildstate, IndexInfo *indexInfo);
274 static void _bt_spooldestroy(BTSpool *btspool);
275 static void _bt_spool(BTSpool *btspool, ItemPointer self,
276  Datum *values, bool *isnull);
277 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
278 static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
279  bool *isnull, bool tupleIsAlive, void *state);
280 static Page _bt_blnewpage(uint32 level);
281 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
282 static void _bt_slideleft(Page page);
283 static void _bt_sortaddtup(Page page, Size itemsize,
284  IndexTuple itup, OffsetNumber itup_off);
285 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
286  IndexTuple itup);
287 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
288 static void _bt_load(BTWriteState *wstate,
289  BTSpool *btspool, BTSpool *btspool2);
290 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
291  int request);
292 static void _bt_end_parallel(BTLeader *btleader);
293 static Size _bt_parallel_estimate_shared(Snapshot snapshot);
294 static double _bt_parallel_heapscan(BTBuildState *buildstate,
295  bool *brokenhotchain);
296 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
297 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
298  BTShared *btshared, Sharedsort *sharedsort,
299  Sharedsort *sharedsort2, int sortmem);
300 
301 
302 /*
303  * btbuild() -- build a new btree index.
304  */
305 IndexBuildResult *
306 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
307 {
308  IndexBuildResult *result;
309  BTBuildState buildstate;
310  double reltuples;
311 
312 #ifdef BTREE_BUILD_STATS
313  if (log_btree_build_stats)
314  ResetUsage();
315 #endif /* BTREE_BUILD_STATS */
316 
317  buildstate.isunique = indexInfo->ii_Unique;
318  buildstate.havedead = false;
319  buildstate.heap = heap;
320  buildstate.spool = NULL;
321  buildstate.spool2 = NULL;
322  buildstate.indtuples = 0;
323  buildstate.btleader = NULL;
324 
325  /*
326  * We expect to be called exactly once for any index relation. If that's
327  * not the case, big trouble's what we have.
328  */
329  if (RelationGetNumberOfBlocks(index) != 0)
330  elog(ERROR, "index \"%s\" already contains data",
331  RelationGetRelationName(index));
332 
333  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
334 
335  /*
336  * Finish the build by (1) completing the sort of the spool file, (2)
337  * inserting the sorted tuples into btree pages and (3) building the upper
338  * levels. Finally, it may also be necessary to end use of parallelism.
339  */
340  _bt_leafbuild(buildstate.spool, buildstate.spool2);
341  _bt_spooldestroy(buildstate.spool);
342  if (buildstate.spool2)
343  _bt_spooldestroy(buildstate.spool2);
344  if (buildstate.btleader)
345  _bt_end_parallel(buildstate.btleader);
346 
347  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
348 
349  result->heap_tuples = reltuples;
350  result->index_tuples = buildstate.indtuples;
351 
352 #ifdef BTREE_BUILD_STATS
353  if (log_btree_build_stats)
354  {
355  ShowUsage("BTREE BUILD STATS");
356  ResetUsage();
357  }
358 #endif /* BTREE_BUILD_STATS */
359 
360  return result;
361 }
362 
363 /*
364  * Create and initialize one or two spool structures, and save them in caller's
365  * buildstate argument. May also fill in fields within indexInfo used by index
366  * builds.
367  *
368  * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
369  * routine encapsulates all aspects of managing parallelism. Caller need only
370  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
371  *
372  * Returns the total number of heap tuples scanned.
373  */
374 static double
375 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
376  IndexInfo *indexInfo)
377 {
378  BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
379  SortCoordinate coordinate = NULL;
380  double reltuples = 0;
381 
382  /*
383  * We size the sort area as maintenance_work_mem rather than work_mem to
384  * speed index creation. This should be OK since a single backend can't
385  * run multiple index creations in parallel (see also: notes on
386  * parallelism and maintenance_work_mem below).
387  */
388  btspool->heap = heap;
389  btspool->index = index;
390  btspool->isunique = indexInfo->ii_Unique;
391 
392  /* Save as primary spool */
393  buildstate->spool = btspool;
394 
395  /* Attempt to launch parallel worker scan when required */
396  if (indexInfo->ii_ParallelWorkers > 0)
397  _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
398  indexInfo->ii_ParallelWorkers);
399 
400  /*
401  * If parallel build requested and at least one worker process was
402  * successfully launched, set up coordination state
403  */
404  if (buildstate->btleader)
405  {
406  coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
407  coordinate->isWorker = false;
408  coordinate->nParticipants =
409  buildstate->btleader->nparticipanttuplesorts;
410  coordinate->sharedsort = buildstate->btleader->sharedsort;
411  }
412 
413  /*
414  * Begin serial/leader tuplesort.
415  *
416  * In cases where parallelism is involved, the leader receives the same
417  * share of maintenance_work_mem as a serial sort (it is generally treated
418  * in the same way as a serial sort once we return). Parallel worker
419  * Tuplesortstates will have received only a fraction of
420  * maintenance_work_mem, though.
421  *
422  * We rely on the lifetime of the Leader Tuplesortstate almost not
423  * overlapping with any worker Tuplesortstate's lifetime. There may be
424  * some small overlap, but that's okay because we rely on leader
425  * Tuplesortstate only allocating a small, fixed amount of memory here.
426  * When its tuplesort_performsort() is called (by our caller), and
427  * significant amounts of memory are likely to be used, all workers must
428  * have already freed almost all memory held by their Tuplesortstates
429  * (they are about to go away completely, too). The overall effect is
430  * that maintenance_work_mem always represents an absolute high watermark
431  * on the amount of memory used by a CREATE INDEX operation, regardless of
432  * the use of parallelism or any other factor.
433  */
434  buildstate->spool->sortstate =
435  tuplesort_begin_index_btree(heap, index, buildstate->isunique,
436  maintenance_work_mem, coordinate,
437  false);
438 
439  /*
440  * If building a unique index, put dead tuples in a second spool to keep
441  * them out of the uniqueness check. We expect that the second spool (for
442  * dead tuples) won't get very full, so we give it only work_mem.
443  */
444  if (indexInfo->ii_Unique)
445  {
446  BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
447  SortCoordinate coordinate2 = NULL;
448 
449  /* Initialize secondary spool */
450  btspool2->heap = heap;
451  btspool2->index = index;
452  btspool2->isunique = false;
453  /* Save as secondary spool */
454  buildstate->spool2 = btspool2;
455 
456  if (buildstate->btleader)
457  {
458  /*
459  * Set up non-private state that is passed to
460  * tuplesort_begin_index_btree() about the basic high level
461  * coordination of a parallel sort.
462  */
463  coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
464  coordinate2->isWorker = false;
465  coordinate2->nParticipants =
466  buildstate->btleader->nparticipanttuplesorts;
467  coordinate2->sharedsort = buildstate->btleader->sharedsort2;
468  }
469 
470  /*
471  * We expect that the second one (for dead tuples) won't get very
472  * full, so we give it only work_mem
473  */
474  buildstate->spool2->sortstate =
475  tuplesort_begin_index_btree(heap, index, false, work_mem,
476  coordinate2, false);
477  }
478 
479  /* Fill spool using either serial or parallel heap scan */
480  if (!buildstate->btleader)
481  reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
482  _bt_build_callback, (void *) buildstate,
483  NULL);
484  else
485  reltuples = _bt_parallel_heapscan(buildstate,
486  &indexInfo->ii_BrokenHotChain);
487 
488  /* okay, all heap tuples are spooled */
489  if (buildstate->spool2 && !buildstate->havedead)
490  {
491  /* spool2 turns out to be unnecessary */
492  _bt_spooldestroy(buildstate->spool2);
493  buildstate->spool2 = NULL;
494  }
495 
496  return reltuples;
497 }
498 
499 /*
500  * clean up a spool structure and its substructures.
501  */
502 static void
503 _bt_spooldestroy(BTSpool *btspool)
504 {
505  tuplesort_end(btspool->sortstate);
506  pfree(btspool);
507 }
508 
509 /*
510  * spool an index entry into the sort file.
511  */
512 static void
513 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
514 {
515  tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
516  self, values, isnull);
517 }
518 
519 /*
520  * given a spool loaded by successive calls to _bt_spool,
521  * create an entire btree.
522  */
523 static void
524 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
525 {
526  BTWriteState wstate;
527 
528 #ifdef BTREE_BUILD_STATS
529  if (log_btree_build_stats)
530  {
531  ShowUsage("BTREE BUILD (Spool) STATISTICS");
532  ResetUsage();
533  }
534 #endif /* BTREE_BUILD_STATS */
535 
536  tuplesort_performsort(btspool->sortstate);
537  if (btspool2)
538  tuplesort_performsort(btspool2->sortstate);
539 
540  wstate.heap = btspool->heap;
541  wstate.index = btspool->index;
542 
543  /*
544  * We need to log index creation in WAL iff WAL archiving/streaming is
545  * enabled UNLESS the index isn't WAL-logged anyway.
546  */
547  wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
548 
549  /* reserve the metapage */
550  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
551  wstate.btws_pages_written = 0;
552  wstate.btws_zeropage = NULL; /* until needed */
553 
554  _bt_load(&wstate, btspool, btspool2);
555 }
556 
557 /*
558  * Per-tuple callback from IndexBuildHeapScan
559  */
560 static void
561 _bt_build_callback(Relation index,
562  HeapTuple htup,
563  Datum *values,
564  bool *isnull,
565  bool tupleIsAlive,
566  void *state)
567 {
568  BTBuildState *buildstate = (BTBuildState *) state;
569 
570  /*
571  * insert the index tuple into the appropriate spool file for subsequent
572  * processing
573  */
574  if (tupleIsAlive || buildstate->spool2 == NULL)
575  _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
576  else
577  {
578  /* dead tuples are put into spool2 */
579  buildstate->havedead = true;
580  _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
581  }
582 
583  buildstate->indtuples += 1;
584 }
585 
586 /*
587  * allocate workspace for a new, clean btree page, not linked to any siblings.
588  */
589 static Page
590 _bt_blnewpage(uint32 level)
591 {
592  Page page;
593  BTPageOpaque opaque;
594 
595  page = (Page) palloc(BLCKSZ);
596 
597  /* Zero the page and set up standard page header info */
598  _bt_pageinit(page, BLCKSZ);
599 
600  /* Initialize BT opaque state */
601  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
602  opaque->btpo_prev = opaque->btpo_next = P_NONE;
603  opaque->btpo.level = level;
604  opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
605  opaque->btpo_cycleid = 0;
606 
607  /* Make the P_HIKEY line pointer appear allocated */
608  ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
609 
610  return page;
611 }
612 
613 /*
614  * emit a completed btree page, and release the working storage.
615  */
616 static void
617 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
618 {
619  /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
620  RelationOpenSmgr(wstate->index);
621 
622  /* XLOG stuff */
623  if (wstate->btws_use_wal)
624  {
625  /* We use the heap NEWPAGE record type for this */
626  log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
627  }
628 
629  /*
630  * If we have to write pages nonsequentially, fill in the space with
631  * zeroes until we come back and overwrite. This is not logically
632  * necessary on standard Unix filesystems (unwritten space will read as
633  * zeroes anyway), but it should help to avoid fragmentation. The dummy
634  * pages aren't WAL-logged though.
635  */
636  while (blkno > wstate->btws_pages_written)
637  {
638  if (!wstate->btws_zeropage)
639  wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
640  /* don't set checksum for all-zero page */
641  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
642  wstate->btws_pages_written++,
643  (char *) wstate->btws_zeropage,
644  true);
645  }
646 
647  PageSetChecksumInplace(page, blkno);
648 
649  /*
650  * Now write the page. There's no need for smgr to schedule an fsync for
651  * this write; we'll do it ourselves before ending the build.
652  */
653  if (blkno == wstate->btws_pages_written)
654  {
655  /* extending the file... */
656  smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
657  (char *) page, true);
658  wstate->btws_pages_written++;
659  }
660  else
661  {
662  /* overwriting a block we zero-filled before */
663  smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
664  (char *) page, true);
665  }
666 
667  pfree(page);
668 }
669 
670 /*
671  * allocate and initialize a new BTPageState. the returned structure
672  * is suitable for immediate use by _bt_buildadd.
673  */
674 static BTPageState *
675 _bt_pagestate(BTWriteState *wstate, uint32 level)
676 {
677  BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
678 
679  /* create initial page for level */
680  state->btps_page = _bt_blnewpage(level);
681 
682  /* and assign it a page position */
683  state->btps_blkno = wstate->btws_pages_alloced++;
684 
685  state->btps_minkey = NULL;
686  /* initialize lastoff so first item goes into P_FIRSTKEY */
687  state->btps_lastoff = P_HIKEY;
688  state->btps_level = level;
689  /* set "full" threshold based on level. See notes at head of file. */
690  if (level > 0)
691  state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
692  else
693  state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
694  BTREE_DEFAULT_FILLFACTOR);
695  /* no parent level, yet */
696  state->btps_next = NULL;
697 
698  return state;
699 }
700 
701 /*
702  * slide an array of ItemIds back one slot (from P_FIRSTKEY to
703  * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
704  * that we have built an ItemId array in what has turned out to be a
705  * P_RIGHTMOST page.
706  */
707 static void
708 _bt_slideleft(Page page)
709 {
710  OffsetNumber off;
711  OffsetNumber maxoff;
712  ItemId previi;
713  ItemId thisii;
714 
715  if (!PageIsEmpty(page))
716  {
717  maxoff = PageGetMaxOffsetNumber(page);
718  previi = PageGetItemId(page, P_HIKEY);
719  for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
720  {
721  thisii = PageGetItemId(page, off);
722  *previi = *thisii;
723  previi = thisii;
724  }
725  ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
726  }
727 }
728 
729 /*
730  * Add an item to a page being built.
731  *
732  * The main difference between this routine and a bare PageAddItem call
733  * is that this code knows that the leftmost data item on a non-leaf
734  * btree page doesn't need to have a key. Therefore, it strips such
735  * items down to just the item header.
736  *
737  * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
738  * that because it assumes that P_RIGHTMOST() will return the correct
739  * answer for the page. Here, we don't know yet if the page will be
740  * rightmost. Offset P_FIRSTKEY is always the first data key.
741  */
742 static void
743 _bt_sortaddtup(Page page,
744  Size itemsize,
745  IndexTuple itup,
746  OffsetNumber itup_off)
747 {
748  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
749  IndexTupleData trunctuple;
750 
751  if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
752  {
753  trunctuple = *itup;
754  trunctuple.t_info = sizeof(IndexTupleData);
755  BTreeTupleSetNAtts(&trunctuple, 0);
756  itup = &trunctuple;
757  itemsize = sizeof(IndexTupleData);
758  }
759 
760  if (PageAddItem(page, (Item) itup, itemsize, itup_off,
761  false, false) == InvalidOffsetNumber)
762  elog(ERROR, "failed to add item to the index page");
763 }
764 
765 /*----------
766  * Add an item to a disk page from the sort output.
767  *
768  * We must be careful to observe the page layout conventions of nbtsearch.c:
769  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
770  * - on non-leaf pages, the key portion of the first item need not be
771  * stored, we should store only the link.
772  *
773  * A leaf page being built looks like:
774  *
775  * +----------------+---------------------------------+
776  * | PageHeaderData | linp0 linp1 linp2 ... |
777  * +-----------+----+---------------------------------+
778  * | ... linpN | |
779  * +-----------+--------------------------------------+
780  * | ^ last |
781  * | |
782  * +-------------+------------------------------------+
783  * | | itemN ... |
784  * +-------------+------------------+-----------------+
785  * | ... item3 item2 item1 | "special space" |
786  * +--------------------------------+-----------------+
787  *
788  * Contrast this with the diagram in bufpage.h; note the mismatch
789  * between linps and items. This is because we reserve linp0 as a
790  * placeholder for the pointer to the "high key" item; when we have
791  * filled up the page, we will set linp0 to point to itemN and clear
792  * linpN. On the other hand, if we find this is the last (rightmost)
793  * page, we leave the items alone and slide the linp array over. If
794  * the high key is to be truncated, offset 1 is deleted, and we insert
795  * the truncated high key at offset 1.
796  *
797  * 'last' pointer indicates the last offset added to the page.
798  *----------
799  */
800 static void
801 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
802 {
803  Page npage;
804  BlockNumber nblkno;
805  OffsetNumber last_off;
806  Size pgspc;
807  Size itupsz;
808  int indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
809  int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
810 
811  /*
812  * This is a handy place to check for cancel interrupts during the btree
813  * load phase of index creation.
814  */
815  CHECK_FOR_INTERRUPTS();
816
817  npage = state->btps_page;
818  nblkno = state->btps_blkno;
819  last_off = state->btps_lastoff;
820 
821  pgspc = PageGetFreeSpace(npage);
822  itupsz = IndexTupleSize(itup);
823  itupsz = MAXALIGN(itupsz);
824 
825  /*
826  * Check whether the item can fit on a btree page at all. (Eventually, we
827  * ought to try to apply TOAST methods if not.) We actually need to be
828  * able to fit three items on every page, so restrict any one item to 1/3
829  * the per-page available space. Note that at this point, itupsz doesn't
830  * include the ItemId.
831  *
832  * NOTE: similar code appears in _bt_insertonpg() to defend against
833  * oversize items being inserted into an already-existing index. But
834  * during creation of an index, we don't go through there.
835  */
836  if (itupsz > BTMaxItemSize(npage))
837  ereport(ERROR,
838  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
839  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
840  itupsz, BTMaxItemSize(npage),
841  RelationGetRelationName(wstate->index)),
842  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
843  "Consider a function index of an MD5 hash of the value, "
844  "or use full text indexing."),
845  errtableconstraint(wstate->heap,
846  RelationGetRelationName(wstate->index))));
847 
848  /*
849  * Check to see if page is "full". It's definitely full if the item won't
850  * fit. Otherwise, compare to the target freespace derived from the
851  * fillfactor. However, we must put at least two items on each page, so
852  * disregard fillfactor if we don't have that many.
853  */
854  if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
855  {
856  /*
857  * Finish off the page and write it out.
858  */
859  Page opage = npage;
860  BlockNumber oblkno = nblkno;
861  ItemId ii;
862  ItemId hii;
863  IndexTuple oitup;
864  BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
865
866  /* Create new page of same level */
867  npage = _bt_blnewpage(state->btps_level);
868 
869  /* and assign it a page position */
870  nblkno = wstate->btws_pages_alloced++;
871 
872  /*
873  * We copy the last item on the page into the new page, and then
874  * rearrange the old page so that the 'last item' becomes its high key
875  * rather than a true data item. There had better be at least two
876  * items on the page already, else the page would be empty of useful
877  * data.
878  */
879  Assert(last_off > P_FIRSTKEY);
880  ii = PageGetItemId(opage, last_off);
881  oitup = (IndexTuple) PageGetItem(opage, ii);
882  _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
883 
884  /*
885  * Move 'last' into the high key position on opage
886  */
887  hii = PageGetItemId(opage, P_HIKEY);
888  *hii = *ii;
889  ItemIdSetUnused(ii); /* redundant */
890  ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
891 
892  if (indnkeyatts != indnatts && P_ISLEAF(opageop))
893  {
894  IndexTuple truncated;
895  Size truncsz;
896 
897  /*
898  * Truncate any non-key attributes from high key on leaf level
899  * (i.e. truncate on leaf level if we're building an INCLUDE
900  * index). This is only done at the leaf level because
901  * downlinks in internal pages are either negative infinity
902  * items, or get their contents from copying from one level
903  * down. See also: _bt_split().
904  *
905  * Since the truncated tuple is probably smaller than the
906  * original, it cannot just be copied in place (besides, we want
907  * to actually save space on the leaf page). We delete the
908  * original high key, and add our own truncated high key at the
909  * same offset.
910  *
911  * Note that the page layout won't be changed very much. oitup
912  * is already located at the physical beginning of tuple space,
913  * so we only shift the line pointer array back and forth, and
914  * overwrite the latter portion of the space occupied by the
915  * original tuple. This is fairly cheap.
916  */
917  truncated = _bt_nonkey_truncate(wstate->index, oitup);
918  truncsz = IndexTupleSize(truncated);
919  PageIndexTupleDelete(opage, P_HIKEY);
920  _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
921  pfree(truncated);
922 
923  /* oitup should continue to point to the page's high key */
924  hii = PageGetItemId(opage, P_HIKEY);
925  oitup = (IndexTuple) PageGetItem(opage, hii);
926  }
927 
928  /*
929  * Link the old page into its parent, using its minimum key. If we
930  * don't have a parent, we have to create one; this adds a new btree
931  * level.
932  */
933  if (state->btps_next == NULL)
934  state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
935 
936  Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
937  IndexRelationGetNumberOfKeyAttributes(wstate->index));
938  BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
939  _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
940  pfree(state->btps_minkey);
941 
942  /*
943  * Save a copy of the minimum key for the new page. We have to copy
944  * it off the old page, not the new one, in case we are not at leaf
945  * level.
946  */
947  state->btps_minkey = CopyIndexTuple(oitup);
948 
949  /*
950  * Set the sibling links for both pages.
951  */
952  {
953  BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
954  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
955
956  oopaque->btpo_next = nblkno;
957  nopaque->btpo_prev = oblkno;
958  nopaque->btpo_next = P_NONE; /* redundant */
959  }
960 
961  /*
962  * Write out the old page. We never need to touch it again, so we can
963  * free the opage workspace too.
964  */
965  _bt_blwritepage(wstate, opage, oblkno);
966 
967  /*
968  * Reset last_off to point to new page
969  */
970  last_off = P_FIRSTKEY;
971  }
972 
973  /*
974  * If the new item is the first for its page, stash a copy for later. Note
975  * this will only happen for the first item on a level; on later pages,
976  * the first item for a page is copied from the prior page in the code
977  * above.
978  */
979  if (last_off == P_HIKEY)
980  {
981  BTPageOpaque npageop;
982 
983  Assert(state->btps_minkey == NULL);
984 
985  npageop = (BTPageOpaque) PageGetSpecialPointer(npage);
986 
987  /*
988  * Truncate included attributes of the tuple that we're going to
989  * insert into the parent page as a downlink
990  */
991  if (indnkeyatts != indnatts && P_ISLEAF(npageop))
992  state->btps_minkey = _bt_nonkey_truncate(wstate->index, itup);
993  else
994  state->btps_minkey = CopyIndexTuple(itup);
995  }
996 
997  /*
998  * Add the new item into the current page.
999  */
1000  last_off = OffsetNumberNext(last_off);
1001  _bt_sortaddtup(npage, itupsz, itup, last_off);
1002 
1003  state->btps_page = npage;
1004  state->btps_blkno = nblkno;
1005  state->btps_lastoff = last_off;
1006 }
1007 
1008 /*
1009  * Finish writing out the completed btree.
1010  */
1011 static void
1012 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1013 {
1014  BTPageState *s;
1015  BlockNumber rootblkno = P_NONE;
1016  uint32 rootlevel = 0;
1017  Page metapage;
1018 
1019  /*
1020  * Each iteration of this loop completes one more level of the tree.
1021  */
1022  for (s = state; s != NULL; s = s->btps_next)
1023  {
1024  BlockNumber blkno;
1025  BTPageOpaque opaque;
1026 
1027  blkno = s->btps_blkno;
1028  opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1029
1030  /*
1031  * We have to link the last page on this level to somewhere.
1032  *
1033  * If we're at the top, it's the root, so attach it to the metapage.
1034  * Otherwise, add an entry for it to its parent using its minimum key.
1035  * This may cause the last page of the parent level to split, but
1036  * that's not a problem -- we haven't gotten to it yet.
1037  */
1038  if (s->btps_next == NULL)
1039  {
1040  opaque->btpo_flags |= BTP_ROOT;
1041  rootblkno = blkno;
1042  rootlevel = s->btps_level;
1043  }
1044  else
1045  {
1046  Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
1047  IndexRelationGetNumberOfKeyAttributes(wstate->index));
1048  BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
1049  _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
1050  pfree(s->btps_minkey);
1051  s->btps_minkey = NULL;
1052  }
1053 
1054  /*
1055  * This is the rightmost page, so the ItemId array needs to be slid
1056  * back one slot. Then we can dump out the page.
1057  */
1058  _bt_slideleft(s->btps_page);
1059  _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1060  s->btps_page = NULL; /* writepage freed the workspace */
1061  }
1062 
1063  /*
1064  * As the last step in the process, construct the metapage and make it
1065  * point to the new root (unless we had no data at all, in which case it's
1066  * set to point to "P_NONE"). This changes the index to the "valid" state
1067  * by filling in a valid magic number in the metapage.
1068  */
1069  metapage = (Page) palloc(BLCKSZ);
1070  _bt_initmetapage(metapage, rootblkno, rootlevel);
1071  _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1072 }
1073 
1074 /*
1075  * Read tuples in correct sort order from tuplesort, and load them into
1076  * btree leaves.
1077  */
1078 static void
1079 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1080 {
1081  BTPageState *state = NULL;
1082  bool merge = (btspool2 != NULL);
1083  IndexTuple itup,
1084  itup2 = NULL;
1085  bool load1;
1086  TupleDesc tupdes = RelationGetDescr(wstate->index);
1087  int i,
1088  keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1089  ScanKey indexScanKey = NULL;
1090  SortSupport sortKeys;
1091 
1092  if (merge)
1093  {
1094  /*
1095  * Another BTSpool for dead tuples exists. Now we have to merge
1096  * btspool and btspool2.
1097  */
1098 
1099  /* the preparation of merge */
1100  itup = tuplesort_getindextuple(btspool->sortstate, true);
1101  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1102  indexScanKey = _bt_mkscankey_nodata(wstate->index);
1103 
1104  /* Prepare SortSupport data for each column */
1105  sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1106 
1107  for (i = 0; i < keysz; i++)
1108  {
1109  SortSupport sortKey = sortKeys + i;
1110  ScanKey scanKey = indexScanKey + i;
1111  int16 strategy;
1112 
1113  sortKey->ssup_cxt = CurrentMemoryContext;
1114  sortKey->ssup_collation = scanKey->sk_collation;
1115  sortKey->ssup_nulls_first =
1116  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1117  sortKey->ssup_attno = scanKey->sk_attno;
1118  /* Abbreviation is not supported here */
1119  sortKey->abbreviate = false;
1120 
1121  AssertState(sortKey->ssup_attno != 0);
1122 
1123  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1124  BTGreaterStrategyNumber : BTLessStrategyNumber;
1125
1126  PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1127  }
1128 
1129  _bt_freeskey(indexScanKey);
1130 
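 /*
  * Merge the two sorted streams: each iteration emits the smaller of the
  * current tuples from btspool and btspool2 (ties favor btspool), so the
  * combined output remains in index order.
  */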
1131  for (;;)
1132  {
1133  load1 = true; /* load BTSpool next ? */
1134  if (itup2 == NULL)
1135  {
1136  if (itup == NULL)
1137  break;
1138  }
1139  else if (itup != NULL)
1140  {
1141  for (i = 1; i <= keysz; i++)
1142  {
1143  SortSupport entry;
1144  Datum attrDatum1,
1145  attrDatum2;
1146  bool isNull1,
1147  isNull2;
1148  int32 compare;
1149 
1150  entry = sortKeys + i - 1;
1151  attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1152  attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1153 
1154  compare = ApplySortComparator(attrDatum1, isNull1,
1155  attrDatum2, isNull2,
1156  entry);
1157  if (compare > 0)
1158  {
1159  load1 = false;
1160  break;
1161  }
1162  else if (compare < 0)
1163  break;
1164  }
1165  }
1166  else
1167  load1 = false;
1168 
1169  /* When we see first tuple, create first index page */
1170  if (state == NULL)
1171  state = _bt_pagestate(wstate, 0);
1172 
1173  if (load1)
1174  {
1175  _bt_buildadd(wstate, state, itup);
1176  itup = tuplesort_getindextuple(btspool->sortstate, true);
1177  }
1178  else
1179  {
1180  _bt_buildadd(wstate, state, itup2);
1181  itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1182  }
1183  }
1184  pfree(sortKeys);
1185  }
1186  else
1187  {
1188  /* merge is unnecessary */
1189  while ((itup = tuplesort_getindextuple(btspool->sortstate,
1190  true)) != NULL)
1191  {
1192  /* When we see first tuple, create first index page */
1193  if (state == NULL)
1194  state = _bt_pagestate(wstate, 0);
1195 
1196  _bt_buildadd(wstate, state, itup);
1197  }
1198  }
1199 
1200  /* Close down final pages and write the metapage */
1201  _bt_uppershutdown(wstate, state);
1202 
1203  /*
1204  * If the index is WAL-logged, we must fsync it down to disk before it's
1205  * safe to commit the transaction. (For a non-WAL-logged index we don't
1206  * care since the index will be uninteresting after a crash anyway.)
1207  *
1208  * It's obvious that we must do this when not WAL-logging the build. It's
1209  * less obvious that we have to do it even if we did WAL-log the index
1210  * pages. The reason is that since we're building outside shared buffers,
1211  * a CHECKPOINT occurring during the build has no way to flush the
1212  * previously written data to disk (indeed it won't know the index even
1213  * exists). A crash later on would replay WAL from the checkpoint,
1214  * therefore it wouldn't replay our earlier WAL entries. If we do not
1215  * fsync those pages here, they might still not be on disk when the crash
1216  * occurs.
1217  */
1218  if (RelationNeedsWAL(wstate->index))
1219  {
1220  RelationOpenSmgr(wstate->index);
1221  smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1222  }
1223 }
1224 
1225 /*
1226  * Create parallel context, and launch workers for leader.
1227  *
1228  * buildstate argument should be initialized (with the exception of the
1229  * tuplesort state in spools, which may later be created based on shared
1230  * state initially set up here).
1231  *
1232  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1233  *
1234  * request is the target number of parallel worker processes to launch.
1235  *
1236  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1237  * mode by passing it to _bt_end_parallel() at the very end of its index
1238  * build. If not even a single worker process can be launched, this is
1239  * never set, and caller should proceed with a serial index build.
1240  */
1241 static void
1242 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1243 {
1244  ParallelContext *pcxt;
1245  int scantuplesortstates;
1246  Snapshot snapshot;
1247  Size estbtshared;
1248  Size estsort;
1249  BTShared *btshared;
1250  Sharedsort *sharedsort;
1251  Sharedsort *sharedsort2;
1252  BTSpool *btspool = buildstate->spool;
1253  BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1254  bool leaderparticipates = true;
1255  char *sharedquery;
1256  int querylen;
1257 
1258 #ifdef DISABLE_LEADER_PARTICIPATION
1259  leaderparticipates = false;
1260 #endif
1261 
1262  /*
1263  * Enter parallel mode, and create context for parallel build of btree
1264  * index
1265  */
1266  EnterParallelMode();
1267  Assert(request > 0);
1268  pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1269  request, true);
1270  scantuplesortstates = leaderparticipates ? request + 1 : request;
1271 
1272  /*
1273  * Prepare for scan of the base relation. In a normal index build, we use
1274  * SnapshotAny because we must retrieve all tuples and do our own time
1275  * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1276  * concurrent build, we take a regular MVCC snapshot and index whatever's
1277  * live according to that.
1278  */
1279  if (!isconcurrent)
1280  snapshot = SnapshotAny;
1281  else
1282  snapshot = RegisterSnapshot(GetTransactionSnapshot());
1283
1284  /*
1285  * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1286  * PARALLEL_KEY_TUPLESORT tuplesort workspace
1287  */
1288  estbtshared = _bt_parallel_estimate_shared(snapshot);
1289  shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1290  estsort = tuplesort_estimate_shared(scantuplesortstates);
1291  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1292 
1293  /*
1294  * Unique case requires a second spool, and so we may have to account for
1295  * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1296  */
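 /*
  * That is, two TOC keys (BTREE_SHARED and TUPLESORT) in the plain case,
  * or three when a second spool's sort state must also be shared.
  */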
1297  if (!btspool->isunique)
1298  shm_toc_estimate_keys(&pcxt->estimator, 2);
1299  else
1300  {
1301  shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1302  shm_toc_estimate_keys(&pcxt->estimator, 3);
1303  }
1304 
1305  /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1306  querylen = strlen(debug_query_string);
1307  shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1308  shm_toc_estimate_keys(&pcxt->estimator, 1);
1309 
1310  /* Everyone's had a chance to ask for space, so now create the DSM */
1311  InitializeParallelDSM(pcxt);
1312 
1313  /* Store shared build state, for which we reserved space */
1314  btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1315  /* Initialize immutable state */
1316  btshared->heaprelid = RelationGetRelid(btspool->heap);
1317  btshared->indexrelid = RelationGetRelid(btspool->index);
1318  btshared->isunique = btspool->isunique;
1319  btshared->isconcurrent = isconcurrent;
1320  btshared->scantuplesortstates = scantuplesortstates;
1321  ConditionVariableInit(&btshared->workersdonecv);
1322  SpinLockInit(&btshared->mutex);
1323  /* Initialize mutable state */
1324  btshared->nparticipantsdone = 0;
1325  btshared->reltuples = 0.0;
1326  btshared->havedead = false;
1327  btshared->indtuples = 0.0;
1328  btshared->brokenhotchain = false;
1329  heap_parallelscan_initialize(&btshared->heapdesc, btspool->heap, snapshot);
1330 
1331  /*
1332  * Store shared tuplesort-private state, for which we reserved space.
1333  * Then, initialize opaque state using tuplesort routine.
1334  */
1335  sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1336  tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1337  pcxt->seg);
1338 
1339  shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1340  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1341 
1342  /* Unique case requires a second spool, and associated shared state */
1343  if (!btspool->isunique)
1344  sharedsort2 = NULL;
1345  else
1346  {
1347  /*
1348  * Store additional shared tuplesort-private state, for which we
1349  * reserved space. Then, initialize opaque state using tuplesort
1350  * routine.
1351  */
1352  sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1353  tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1354  pcxt->seg);
1355 
1356  shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1357  }
1358 
1359  /* Store query string for workers */
1360  sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1361  memcpy(sharedquery, debug_query_string, querylen + 1);
1362  shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1363 
1364  /* Launch workers, saving status for leader/caller */
1365  LaunchParallelWorkers(pcxt);
1366  btleader->pcxt = pcxt;
1367  btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1368  if (leaderparticipates)
1369  btleader->nparticipanttuplesorts++;
1370  btleader->btshared = btshared;
1371  btleader->sharedsort = sharedsort;
1372  btleader->sharedsort2 = sharedsort2;
1373  btleader->snapshot = snapshot;
1374 
1375  /* If no workers were successfully launched, back out (do serial build) */
1376  if (pcxt->nworkers_launched == 0)
1377  {
1378  _bt_end_parallel(btleader);
1379  return;
1380  }
1381 
1382  /* Save leader state now that it's clear build will be parallel */
1383  buildstate->btleader = btleader;
1384 
1385  /* Join heap scan ourselves */
1386  if (leaderparticipates)
1387  _bt_leader_participate_as_worker(buildstate);
1388
1389  /*
1390  * Caller needs to wait for all launched workers when we return. Make
1391  * sure that the failure-to-start case will not hang forever.
1392  */
1393  WaitForParallelWorkersToAttach(pcxt);
1394 }
1395 
1396 /*
1397  * Shut down workers, destroy parallel context, and end parallel mode.
1398  */
1399 static void
1400 _bt_end_parallel(BTLeader *btleader)
1401 {
1402  /* Shutdown worker processes */
1403  WaitForParallelWorkersToFinish(btleader->pcxt);
1404  /* Free last reference to MVCC snapshot, if one was used */
1405  if (IsMVCCSnapshot(btleader->snapshot))
1406  UnregisterSnapshot(btleader->snapshot);
1407  DestroyParallelContext(btleader->pcxt);
1408  ExitParallelMode();
1409 }
1410 
1411 /*
1412  * Returns size of shared memory required to store state for a parallel
1413  * btree index build based on the snapshot its parallel scan will use.
1414  */
1415 static Size
1416 _bt_parallel_estimate_shared(Snapshot snapshot)
1417 {
1418  if (!IsMVCCSnapshot(snapshot))
1419  {
1420  Assert(snapshot == SnapshotAny);
1421  return sizeof(BTShared);
1422  }
1423 
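 /*
  * An MVCC snapshot gets serialized into the flexible array at the end of
  * heapdesc (phs_snapshot_data), so the estimate is the offset of that
  * array within BTShared plus the snapshot's serialized size.
  */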
1424  return add_size(offsetof(BTShared, heapdesc) +
1425  offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1426  EstimateSnapshotSpace(snapshot));
1427 }
1428 
1429 /*
1430  * Within leader, wait for end of heap scan.
1431  *
1432  * When called, parallel heap scan started by _bt_begin_parallel() will
1433  * already be underway within worker processes (when leader participates
1434  * as a worker, we should end up here just as workers are finishing).
1435  *
1436  * Fills in fields needed for ambuild statistics, and lets caller set
1437  * field indicating that some worker encountered a broken HOT chain.
1438  *
1439  * Returns the total number of heap tuples scanned.
1440  */
1441 static double
1442 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1443 {
1444  BTShared *btshared = buildstate->btleader->btshared;
1445  int nparticipanttuplesorts;
1446  double reltuples;
1447 
1448  nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
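 /*
  * Poll the shared counter under the mutex until every participant has
  * reported completion; between checks, sleep on workersdonecv rather than
  * busy-waiting.
  */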
1449  for (;;)
1450  {
1451  SpinLockAcquire(&btshared->mutex);
1452  if (btshared->nparticipantsdone == nparticipanttuplesorts)
1453  {
1454  buildstate->havedead = btshared->havedead;
1455  buildstate->indtuples = btshared->indtuples;
1456  *brokenhotchain = btshared->brokenhotchain;
1457  reltuples = btshared->reltuples;
1458  SpinLockRelease(&btshared->mutex);
1459  break;
1460  }
1461  SpinLockRelease(&btshared->mutex);
1462 
1463  ConditionVariableSleep(&btshared->workersdonecv,
1464  WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1465  }
1466 
1467  ConditionVariableCancelSleep();
1468
1469  return reltuples;
1470 }
1471 
1472 /*
1473  * Within leader, participate as a parallel worker.
1474  */
1475 static void
1476 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1477 {
1478  BTLeader *btleader = buildstate->btleader;
1479  BTSpool *leaderworker;
1480  BTSpool *leaderworker2;
1481  int sortmem;
1482 
1483  /* Allocate memory and initialize private spool */
1484  leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1485  leaderworker->heap = buildstate->spool->heap;
1486  leaderworker->index = buildstate->spool->index;
1487  leaderworker->isunique = buildstate->spool->isunique;
1488 
1489  /* Initialize second spool, if required */
1490  if (!btleader->btshared->isunique)
1491  leaderworker2 = NULL;
1492  else
1493  {
1494  /* Allocate memory for worker's own private secondary spool */
1495  leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1496 
1497  /* Initialize worker's own secondary spool */
1498  leaderworker2->heap = leaderworker->heap;
1499  leaderworker2->index = leaderworker->index;
1500  leaderworker2->isunique = false;
1501  }
1502 
1503  /*
1504  * Might as well use reliable figure when doling out maintenance_work_mem
1505  * (when requested number of workers were not launched, this will be
1506  * somewhat higher than it is for other workers).
1507  */
1508  sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1509 
1510  /* Perform work common to all participants */
1511  _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1512  btleader->sharedsort, btleader->sharedsort2,
1513  sortmem);
1514 
1515 #ifdef BTREE_BUILD_STATS
1516  if (log_btree_build_stats)
1517  {
1518  ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1519  ResetUsage();
1520  }
1521 #endif /* BTREE_BUILD_STATS */
1522 }
1523 
1524 /*
1525  * Perform work within a launched parallel process.
1526  */
1527 void
1528 _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1529 {
1530  char *sharedquery;
1531  BTSpool *btspool;
1532  BTSpool *btspool2;
1533  BTShared *btshared;
1534  Sharedsort *sharedsort;
1535  Sharedsort *sharedsort2;
1536  Relation heapRel;
1537  Relation indexRel;
1538  LOCKMODE heapLockmode;
1539  LOCKMODE indexLockmode;
1540  int sortmem;
1541 
1542 #ifdef BTREE_BUILD_STATS
1543  if (log_btree_build_stats)
1544  ResetUsage();
1545 #endif /* BTREE_BUILD_STATS */
1546 
1547  /* Set debug_query_string for individual workers first */
1548  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
1549  debug_query_string = sharedquery;
1550 
1551  /* Report the query string from leader */
1552  pgstat_report_activity(STATE_RUNNING, debug_query_string);
1553
1554  /* Look up nbtree shared state */
1555  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1556 
1557  /* Open relations using lock modes known to be obtained by index.c */
1558  if (!btshared->isconcurrent)
1559  {
1560  heapLockmode = ShareLock;
1561  indexLockmode = AccessExclusiveLock;
1562  }
1563  else
1564  {
1565  heapLockmode = ShareUpdateExclusiveLock;
1566  indexLockmode = RowExclusiveLock;
1567  }
1568 
1569  /* Open relations within worker */
1570  heapRel = heap_open(btshared->heaprelid, heapLockmode);
1571  indexRel = index_open(btshared->indexrelid, indexLockmode);
1572 
1573  /* Initialize worker's own spool */
1574  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1575  btspool->heap = heapRel;
1576  btspool->index = indexRel;
1577  btspool->isunique = btshared->isunique;
1578 
1579  /* Look up shared state private to tuplesort.c */
1580  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1581  tuplesort_attach_shared(sharedsort, seg);
1582  if (!btshared->isunique)
1583  {
1584  btspool2 = NULL;
1585  sharedsort2 = NULL;
1586  }
1587  else
1588  {
1589  /* Allocate memory for worker's own private secondary spool */
1590  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1591 
1592  /* Initialize worker's own secondary spool */
1593  btspool2->heap = btspool->heap;
1594  btspool2->index = btspool->index;
1595  btspool2->isunique = false;
1596  /* Look up shared state private to tuplesort.c */
1597  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1598  tuplesort_attach_shared(sharedsort2, seg);
1599  }
1600 
1601  /* Perform sorting of spool, and possibly a spool2 */
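 /*
  * Each participant gets an equal slice of maintenance_work_mem.  For
  * example (hypothetical settings), 64MB split across two workers plus the
  * leader gives each sort roughly 21MB.
  */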
1602  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1603  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1604  sharedsort2, sortmem);
1605 
1606 #ifdef BTREE_BUILD_STATS
1607  if (log_btree_build_stats)
1608  {
1609  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1610  ResetUsage();
1611  }
1612 #endif /* BTREE_BUILD_STATS */
1613 
1614  index_close(indexRel, indexLockmode);
1615  heap_close(heapRel, heapLockmode);
1616 }
1617 
1618 /*
1619  * Perform a worker's portion of a parallel sort.
1620  *
1621  * This generates a tuplesort for passed btspool, and a second tuplesort
1622  * state if a second btspool is need (i.e. for unique index builds). All
1623  * other spool fields should already be set when this is called.
1624  *
1625  * sortmem is the amount of working memory to use within each worker,
1626  * expressed in KBs.
1627  *
1628  * When this returns, workers are done, and need only release resources.
1629  */
1630 static void
1631 _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1632  BTShared *btshared, Sharedsort *sharedsort,
1633  Sharedsort *sharedsort2, int sortmem)
1634 {
1635  SortCoordinate coordinate;
1636  BTBuildState buildstate;
1637  HeapScanDesc scan;
1638  double reltuples;
1639  IndexInfo *indexInfo;
1640 
1641  /* Initialize local tuplesort coordination state */
1642  coordinate = palloc0(sizeof(SortCoordinateData));
1643  coordinate->isWorker = true;
1644  coordinate->nParticipants = -1;
1645  coordinate->sharedsort = sharedsort;
1646 
1647  /* Begin "partial" tuplesort */
1648  btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1649  btspool->index,
1650  btspool->isunique,
1651  sortmem, coordinate,
1652  false);
1653 
1654  /*
1655  * Just as with serial case, there may be a second spool. If so, a
1656  * second, dedicated spool2 partial tuplesort is required.
1657  */
1658  if (btspool2)
1659  {
1660  SortCoordinate coordinate2;
1661 
1662  /*
1663  * We expect that the second one (for dead tuples) won't get very
1664  * full, so we give it only work_mem (unless sortmem is less for
1665  * worker). Worker processes are generally permitted to allocate
1666  * work_mem independently.
1667  */
1668  coordinate2 = palloc0(sizeof(SortCoordinateData));
1669  coordinate2->isWorker = true;
1670  coordinate2->nParticipants = -1;
1671  coordinate2->sharedsort = sharedsort2;
1672  btspool2->sortstate =
1673  tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1674  Min(sortmem, work_mem), coordinate2,
1675  false);
1676  }
1677 
1678  /* Fill in buildstate for _bt_build_callback() */
1679  buildstate.isunique = btshared->isunique;
1680  buildstate.havedead = false;
1681  buildstate.heap = btspool->heap;
1682  buildstate.spool = btspool;
1683  buildstate.spool2 = btspool2;
1684  buildstate.indtuples = 0;
1685  buildstate.btleader = NULL;
1686 
1687  /* Join parallel scan */
1688  indexInfo = BuildIndexInfo(btspool->index);
1689  indexInfo->ii_Concurrent = btshared->isconcurrent;
1690  scan = heap_beginscan_parallel(btspool->heap, &btshared->heapdesc);
1691  reltuples = IndexBuildHeapScan(btspool->heap, btspool->index, indexInfo,
1692  true, _bt_build_callback,
1693  (void *) &buildstate, scan);
1694 
1695  /*
1696  * Execute this worker's part of the sort.
1697  *
1698  * Unlike leader and serial cases, we cannot avoid calling
1699  * tuplesort_performsort() for spool2 if it ends up containing no dead
1700  * tuples (this is disallowed for workers by tuplesort).
1701  */
1702  tuplesort_performsort(btspool->sortstate);
1703  if (btspool2)
1704  tuplesort_performsort(btspool2->sortstate);
1705 
1706  /*
1707  * Done. Record ambuild statistics, and whether we encountered a broken
1708  * HOT chain.
1709  */
1710  SpinLockAcquire(&btshared->mutex);
1711  btshared->nparticipantsdone++;
1712  btshared->reltuples += reltuples;
1713  if (buildstate.havedead)
1714  btshared->havedead = true;
1715  btshared->indtuples += buildstate.indtuples;
1716  if (indexInfo->ii_BrokenHotChain)
1717  btshared->brokenhotchain = true;
1718  SpinLockRelease(&btshared->mutex);
1719 
1720  /* Notify leader */
1721  ConditionVariableSignal(&btshared->workersdonecv);
1722 
1723  /* We can end tuplesorts immediately */
1724  tuplesort_end(btspool->sortstate);
1725  if (btspool2)
1726  tuplesort_end(btspool2->sortstate);
1727 }
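
The listing above relies on a convention that is easy to miss when reading the worker path in isolation: a worker always marks its SortCoordinate with isWorker = true and nParticipants = -1, while only the leader (which later waits for the workers and merges their runs) supplies the real participant count. The fragment below is a minimal, self-contained sketch of that convention, not PostgreSQL code: SortCoordinateData is reduced to the fields used in this file, Sharedsort is treated as an opaque type, and make_worker_coordinate()/make_leader_coordinate() (including the leader_participates parameter) are hypothetical helpers for illustration only.

/*
 * Sketch of the leader/worker SortCoordinate convention seen in
 * _bt_parallel_scan_and_sort().  Stand-in types only; the real
 * SortCoordinateData lives in tuplesort.h.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Sharedsort Sharedsort;   /* opaque shared sort state */

typedef struct SortCoordinateData
{
    bool        isWorker;       /* true in a parallel worker */
    int         nParticipants;  /* -1 in workers; real count in leader */
    Sharedsort *sharedsort;     /* shared state all participants attach to */
} SortCoordinateData, *SortCoordinate;

/* Hypothetical helper: how a worker fills in its coordination state. */
static SortCoordinate
make_worker_coordinate(Sharedsort *sharedsort)
{
    SortCoordinate coordinate = calloc(1, sizeof(SortCoordinateData));

    coordinate->isWorker = true;
    coordinate->nParticipants = -1;     /* workers never know the total */
    coordinate->sharedsort = sharedsort;
    return coordinate;
}

/* Hypothetical helper: how the leader fills in its coordination state. */
static SortCoordinate
make_leader_coordinate(Sharedsort *sharedsort, int nworkers_launched,
                       bool leader_participates)
{
    SortCoordinate coordinate = calloc(1, sizeof(SortCoordinateData));

    coordinate->isWorker = false;
    coordinate->nParticipants =
        nworkers_launched + (leader_participates ? 1 : 0);
    coordinate->sharedsort = sharedsort;
    return coordinate;
}

int
main(void)
{
    SortCoordinate w = make_worker_coordinate(NULL);
    SortCoordinate l = make_leader_coordinate(NULL, 2, true);

    printf("worker: isWorker=%d nParticipants=%d\n", w->isWorker, w->nParticipants);
    printf("leader: isWorker=%d nParticipants=%d\n", l->isWorker, l->nParticipants);
    free(w);
    free(l);
    return 0;
}

Leaving nParticipants at -1 in workers is consistent with the division of labor above: a worker only has to attach to the shared sort state it was handed and report completion, while the leader alone needs to know how many tuplesort states contributed runs before it can merge them.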