PostgreSQL Source Code  git master
gistbuild.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  * build algorithm for GiST indexes implementation.
5  *
6  * There are two different strategies:
7  *
8  * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
9  * order, and create downlinks and internal pages as we go. This builds
10  * the index from the bottom up, similar to how B-tree index build
11  * works.
12  *
13  * 2. Start with an empty index, and insert all tuples one by one.
14  *
15  * The sorted method is used if the operator classes for all columns have
16  * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
17  *
18  * The second strategy can optionally use buffers at different levels of
19  * the tree to reduce I/O, see "Buffering build algorithm" in the README
20  * for a more detailed explanation. It initially calls insert over and
21  * over, but switches to the buffered algorithm after a certain number of
22  * tuples (unless buffering mode is disabled).
23  *
24  *
25  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
26  * Portions Copyright (c) 1994, Regents of the University of California
27  *
28  * IDENTIFICATION
29  * src/backend/access/gist/gistbuild.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <math.h>
36 
37 #include "access/genam.h"
38 #include "access/gist_private.h"
39 #include "access/gistxlog.h"
40 #include "access/tableam.h"
41 #include "access/xloginsert.h"
42 #include "catalog/index.h"
43 #include "miscadmin.h"
44 #include "optimizer/optimizer.h"
45 #include "storage/bufmgr.h"
46 #include "storage/smgr.h"
47 #include "utils/memutils.h"
48 #include "utils/rel.h"
49 #include "utils/tuplesort.h"
50 
51 /* Step of index tuples for check whether to switch to buffering build mode */
52 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
53 
54 /*
55  * Number of tuples to process in the slow way before switching to buffering
56  * mode, when buffering is explicitly turned on. Also, the number of tuples
57  * to process between readjusting the buffer size parameter, while in
58  * buffering mode.
59  */
60 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
61 
62 /*
63  * Strategy used to build the index. It can change between the
64  * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
65  * that needs to be decided up-front and cannot be changed afterwards.
66  */
67 typedef enum
68 {
69  GIST_SORTED_BUILD, /* bottom-up build by sorting */
70  GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
71  * switch */
72  GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
73  * buffering build mode if the index grows too
74  * big */
75  GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
76  * before switching to the buffering build
77  * mode */
78  GIST_BUFFERING_ACTIVE /* in buffering build mode */
/* NOTE(review): extraction dropped doxygen line 79 here — presumably the
 * enum's closing "} GistBuildMode;". Confirm against upstream gistbuild.c. */
80 
81 /* Working state for gistbuild and its callback */
82 typedef struct
83 {
/* NOTE(review): several struct members were lost in extraction (doxygen lines
 * 84-86, 90, 100-101, 108-114): presumably indexrel/heaprel/giststate,
 * buildMode, gfbb/parentMap, and the sorted-build page bookkeeping fields
 * referenced later (pages_allocated, pages_written, ready_num_pages,
 * ready_blknos). Verify against upstream before relying on this text. */
87 
88  Size freespace; /* amount of free space to leave on pages */
89 
91 
92  int64 indtuples; /* number of tuples indexed */
93 
94  /*
95  * Extra data structures used during a buffering build. 'gfbb' contains
96  * information related to managing the build buffers. 'parentMap' is a
97  * lookup table of the parent of each internal page.
98  */
99  int64 indtuplesSize; /* total size of all indexed tuples */
102 
103  /*
104  * Extra data structures used during a sorting build.
105  */
106  Tuplesortstate *sortstate; /* state data for tuplesort.c */
107 
110 
113  Page ready_pages[XLR_MAX_BLOCK_ID];
115 
116 #define GIST_SORTED_BUILD_PAGE_NUM 4
117 
118 /*
119  * In sorted build, we use a stack of these structs, one for each level,
120  * to hold an in-memory buffer of last pages at the level.
121  *
122  * Sorting GiST build requires good linearization of the sort opclass. This is
123  * not always the case in multidimensional data. To tackle the anomalies, we
124  * buffer index tuples and apply picksplit that can be multidimension-aware.
125  */
/* NOTE(review): the struct tag line (126) and most members of
 * GistSortedBuildLevelState (128-129, 131-132) were dropped by extraction;
 * later code references ->pages[], ->current_page and ->last_blkno. */
127 {
130  struct GistSortedBuildLevelState *parent; /* Upper level, if any */
133 
134 /* prototypes for private functions */
135 
/* NOTE(review): several prototype lines below are truncated — the leading
 * "static ... name(" portions of gistSortedBuildCallback,
 * gist_indexsortbuild*, gistbufferinginserttuples and gistBufferingFindCorrectParent
 * were lost in extraction. */
137  Datum *values, bool *isnull,
138  bool tupleIsAlive, void *state);
141  GistSortedBuildLevelState *levelstate,
142  IndexTuple itup);
144  GistSortedBuildLevelState *levelstate);
146 
147 static void gistInitBuffering(GISTBuildState *buildstate);
148 static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
149 static void gistBuildCallback(Relation index,
150  ItemPointer tid,
151  Datum *values,
152  bool *isnull,
153  bool tupleIsAlive,
154  void *state);
155 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
156  IndexTuple itup);
157 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
158  BlockNumber startblkno, int startlevel);
160  Buffer buffer, int level,
161  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
162  BlockNumber parentblk, OffsetNumber downlinkoffnum);
164  BlockNumber childblkno, int level,
165  BlockNumber *parentblk,
166  OffsetNumber *downlinkoffnum);
167 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
168 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
169 static int gistGetMaxLevel(Relation index);
170 
171 static void gistInitParentMap(GISTBuildState *buildstate);
172 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
174 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
175 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
176 
177 
178 /*
179  * Main entry point to GiST index build.
180  */
/* NOTE(review): the function signature (doxygen lines 181-182) was lost in
 * extraction — presumably "IndexBuildResult *gistbuild(Relation heap,
 * Relation index, IndexInfo *indexInfo)", the standard ambuild signature.
 * Confirm against upstream. */
183 {
184  IndexBuildResult *result;
185  double reltuples;
186  GISTBuildState buildstate;
/* NOTE(review): a declaration line (187) is missing here — presumably the
 * MemoryContext oldcxt used at the MemoryContextSwitchTo() near the end. */
188  int fillfactor;
189  Oid SortSupportFnOids[INDEX_MAX_KEYS];
190  GiSTOptions *options = (GiSTOptions *) index->rd_options;
191 
192  /*
193  * We expect to be called exactly once for any index relation. If that's
194  * not the case, big trouble's what we have.
195  */
/* NOTE(review): the emptiness check (196) and the elog argument line (198)
 * were dropped — presumably RelationGetNumberOfBlocks(index) != 0 guarding
 * the elog, and RelationGetRelationName(index) as its argument. */
197  elog(ERROR, "index \"%s\" already contains data",
199 
200  buildstate.indexrel = index;
201  buildstate.heaprel = heap;
202  buildstate.sortstate = NULL;
203  buildstate.giststate = initGISTstate(index);
204 
205  /*
206  * Create a temporary memory context that is reset once for each tuple
207  * processed. (Note: we don't bother to make this a child of the
208  * giststate's scanCxt, so we have to delete it separately at the end.)
209  */
210  buildstate.giststate->tempCxt = createTempGistContext();
211 
212  /*
213  * Choose build strategy. First check whether the user specified to use
214  * buffering mode. (The use-case for that in the field is somewhat
215  * questionable perhaps, but it's important for testing purposes.)
216  */
217  if (options)
218  {
219  if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
220  buildstate.buildMode = GIST_BUFFERING_STATS;
221  else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
222  buildstate.buildMode = GIST_BUFFERING_DISABLED;
223  else /* must be "auto" */
224  buildstate.buildMode = GIST_BUFFERING_AUTO;
225  }
226  else
227  {
228  buildstate.buildMode = GIST_BUFFERING_AUTO;
229  }
230 
231  /*
232  * Unless buffering mode was forced, see if we can use sorting instead.
233  */
234  if (buildstate.buildMode != GIST_BUFFERING_STATS)
235  {
236  bool hasallsortsupports = true;
/* NOTE(review): the keyscount declaration line (237) is missing — presumably
 * IndexRelationGetNumberOfKeyAttributes(index). */
238 
239  for (int i = 0; i < keyscount; i++)
240  {
/* Probe each key column's opclass for a sortsupport proc; sorted build is
 * only possible if every key column has one. Line 242 (the
 * GIST_SORTSUPPORT_PROC argument) was dropped by extraction. */
241  SortSupportFnOids[i] = index_getprocid(index, i + 1,
243  if (!OidIsValid(SortSupportFnOids[i]))
244  {
245  hasallsortsupports = false;
246  break;
247  }
248  }
249  if (hasallsortsupports)
250  buildstate.buildMode = GIST_SORTED_BUILD;
251  }
252 
253  /*
254  * Calculate target amount of free space to leave on pages.
255  */
/* NOTE(review): line 256 missing — presumably fillfactor =
 * options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR. */
257  buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
258 
259  /*
260  * Build the index using the chosen strategy.
261  */
262  buildstate.indtuples = 0;
263  buildstate.indtuplesSize = 0;
264 
265  if (buildstate.buildMode == GIST_SORTED_BUILD)
266  {
267  /*
268  * Sort all data, build the index from bottom up.
269  */
/* NOTE(review): argument lines 272 and 274 dropped — presumably
 * maintenance_work_mem, plus coordinate/sortopt arguments. */
270  buildstate.sortstate = tuplesort_begin_index_gist(heap,
271  index,
273  NULL,
275 
276  /* Scan the table, adding all tuples to the tuplesort */
277  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
279  (void *) &buildstate, NULL);
280 
281  /*
282  * Perform the sort and build index pages.
283  */
284  tuplesort_performsort(buildstate.sortstate);
285 
286  gist_indexsortbuild(&buildstate);
287 
288  tuplesort_end(buildstate.sortstate);
289  }
290  else
291  {
292  /*
293  * Initialize an empty index and insert all tuples, possibly using
294  * buffers on intermediate levels.
295  */
296  Buffer buffer;
297  Page page;
298 
299  /* initialize the root page */
300  buffer = gistNewBuffer(index);
/* NOTE(review): line 301 missing — presumably an Assert that the new buffer
 * is GIST_ROOT_BLKNO. */
302  page = BufferGetPage(buffer);
303 
305 
306  GISTInitBuffer(buffer, F_LEAF);
307 
308  MarkBufferDirty(buffer);
309  PageSetLSN(page, GistBuildLSN);
310 
311  UnlockReleaseBuffer(buffer);
312 
314 
315  /* Scan the table, inserting all the tuples to the index. */
316  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
318  (void *) &buildstate, NULL);
319 
320  /*
321  * If buffering was used, flush out all the tuples that are still in
322  * the buffers.
323  */
324  if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
325  {
326  elog(DEBUG1, "all tuples processed, emptying buffers");
327  gistEmptyAllBuffers(&buildstate);
328  gistFreeBuildBuffers(buildstate.gfbb);
329  }
330 
331  /*
332  * We didn't write WAL records as we built the index, so if
333  * WAL-logging is required, write all pages to the WAL now.
334  */
335  if (RelationNeedsWAL(index))
336  {
/* NOTE(review): the log_newpage_range() call opener (lines 337-338) was
 * dropped; only its trailing "true);" argument survives below. */
339  true);
340  }
341  }
342 
343  /* okay, all heap tuples are indexed */
344  MemoryContextSwitchTo(oldcxt);
/* NOTE(review): line 345 missing — presumably
 * MemoryContextDelete(buildstate.giststate->tempCxt); */
346 
347  freeGISTstate(buildstate.giststate);
348 
349  /*
350  * Return statistics
351  */
352  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
353 
354  result->heap_tuples = reltuples;
355  result->index_tuples = (double) buildstate.indtuples;
356 
357  return result;
358 }
359 
360 /*-------------------------------------------------------------------------
361  * Routines for sorted build
362  *-------------------------------------------------------------------------
363  */
364 
365 /*
366  * Per-tuple callback for table_index_build_scan.
367  */
368 static void
370  ItemPointer tid,
371  Datum *values,
372  bool *isnull,
373  bool tupleIsAlive,
374  void *state)
375 {
376  GISTBuildState *buildstate = (GISTBuildState *) state;
377  MemoryContext oldCtx;
378  Datum compressed_values[INDEX_MAX_KEYS];
379 
380  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
381 
382  /* Form an index tuple and point it at the heap tuple */
383  gistCompressValues(buildstate->giststate, index,
384  values, isnull,
385  true, compressed_values);
386 
388  buildstate->indexrel,
389  tid,
390  compressed_values, isnull);
391 
392  MemoryContextSwitchTo(oldCtx);
393  MemoryContextReset(buildstate->giststate->tempCxt);
394 
395  /* Update tuple count. */
396  buildstate->indtuples += 1;
397 }
398 
399 /*
400  * Build GiST index from bottom up from pre-sorted tuples.
401  */
402 static void
/* NOTE(review): the name line (403) was dropped by extraction — presumably
 * "gist_indexsortbuild(GISTBuildState *state)". This routine drains the
 * tuplesort and writes the index bottom-up, level by level. */
404 {
405  IndexTuple itup;
406  GistSortedBuildLevelState *levelstate;
407  Page page;
408 
409  state->pages_allocated = 0;
410  state->pages_written = 0;
411  state->ready_num_pages = 0;
412 
413  /*
414  * Write an empty page as a placeholder for the root page. It will be
415  * replaced with the real root page at the end.
416  */
417  page = palloc0(BLCKSZ);
/* NOTE(review): line 418 missing — presumably the smgrextend() call opener
 * extending the relation with this placeholder at GIST_ROOT_BLKNO; only its
 * trailing arguments survive below. */
419  page, true);
420  state->pages_allocated++;
421  state->pages_written++;
422 
423  /* Allocate a temporary buffer for the first leaf page batch. */
424  levelstate = palloc0(sizeof(GistSortedBuildLevelState));
425  levelstate->pages[0] = page;
426  levelstate->parent = NULL;
427  gistinitpage(page, F_LEAF);
428 
429  /*
430  * Fill index pages with tuples in the sorted order.
431  */
432  while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
433  {
434  gist_indexsortbuild_levelstate_add(state, levelstate, itup);
435  MemoryContextReset(state->giststate->tempCxt);
436  }
437 
438  /*
439  * Write out the partially full non-root pages.
440  *
441  * Keep in mind that flush can build a new root. If number of pages is > 1
442  * then new root is required.
443  */
444  while (levelstate->parent != NULL || levelstate->current_page != 0)
445  {
/* NOTE(review): lines 446 and 448 missing — presumably the declaration of a
 * GistSortedBuildLevelState *parent and the levelstate_flush() call that
 * pushes this level's remaining tuples up into the parent. */
447 
449  parent = levelstate->parent;
450  for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
451  if (levelstate->pages[i])
452  pfree(levelstate->pages[i]);
453  pfree(levelstate);
454  levelstate = parent;
455  }
456 
/* NOTE(review): line 457 missing — presumably the final
 * gist_indexsortbuild_flush_ready_pages(state) call. */
458 
459  /* Write out the root */
460  PageSetLSN(levelstate->pages[0], GistBuildLSN);
/* NOTE(review): lines 461-462 missing — presumably PageSetChecksumInplace and
 * the smgrwrite() opener that overwrites the placeholder at GIST_ROOT_BLKNO. */
463  levelstate->pages[0], true);
464  if (RelationNeedsWAL(state->indexrel))
465  log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
466  levelstate->pages[0], true);
467 
468  pfree(levelstate->pages[0]);
469  pfree(levelstate);
470 
471  /*
472  * When we WAL-logged index pages, we must nonetheless fsync index files.
473  * Since we're building outside shared buffers, a CHECKPOINT occurring
474  * during the build has no way to flush the previously written data to
475  * disk (indeed it won't know the index even exists). A crash later on
476  * would replay WAL from the checkpoint, therefore it wouldn't replay our
477  * earlier WAL entries. If we do not fsync those pages here, they might
478  * still not be on disk when the crash occurs.
479  */
480  if (RelationNeedsWAL(state->indexrel))
/* NOTE(review): line 481 missing — presumably
 * smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); */
482 }
483 
484 /*
485  * Add tuple to a page. If the pages are full, write them out and re-initialize
486  * a new page first.
487  */
488 static void
490  GistSortedBuildLevelState *levelstate,
491  IndexTuple itup)
492 {
493  Size sizeNeeded;
494 
495  /* Check if tuple can be added to the current page */
496  sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
497  if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
498  {
499  Page newPage;
500  Page old_page = levelstate->pages[levelstate->current_page];
501  uint16 old_page_flags = GistPageGetOpaque(old_page)->flags;
502 
503  if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
504  {
506  }
507  else
508  levelstate->current_page++;
509 
510  if (levelstate->pages[levelstate->current_page] == NULL)
511  levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
512 
513  newPage = levelstate->pages[levelstate->current_page];
514  gistinitpage(newPage, old_page_flags);
515  }
516 
517  gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
518 }
519 
520 static void
/* NOTE(review): the name line (521) was dropped by extraction — presumably
 * "gist_indexsortbuild_levelstate_flush(GISTBuildState *state," continuing
 * into the levelstate parameter below. This routine repartitions the
 * buffered batch of pages via picksplit, emits finished pages, and pushes
 * one downlink per emitted page into the parent level. */
522  GistSortedBuildLevelState *levelstate)
523 {
/* NOTE(review): line 524 missing — presumably the declaration of the
 * GistSortedBuildLevelState *parent assigned near the end of the loop. */
525  BlockNumber blkno;
526  MemoryContext oldCtx;
527  IndexTuple union_tuple;
528  SplitedPageLayout *dist;
529  IndexTuple *itvec;
530  int vect_len;
531  bool isleaf = GistPageIsLeaf(levelstate->pages[0]);
532 
/* NOTE(review): line 533 missing — presumably CHECK_FOR_INTERRUPTS(); */
534 
535  oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
536 
537  /* Get index tuples from first page */
538  itvec = gistextractpage(levelstate->pages[0], &vect_len);
539  if (levelstate->current_page > 0)
540  {
541  /* Append tuples from each page */
542  for (int i = 1; i < levelstate->current_page + 1; i++)
543  {
544  int len_local;
545  IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
546 
547  itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
548  pfree(itvec_local);
549  }
550 
551  /* Apply picksplit to list of all collected tuples */
552  dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
553  }
554  else
555  {
556  /* Create splitted layout from single page */
557  dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
558  union_tuple = gistunion(state->indexrel, itvec, vect_len,
559  state->giststate);
560  dist->itup = union_tuple;
561  dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
562  dist->block.num = vect_len;
563  }
564 
565  MemoryContextSwitchTo(oldCtx);
566 
567  /* Reset page counter */
568  levelstate->current_page = 0;
569 
570  /* Create pages for all partitions in split result */
571  for (; dist != NULL; dist = dist->next)
572  {
573  char *data;
574  Page target;
575 
576  /* check once per page */
/* NOTE(review): line 577 missing — presumably CHECK_FOR_INTERRUPTS(); */
578 
579  /* Create page and copy data */
580  data = (char *) (dist->list);
581  target = palloc0(BLCKSZ);
582  gistinitpage(target, isleaf ? F_LEAF : 0);
583  for (int i = 0; i < dist->block.num; i++)
584  {
585  IndexTuple thistup = (IndexTuple) data;
586 
587  if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
588  elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
589 
590  data += IndexTupleSize(thistup);
591  }
592  union_tuple = dist->itup;
593 
594  if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
/* NOTE(review): line 595 missing — presumably the
 * gist_indexsortbuild_flush_ready_pages(state) call that drains the batch
 * when it reaches the WAL per-record block limit. */
596 
597  /*
598  * The page is now complete. Assign a block number to it, and add it
599  * to the list of finished pages. (We don't write it out immediately,
600  * because we want to WAL-log the pages in batches.)
601  */
602  blkno = state->pages_allocated++;
603  state->ready_blknos[state->ready_num_pages] = blkno;
604  state->ready_pages[state->ready_num_pages] = target;
605  state->ready_num_pages++;
606  ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
607 
608  /*
609  * Set the right link to point to the previous page. This is just for
610  * debugging purposes: GiST only follows the right link if a page is
611  * split concurrently to a scan, and that cannot happen during index
612  * build.
613  *
614  * It's a bit counterintuitive that we set the right link on the new
615  * page to point to the previous page, not the other way around. But
616  * GiST pages are not ordered like B-tree pages are, so as long as the
617  * right-links form a chain through all the pages at the same level,
618  * the order doesn't matter.
619  */
620  if (levelstate->last_blkno)
621  GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
622  levelstate->last_blkno = blkno;
623 
624  /*
625  * Insert the downlink to the parent page. If this was the root,
626  * create a new page as the parent, which becomes the new root.
627  */
628  parent = levelstate->parent;
629  if (parent == NULL)
630  {
/* NOTE(review): line 631 missing — presumably the palloc0 allocating the new
 * GistSortedBuildLevelState for the parent level. */
632  parent->pages[0] = (Page) palloc(BLCKSZ);
633  parent->parent = NULL;
634  gistinitpage(parent->pages[0], 0);
635 
636  levelstate->parent = parent;
637  }
/* NOTE(review): line 638 missing — presumably the recursive
 * gist_indexsortbuild_levelstate_add(state, parent, union_tuple) call that
 * actually inserts the downlink. */
639  }
640 }
641 
642 static void
644 {
645  if (state->ready_num_pages == 0)
646  return;
647 
648  for (int i = 0; i < state->ready_num_pages; i++)
649  {
650  Page page = state->ready_pages[i];
651  BlockNumber blkno = state->ready_blknos[i];
652 
653  /* Currently, the blocks must be buffered in order. */
654  if (blkno != state->pages_written)
655  elog(ERROR, "unexpected block number to flush GiST sorting build");
656 
657  PageSetLSN(page, GistBuildLSN);
658  PageSetChecksumInplace(page, blkno);
659  smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
660  true);
661 
662  state->pages_written++;
663  }
664 
665  if (RelationNeedsWAL(state->indexrel))
666  log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages,
667  state->ready_blknos, state->ready_pages, true);
668 
669  for (int i = 0; i < state->ready_num_pages; i++)
670  pfree(state->ready_pages[i]);
671 
672  state->ready_num_pages = 0;
673 }
674 
675 
676 /*-------------------------------------------------------------------------
677  * Routines for non-sorted build
678  *-------------------------------------------------------------------------
679  */
680 
681 /*
682  * Attempt to switch to buffering mode.
683  *
684  * If there is not enough memory for buffering build, sets bufferingMode
685  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
686  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
687  * GIST_BUFFERING_ACTIVE.
688  */
689 static void
/* NOTE(review): the name line (690) was dropped by extraction — presumably
 * "gistInitBuffering(GISTBuildState *buildstate)", matching the prototype
 * earlier in the file. */
691 {
692  Relation index = buildstate->indexrel;
693  int pagesPerBuffer;
694  Size pageFreeSpace;
695  Size itupAvgSize,
696  itupMinSize;
697  double avgIndexTuplesPerPage,
698  maxIndexTuplesPerPage;
699  int i;
700  int levelStep;
701 
702  /* Calc space of index page which is available for index tuples */
703  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
704  - sizeof(ItemIdData)
705  - buildstate->freespace;
706 
707  /*
708  * Calculate average size of already inserted index tuples using gathered
709  * statistics.
710  */
711  itupAvgSize = (double) buildstate->indtuplesSize /
712  (double) buildstate->indtuples;
713 
714  /*
715  * Calculate minimal possible size of index tuple by index metadata.
716  * Minimal possible size of varlena is VARHDRSZ.
717  *
718  * XXX: that's not actually true, as a short varlen can be just 2 bytes.
719  * And we should take padding into account here.
720  */
721  itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
722  for (i = 0; i < index->rd_att->natts; i++)
723  {
724  if (TupleDescAttr(index->rd_att, i)->attlen < 0)
725  itupMinSize += VARHDRSZ;
726  else
727  itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
728  }
729 
730  /* Calculate average and maximal number of index tuples which fit to page */
731  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
732  maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
733 
734  /*
735  * We need to calculate two parameters for the buffering algorithm:
736  * levelStep and pagesPerBuffer.
737  *
738  * levelStep determines the size of subtree that we operate on, while
739  * emptying a buffer. A higher value is better, as you need fewer buffer
740  * emptying steps to build the index. However, if you set it too high, the
741  * subtree doesn't fit in cache anymore, and you quickly lose the benefit
742  * of the buffers.
743  *
744  * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
745  * the number of tuples on page (ie. fanout), and M is the amount of
746  * internal memory available. Curiously, they doesn't explain *why* that
747  * setting is optimal. We calculate it by taking the highest levelStep so
748  * that a subtree still fits in cache. For a small B, our way of
749  * calculating levelStep is very close to Arge et al's formula. For a
750  * large B, our formula gives a value that is 2x higher.
751  *
752  * The average size (in pages) of a subtree of depth n can be calculated
753  * as a geometric series:
754  *
755  * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
756  *
757  * where B is the average number of index tuples on page. The subtree is
758  * cached in the shared buffer cache and the OS cache, so we choose
759  * levelStep so that the subtree size is comfortably smaller than
760  * effective_cache_size, with a safety factor of 4.
761  *
762  * The estimate on the average number of index tuples on page is based on
763  * average tuple sizes observed before switching to buffered build, so the
764  * real subtree size can be somewhat larger. Also, it would selfish to
765  * gobble the whole cache for our index build. The safety factor of 4
766  * should account for those effects.
767  *
768  * The other limiting factor for setting levelStep is that while
769  * processing a subtree, we need to hold one page for each buffer at the
770  * next lower buffered level. The max. number of buffers needed for that
771  * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
772  * hopefully maintenance_work_mem is set high enough that you're
773  * constrained by effective_cache_size rather than maintenance_work_mem.
774  *
775  * XXX: the buffer hash table consumes a fair amount of memory too per
776  * buffer, but that is not currently taken into account. That scales on
777  * the total number of buffers used, ie. the index size and on levelStep.
778  * Note that a higher levelStep *reduces* the amount of memory needed for
779  * the hash table.
780  */
781  levelStep = 1;
782  for (;;)
783  {
784  double subtreesize;
785  double maxlowestlevelpages;
786 
787  /* size of an average subtree at this levelStep (in pages). */
788  subtreesize =
789  (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
790  (1 - avgIndexTuplesPerPage);
791 
792  /* max number of pages at the lowest level of a subtree */
793  maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
794 
795  /* subtree must fit in cache (with safety factor of 4) */
796  if (subtreesize > effective_cache_size / 4)
797  break;
798 
799  /* each node in the lowest level of a subtree has one page in memory */
800  if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
801  break;
802 
803  /* Good, we can handle this levelStep. See if we can go one higher. */
804  levelStep++;
805  }
806 
807  /*
808  * We just reached an unacceptable value of levelStep in previous loop.
809  * So, decrease levelStep to get last acceptable value.
810  */
811  levelStep--;
812 
813  /*
814  * If there's not enough cache or maintenance_work_mem, fall back to plain
815  * inserts.
816  */
817  if (levelStep <= 0)
818  {
819  elog(DEBUG1, "failed to switch to buffered GiST build");
820  buildstate->buildMode = GIST_BUFFERING_DISABLED;
821  return;
822  }
823 
824  /*
825  * The second parameter to set is pagesPerBuffer, which determines the
826  * size of each buffer. We adjust pagesPerBuffer also during the build,
827  * which is why this calculation is in a separate function.
828  */
829  pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
830 
831  /* Initialize GISTBuildBuffers with these parameters */
832  buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
/* NOTE(review): the final argument line (833) was dropped by extraction —
 * presumably gistGetMaxLevel(index)); closing this call. */
834 
835  gistInitParentMap(buildstate);
836 
837  buildstate->buildMode = GIST_BUFFERING_ACTIVE;
838 
839  elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
840  levelStep, pagesPerBuffer);
841 }
842 
843 /*
844  * Calculate pagesPerBuffer parameter for the buffering algorithm.
845  *
846  * Buffer size is chosen so that assuming that tuples are distributed
847  * randomly, emptying half a buffer fills on average one page in every buffer
848  * at the next lower level.
849  */
850 static int
851 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
852 {
853  double pagesPerBuffer;
854  double avgIndexTuplesPerPage;
855  double itupAvgSize;
856  Size pageFreeSpace;
857 
858  /* Calc space of index page which is available for index tuples */
859  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
860  - sizeof(ItemIdData)
861  - buildstate->freespace;
862 
863  /*
864  * Calculate average size of already inserted index tuples using gathered
865  * statistics.
866  */
867  itupAvgSize = (double) buildstate->indtuplesSize /
868  (double) buildstate->indtuples;
869 
870  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
871 
872  /*
873  * Recalculate required size of buffers.
874  */
875  pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
876 
877  return (int) rint(pagesPerBuffer);
878 }
879 
880 /*
881  * Per-tuple callback for table_index_build_scan.
882  */
883 static void
/* NOTE(review): the name line (884) was dropped by extraction — presumably
 * "gistBuildCallback(Relation index," matching the prototype earlier in the
 * file. Per-tuple callback for the non-sorted (insert-based) build. */
885  ItemPointer tid,
886  Datum *values,
887  bool *isnull,
888  bool tupleIsAlive,
889  void *state)
890 {
891  GISTBuildState *buildstate = (GISTBuildState *) state;
892  IndexTuple itup;
893  MemoryContext oldCtx;
894 
895  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
896 
897  /* form an index tuple and point it at the heap tuple */
898  itup = gistFormTuple(buildstate->giststate, index,
899  values, isnull,
900  true);
901  itup->t_tid = *tid;
902 
903  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
904  {
905  /* We have buffers, so use them. */
906  gistBufferingBuildInsert(buildstate, itup);
907  }
908  else
909  {
910  /*
911  * There's no buffers (yet). Since we already have the index relation
912  * locked, we call gistdoinsert directly.
913  */
914  gistdoinsert(index, itup, buildstate->freespace,
915  buildstate->giststate, buildstate->heaprel, true);
916  }
917 
918  /* Update tuple count and total size. */
919  buildstate->indtuples += 1;
920  buildstate->indtuplesSize += IndexTupleSize(itup);
921 
922  MemoryContextSwitchTo(oldCtx);
923  MemoryContextReset(buildstate->giststate->tempCxt);
924 
925  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
/* NOTE(review): the second conjunct (line 926) was dropped by extraction —
 * presumably a modulus test against BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET,
 * i.e. readjust the buffer size periodically while buffering. */
927  {
928  /* Adjust the target buffer size now */
929  buildstate->gfbb->pagesPerBuffer =
930  calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
931  }
932 
933  /*
934  * In 'auto' mode, check if the index has grown too large to fit in cache,
935  * and switch to buffering mode if it has.
936  *
937  * To avoid excessive calls to smgrnblocks(), only check this every
938  * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
939  *
940  * In 'stats' state, switch as soon as we have seen enough tuples to have
941  * some idea of the average tuple size.
942  */
943  if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
944  buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
/* NOTE(review): lines 945 and 948 were dropped by extraction — presumably the
 * effective_cache_size comparison against smgrnblocks(...) and the
 * BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET threshold for the 'stats' branch. */
946  MAIN_FORKNUM)) ||
947  (buildstate->buildMode == GIST_BUFFERING_STATS &&
949  {
950  /*
951  * Index doesn't fit in effective cache anymore. Try to switch to
952  * buffering build mode.
953  */
954  gistInitBuffering(buildstate);
955  }
956 }
957 
958 /*
959  * Insert function for buffering index build.
960  */
961 static void
963 {
964  /* Insert the tuple to buffers. */
965  gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
966 
967  /* If we filled up (half of a) buffer, process buffer emptying. */
968  gistProcessEmptyingQueue(buildstate);
969 }
970 
971 /*
972  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
973  * page or node buffer, and inserts the tuple there. Returns true if we have
974  * to stop buffer emptying process (because one of child buffers can't take
975  * index tuples anymore).
976  */
977 static bool
/* NOTE(review): the first signature line (978) was dropped by extraction —
 * per the prototype earlier in the file it is
 * "gistProcessItup(GISTBuildState *buildstate, IndexTuple itup," continuing
 * into the parameters below. */
979  BlockNumber startblkno, int startlevel)
980 {
981  GISTSTATE *giststate = buildstate->giststate;
982  GISTBuildBuffers *gfbb = buildstate->gfbb;
983  Relation indexrel = buildstate->indexrel;
984  BlockNumber childblkno;
985  Buffer buffer;
986  bool result = false;
987  BlockNumber blkno;
988  int level;
989  OffsetNumber downlinkoffnum = InvalidOffsetNumber;
990  BlockNumber parentblkno = InvalidBlockNumber;
991 
/* NOTE(review): line 992 missing — presumably CHECK_FOR_INTERRUPTS(); */
993 
994  /*
995  * Loop until we reach a leaf page (level == 0) or a level with buffers
996  * (not including the level we start at, because we would otherwise make
997  * no progress).
998  */
999  blkno = startblkno;
1000  level = startlevel;
1001  for (;;)
1002  {
1003  ItemId iid;
1004  IndexTuple idxtuple,
1005  newtup;
1006  Page page;
1007  OffsetNumber childoffnum;
1008 
1009  /* Have we reached a level with buffers? */
1010  if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
1011  break;
1012 
1013  /* Have we reached a leaf page? */
1014  if (level == 0)
1015  break;
1016 
1017  /*
1018  * Nope. Descend down to the next level then. Choose a child to
1019  * descend down to.
1020  */
1021 
1022  buffer = ReadBuffer(indexrel, blkno);
1023  LockBuffer(buffer, GIST_EXCLUSIVE);
1024 
1025  page = (Page) BufferGetPage(buffer);
1026  childoffnum = gistchoose(indexrel, page, itup, giststate);
1027  iid = PageGetItemId(page, childoffnum);
1028  idxtuple = (IndexTuple) PageGetItem(page, iid);
1029  childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1030 
1031  if (level > 1)
1032  gistMemorizeParent(buildstate, childblkno, blkno);
1033 
1034  /*
1035  * Check that the key representing the target child node is consistent
1036  * with the key we're inserting. Update it if it's not.
1037  */
1038  newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
1039  if (newtup)
1040  {
1041  blkno = gistbufferinginserttuples(buildstate,
1042  buffer,
1043  level,
1044  &newtup,
1045  1,
1046  childoffnum,
/* NOTE(review): the last two argument lines (1047-1048) were dropped by
 * extraction — presumably InvalidBlockNumber and InvalidOffsetNumber,
 * closing this call. */
1049  /* gistbufferinginserttuples() released the buffer */
1050  }
1051  else
1052  UnlockReleaseBuffer(buffer);
1053 
1054  /* Descend to the child */
1055  parentblkno = blkno;
1056  blkno = childblkno;
1057  downlinkoffnum = childoffnum;
1058  Assert(level > 0);
1059  level--;
1060  }
1061 
1062  if (LEVEL_HAS_BUFFERS(level, gfbb))
1063  {
1064  /*
1065  * We've reached level with buffers. Place the index tuple to the
1066  * buffer, and add the buffer to the emptying queue if it overflows.
1067  */
1068  GISTNodeBuffer *childNodeBuffer;
1069 
1070  /* Find the buffer or create a new one */
1071  childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
1072 
1073  /* Add index tuple to it */
1074  gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
1075 
1076  if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
1077  result = true;
1078  }
1079  else
1080  {
1081  /*
1082  * We've reached a leaf page. Place the tuple here.
1083  */
1084  Assert(level == 0);
1085  buffer = ReadBuffer(indexrel, blkno);
1086  LockBuffer(buffer, GIST_EXCLUSIVE);
1087  gistbufferinginserttuples(buildstate, buffer, level,
1088  &itup, 1, InvalidOffsetNumber,
1089  parentblkno, downlinkoffnum);
1090  /* gistbufferinginserttuples() released the buffer */
1091  }
1092 
1093  return result;
1094 }
1095 
1096 /*
1097  * Insert tuples to a given page.
1098  *
1099  * This is analogous with gistinserttuples() in the regular insertion code.
1100  *
1101  * Returns the block number of the page where the (first) new or updated tuple
1102  * was inserted. Usually that's the original page, but might be a sibling page
1103  * if the original page was split.
1104  *
1105  * Caller should hold a lock on 'buffer' on entry. This function will unlock
1106  * and unpin it.
1107  */
1108 static BlockNumber
1109 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
1110  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
1111  BlockNumber parentblk, OffsetNumber downlinkoffnum)
1112 {
1113  GISTBuildBuffers *gfbb = buildstate->gfbb;
1114  List *splitinfo;
1115  bool is_split;
1116  BlockNumber placed_to_blk = InvalidBlockNumber;
1117 
1118  is_split = gistplacetopage(buildstate->indexrel,
1119  buildstate->freespace,
1120  buildstate->giststate,
1121  buffer,
1122  itup, ntup, oldoffnum, &placed_to_blk,
1123  InvalidBuffer,
1124  &splitinfo,
1125  false,
1126  buildstate->heaprel, true);
1127 
1128  /*
1129  * If this is a root split, update the root path item kept in memory. This
1130  * ensures that all path stacks are always complete, including all parent
1131  * nodes up to the root. That simplifies the algorithm to re-find correct
1132  * parent.
1133  */
1134  if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
1135  {
1136  Page page = BufferGetPage(buffer);
1137  OffsetNumber off;
1138  OffsetNumber maxoff;
1139 
1140  Assert(level == gfbb->rootlevel);
1141  gfbb->rootlevel++;
1142 
1143  elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
1144 
1145  /*
1146  * All the downlinks on the old root page are now on one of the child
1147  * pages. Visit all the new child pages to memorize the parents of the
1148  * grandchildren.
1149  */
1150  if (gfbb->rootlevel > 1)
1151  {
1152  maxoff = PageGetMaxOffsetNumber(page);
1153  for (off = FirstOffsetNumber; off <= maxoff; off++)
1154  {
1155  ItemId iid = PageGetItemId(page, off);
1156  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1157  BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1158  Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
1159 
1160  LockBuffer(childbuf, GIST_SHARE);
1161  gistMemorizeAllDownlinks(buildstate, childbuf);
1162  UnlockReleaseBuffer(childbuf);
1163 
1164  /*
1165  * Also remember that the parent of the new child page is the
1166  * root block.
1167  */
1168  gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
1169  }
1170  }
1171  }
1172 
1173  if (splitinfo)
1174  {
1175  /*
1176  * Insert the downlinks to the parent. This is analogous with
1177  * gistfinishsplit() in the regular insertion code, but the locking is
1178  * simpler, and we have to maintain the buffers on internal nodes and
1179  * the parent map.
1180  */
1181  IndexTuple *downlinks;
1182  int ndownlinks,
1183  i;
1184  Buffer parentBuffer;
1185  ListCell *lc;
1186 
1187  /* Parent may have changed since we memorized this path. */
1188  parentBuffer =
1189  gistBufferingFindCorrectParent(buildstate,
1190  BufferGetBlockNumber(buffer),
1191  level,
1192  &parentblk,
1193  &downlinkoffnum);
1194 
1195  /*
1196  * If there's a buffer associated with this page, that needs to be
1197  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
1198  * downlinks in 'splitinfo', to make sure they're consistent not only
1199  * with the tuples already on the pages, but also the tuples in the
1200  * buffers that will eventually be inserted to them.
1201  */
1203  buildstate->giststate,
1204  buildstate->indexrel,
1205  level,
1206  buffer, splitinfo);
1207 
1208  /* Create an array of all the downlink tuples */
1209  ndownlinks = list_length(splitinfo);
1210  downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
1211  i = 0;
1212  foreach(lc, splitinfo)
1213  {
1214  GISTPageSplitInfo *splitinfo = lfirst(lc);
1215 
1216  /*
1217  * Remember the parent of each new child page in our parent map.
1218  * This assumes that the downlinks fit on the parent page. If the
1219  * parent page is split, too, when we recurse up to insert the
1220  * downlinks, the recursive gistbufferinginserttuples() call will
1221  * update the map again.
1222  */
1223  if (level > 0)
1224  gistMemorizeParent(buildstate,
1225  BufferGetBlockNumber(splitinfo->buf),
1226  BufferGetBlockNumber(parentBuffer));
1227 
1228  /*
1229  * Also update the parent map for all the downlinks that got moved
1230  * to a different page. (actually this also loops through the
1231  * downlinks that stayed on the original page, but it does no
1232  * harm).
1233  */
1234  if (level > 1)
1235  gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
1236 
1237  /*
1238  * Since there's no concurrent access, we can release the lower
1239  * level buffers immediately. This includes the original page.
1240  */
1241  UnlockReleaseBuffer(splitinfo->buf);
1242  downlinks[i++] = splitinfo->downlink;
1243  }
1244 
1245  /* Insert them into parent. */
1246  gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
1247  downlinks, ndownlinks, downlinkoffnum,
1249 
1250  list_free_deep(splitinfo); /* we don't need this anymore */
1251  }
1252  else
1253  UnlockReleaseBuffer(buffer);
1254 
1255  return placed_to_blk;
1256 }
1257 
1258 /*
1259  * Find the downlink pointing to a child page.
1260  *
1261  * 'childblkno' indicates the child page to find the parent for. 'level' is
1262  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
1263  * point to a location where the downlink used to be - we will check that
1264  * location first, and save some cycles if it hasn't moved. The function
1265  * returns a buffer containing the downlink, exclusively-locked, and
1266  * *parentblkno and *downlinkoffnum are set to the real location of the
1267  * downlink.
1268  *
1269  * If the child page is a leaf (level == 0), the caller must supply a correct
1270  * parentblkno. Otherwise we use the parent map hash table to find the parent
1271  * block.
1272  *
1273  * This function serves the same purpose as gistFindCorrectParent() during
1274  * normal index inserts, but this is simpler because we don't need to deal
1275  * with concurrent inserts.
1276  */
1277 static Buffer
1279  BlockNumber childblkno, int level,
1280  BlockNumber *parentblkno,
1281  OffsetNumber *downlinkoffnum)
1282 {
1284  Buffer buffer;
1285  Page page;
1286  OffsetNumber maxoff;
1287  OffsetNumber off;
1288 
1289  if (level > 0)
1290  parent = gistGetParent(buildstate, childblkno);
1291  else
1292  {
1293  /*
1294  * For a leaf page, the caller must supply a correct parent block
1295  * number.
1296  */
1297  if (*parentblkno == InvalidBlockNumber)
1298  elog(ERROR, "no parent buffer provided of child %u", childblkno);
1299  parent = *parentblkno;
1300  }
1301 
1302  buffer = ReadBuffer(buildstate->indexrel, parent);
1303  page = BufferGetPage(buffer);
1304  LockBuffer(buffer, GIST_EXCLUSIVE);
1305  gistcheckpage(buildstate->indexrel, buffer);
1306  maxoff = PageGetMaxOffsetNumber(page);
1307 
1308  /* Check if it was not moved */
1309  if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
1310  *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
1311  {
1312  ItemId iid = PageGetItemId(page, *downlinkoffnum);
1313  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1314 
1315  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1316  {
1317  /* Still there */
1318  return buffer;
1319  }
1320  }
1321 
1322  /*
1323  * Downlink was not at the offset where it used to be. Scan the page to
1324  * find it. During normal gist insertions, it might've moved to another
1325  * page, to the right, but during a buffering build, we keep track of the
1326  * parent of each page in the lookup table so we should always know what
1327  * page it's on.
1328  */
1329  for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
1330  {
1331  ItemId iid = PageGetItemId(page, off);
1332  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1333 
1334  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1335  {
1336  /* yes!!, found it */
1337  *downlinkoffnum = off;
1338  return buffer;
1339  }
1340  }
1341 
1342  elog(ERROR, "failed to re-find parent for block %u", childblkno);
1343  return InvalidBuffer; /* keep compiler quiet */
1344 }
1345 
1346 /*
1347  * Process buffers emptying stack. Emptying of one buffer can cause emptying
1348  * of other buffers. This function iterates until this cascading emptying
1349  * process finished, e.g. until buffers emptying stack is empty.
1350  */
1351 static void
1353 {
1354  GISTBuildBuffers *gfbb = buildstate->gfbb;
1355 
1356  /* Iterate while we have elements in buffers emptying stack. */
1357  while (gfbb->bufferEmptyingQueue != NIL)
1358  {
1359  GISTNodeBuffer *emptyingNodeBuffer;
1360 
1361  /* Get node buffer from emptying stack. */
1362  emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
1364  emptyingNodeBuffer->queuedForEmptying = false;
1365 
1366  /*
1367  * We are going to load last pages of buffers where emptying will be
1368  * to. So let's unload any previously loaded buffers.
1369  */
1370  gistUnloadNodeBuffers(gfbb);
1371 
1372  /*
1373  * Pop tuples from the buffer and run them down to the buffers at
1374  * lower level, or leaf pages. We continue until one of the lower
1375  * level buffers fills up, or this buffer runs empty.
1376  *
1377  * In Arge et al's paper, the buffer emptying is stopped after
1378  * processing 1/2 node buffer worth of tuples, to avoid overfilling
1379  * any of the lower level buffers. However, it's more efficient to
1380  * keep going until one of the lower level buffers actually fills up,
1381  * so that's what we do. This doesn't need to be exact, if a buffer
1382  * overfills by a few tuples, there's no harm done.
1383  */
1384  while (true)
1385  {
1386  IndexTuple itup;
1387 
1388  /* Get next index tuple from the buffer */
1389  if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
1390  break;
1391 
1392  /*
1393  * Run it down to the underlying node buffer or leaf page.
1394  *
1395  * Note: it's possible that the buffer we're emptying splits as a
1396  * result of this call. If that happens, our emptyingNodeBuffer
1397  * points to the left half of the split. After split, it's very
1398  * likely that the new left buffer is no longer over the half-full
1399  * threshold, but we might as well keep flushing tuples from it
1400  * until we fill a lower-level buffer.
1401  */
1402  if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
1403  {
1404  /*
1405  * A lower level buffer filled up. Stop emptying this buffer,
1406  * to avoid overflowing the lower level buffer.
1407  */
1408  break;
1409  }
1410 
1411  /* Free all the memory allocated during index tuple processing */
1412  MemoryContextReset(buildstate->giststate->tempCxt);
1413  }
1414  }
1415 }
1416 
1417 /*
1418  * Empty all node buffers, from top to bottom. This is done at the end of
1419  * index build to flush all remaining tuples to the index.
1420  *
1421  * Note: This destroys the buffersOnLevels lists, so the buffers should not
1422  * be inserted to after this call.
1423  */
1424 static void
1426 {
1427  GISTBuildBuffers *gfbb = buildstate->gfbb;
1428  MemoryContext oldCtx;
1429  int i;
1430 
1431  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1432 
1433  /*
1434  * Iterate through the levels from top to bottom.
1435  */
1436  for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1437  {
1438  /*
1439  * Empty all buffers on this level. Note that new buffers can pop up
1440  * in the list during the processing, as a result of page splits, so a
1441  * simple walk through the list won't work. We remove buffers from the
1442  * list when we see them empty; a buffer can't become non-empty once
1443  * it's been fully emptied.
1444  */
1445  while (gfbb->buffersOnLevels[i] != NIL)
1446  {
1447  GISTNodeBuffer *nodeBuffer;
1448 
1449  nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1450 
1451  if (nodeBuffer->blocksCount != 0)
1452  {
1453  /*
1454  * Add this buffer to the emptying queue, and proceed to empty
1455  * the queue.
1456  */
1457  if (!nodeBuffer->queuedForEmptying)
1458  {
1460  nodeBuffer->queuedForEmptying = true;
1461  gfbb->bufferEmptyingQueue =
1462  lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1463  MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1464  }
1465  gistProcessEmptyingQueue(buildstate);
1466  }
1467  else
1468  gfbb->buffersOnLevels[i] =
1470  }
1471  elog(DEBUG2, "emptied all buffers at level %d", i);
1472  }
1473  MemoryContextSwitchTo(oldCtx);
1474 }
1475 
1476 /*
1477  * Get the depth of the GiST index.
1478  */
1479 static int
1481 {
1482  int maxLevel;
1483  BlockNumber blkno;
1484 
1485  /*
1486  * Traverse down the tree, starting from the root, until we hit the leaf
1487  * level.
1488  */
1489  maxLevel = 0;
1490  blkno = GIST_ROOT_BLKNO;
1491  while (true)
1492  {
1493  Buffer buffer;
1494  Page page;
1495  IndexTuple itup;
1496 
1497  buffer = ReadBuffer(index, blkno);
1498 
1499  /*
1500  * There's no concurrent access during index build, so locking is just
1501  * pro forma.
1502  */
1503  LockBuffer(buffer, GIST_SHARE);
1504  page = (Page) BufferGetPage(buffer);
1505 
1506  if (GistPageIsLeaf(page))
1507  {
1508  /* We hit the bottom, so we're done. */
1509  UnlockReleaseBuffer(buffer);
1510  break;
1511  }
1512 
1513  /*
1514  * Pick the first downlink on the page, and follow it. It doesn't
1515  * matter which downlink we choose, the tree has the same depth
1516  * everywhere, so we just pick the first one.
1517  */
1518  itup = (IndexTuple) PageGetItem(page,
1520  blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1521  UnlockReleaseBuffer(buffer);
1522 
1523  /*
1524  * We're going down on the tree. It means that there is yet one more
1525  * level in the tree.
1526  */
1527  maxLevel++;
1528  }
1529  return maxLevel;
1530 }
1531 
1532 
1533 /*
1534  * Routines for managing the parent map.
1535  *
1536  * Whenever a page is split, we need to insert the downlinks into the parent.
1537  * We need to somehow find the parent page to do that. In normal insertions,
1538  * we keep a stack of nodes visited when we descend the tree. However, in
1539  * buffering build, we can start descending the tree from any internal node,
1540  * when we empty a buffer by cascading tuples to its children. So we don't
1541  * have a full stack up to the root available at that time.
1542  *
1543  * So instead, we maintain a hash table to track the parent of every internal
1544  * page. We don't need to track the parents of leaf nodes, however. Whenever
1545  * we insert to a leaf, we've just descended down from its parent, so we know
1546  * its immediate parent already. This helps a lot to limit the memory used
1547  * by this hash table.
1548  *
1549  * Whenever an internal node is split, the parent map needs to be updated.
1550  * the parent of the new child page needs to be recorded, and also the
1551  * entries for all page whose downlinks are moved to a new page at the split
1552  * needs to be updated.
1553  *
1554  * We also update the parent map whenever we descend the tree. That might seem
1555  * unnecessary, because we maintain the map whenever a downlink is moved or
1556  * created, but it is needed because we switch to buffering mode after
1557  * creating a tree with regular index inserts. Any pages created before
1558  * switching to buffering mode will not be present in the parent map initially,
1559  * but will be added there the first time we visit them.
1560  */
1561 
1562 typedef struct
1563 {
1564  BlockNumber childblkno; /* hash key */
1566 } ParentMapEntry;
1567 
1568 static void
1570 {
1571  HASHCTL hashCtl;
1572 
1573  hashCtl.keysize = sizeof(BlockNumber);
1574  hashCtl.entrysize = sizeof(ParentMapEntry);
1575  hashCtl.hcxt = CurrentMemoryContext;
1576  buildstate->parentMap = hash_create("gistbuild parent map",
1577  1024,
1578  &hashCtl,
1580 }
1581 
1582 static void
1584 {
1585  ParentMapEntry *entry;
1586  bool found;
1587 
1588  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1589  (const void *) &child,
1590  HASH_ENTER,
1591  &found);
1592  entry->parentblkno = parent;
1593 }
1594 
1595 /*
1596  * Scan all downlinks on a page, and memorize their parent.
1597  */
1598 static void
1600 {
1601  OffsetNumber maxoff;
1602  OffsetNumber off;
1603  BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1604  Page page = BufferGetPage(parentbuf);
1605 
1606  Assert(!GistPageIsLeaf(page));
1607 
1608  maxoff = PageGetMaxOffsetNumber(page);
1609  for (off = FirstOffsetNumber; off <= maxoff; off++)
1610  {
1611  ItemId iid = PageGetItemId(page, off);
1612  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1613  BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1614 
1615  gistMemorizeParent(buildstate, childblkno, parentblkno);
1616  }
1617 }
1618 
1619 static BlockNumber
1621 {
1622  ParentMapEntry *entry;
1623  bool found;
1624 
1625  /* Find node buffer in hash table */
1626  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1627  (const void *) &child,
1628  HASH_FIND,
1629  &found);
1630  if (!found)
1631  elog(ERROR, "could not find parent of block %u in lookup table", child);
1632 
1633  return entry->parentblkno;
1634 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static Datum values[MAXATTR]
Definition: bootstrap.c:156
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3938
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4156
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:702
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:216
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1539
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
Pointer Page
Definition: bufpage.h:78
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
#define SizeOfPageHeaderData
Definition: bufpage.h:215
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:415
unsigned short uint16
Definition: c.h:440
#define MAXALIGN(LEN)
Definition: c.h:757
#define VARHDRSZ
Definition: c.h:627
#define OidIsValid(objectId)
Definition: c.h:710
size_t Size
Definition: c.h:540
int effective_cache_size
Definition: costsize.c:129
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
#define DEBUG2
Definition: elog.h:23
#define DEBUG1
Definition: elog.h:24
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate, Relation heapRel, bool is_build)
Definition: gist.c:633
bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright, Relation heapRel, bool is_build)
Definition: gist.c:223
SplitedPageLayout * gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate)
Definition: gist.c:1417
GISTSTATE * initGISTstate(Relation index)
Definition: gist.c:1504
MemoryContext createTempGistContext(void)
Definition: gist.c:120
void freeGISTstate(GISTSTATE *giststate)
Definition: gist.c:1632
#define F_LEAF
Definition: gist.h:46
#define GIST_SORTSUPPORT_PROC
Definition: gist.h:40
struct GISTPageOpaqueData GISTPageOpaqueData
#define GistPageIsLeaf(page)
Definition: gist.h:167
#define GistPageGetOpaque(page)
Definition: gist.h:165
#define GistBuildLSN
Definition: gist.h:67
#define LEVEL_HAS_BUFFERS(nlevel, gfbb)
Definition: gist_private.h:319
@ GIST_OPTION_BUFFERING_OFF
Definition: gist_private.h:388
@ GIST_OPTION_BUFFERING_ON
Definition: gist_private.h:387
#define GIST_ROOT_BLKNO
Definition: gist_private.h:262
#define GIST_DEFAULT_FILLFACTOR
Definition: gist_private.h:479
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb)
Definition: gist_private.h:332
#define GIST_EXCLUSIVE
Definition: gist_private.h:43
#define GIST_SHARE
Definition: gist_private.h:42
GistBuildMode
Definition: gistbuild.c:68
@ GIST_BUFFERING_AUTO
Definition: gistbuild.c:72
@ GIST_SORTED_BUILD
Definition: gistbuild.c:69
@ GIST_BUFFERING_ACTIVE
Definition: gistbuild.c:78
@ GIST_BUFFERING_STATS
Definition: gistbuild.c:75
@ GIST_BUFFERING_DISABLED
Definition: gistbuild.c:70
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber parentblk, OffsetNumber downlinkoffnum)
Definition: gistbuild.c:1109
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent)
Definition: gistbuild.c:1599
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, BlockNumber startblkno, int startlevel)
Definition: gistbuild.c:978
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate)
Definition: gistbuild.c:521
#define BUFFERING_MODE_SWITCH_CHECK_STEP
Definition: gistbuild.c:52
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
Definition: gistbuild.c:851
static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, GistSortedBuildLevelState *levelstate, IndexTuple itup)
Definition: gistbuild.c:489
static void gist_indexsortbuild(GISTBuildState *state)
Definition: gistbuild.c:403
static void gistInitParentMap(GISTBuildState *buildstate)
Definition: gistbuild.c:1569
static void gistBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:884
static void gistSortedBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:369
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
Definition: gistbuild.c:1583
#define GIST_SORTED_BUILD_PAGE_NUM
Definition: gistbuild.c:116
static void gistEmptyAllBuffers(GISTBuildState *buildstate)
Definition: gistbuild.c:1425
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET
Definition: gistbuild.c:60
static void gistInitBuffering(GISTBuildState *buildstate)
Definition: gistbuild.c:690
static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
Definition: gistbuild.c:643
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, BlockNumber childblkno, int level, BlockNumber *parentblk, OffsetNumber *downlinkoffnum)
Definition: gistbuild.c:1278
static void gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
Definition: gistbuild.c:962
IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: gistbuild.c:182
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child)
Definition: gistbuild.c:1620
static int gistGetMaxLevel(Relation index)
Definition: gistbuild.c:1480
struct GistSortedBuildLevelState GistSortedBuildLevelState
static void gistProcessEmptyingQueue(GISTBuildState *buildstate)
Definition: gistbuild.c:1352
void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple itup)
void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, Relation r, int level, Buffer buffer, List *splitinfo)
GISTBuildBuffers * gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
void gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
GISTNodeBuffer * gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, BlockNumber nodeBlocknum, int level)
bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple *itup)
void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf)
Definition: gistutil.c:575
Buffer gistNewBuffer(Relation r)
Definition: gistutil.c:824
void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
Definition: gistutil.c:34
IndexTuple * gistextractpage(Page page, int *len)
Definition: gistutil.c:95
IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
Definition: gistutil.c:316
OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate)
Definition: gistutil.c:374
IndexTuple * gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
Definition: gistutil.c:114
void gistinitpage(Page page, uint32 f)
Definition: gistutil.c:757
IndexTuple gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
Definition: gistutil.c:219
void GISTInitBuffer(Buffer b, uint32 f)
Definition: gistutil.c:773
IndexTupleData * gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
Definition: gistutil.c:127
void gistCompressValues(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf, Datum *compatt)
Definition: gistutil.c:596
void gistcheckpage(Relation rel, Buffer buf)
Definition: gistutil.c:785
int maintenance_work_mem
Definition: globals.c:127
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:769
int i
Definition: isn.c:73
Pointer Item
Definition: item.h:17
struct ItemIdData ItemIdData
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define ItemPointerSetBlockNumber(pointer, blockNumber)
Definition: itemptr.h:138
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
Assert(fmt[strlen(fmt) - 1] !='\n')
List * list_delete_first(List *list)
Definition: list.c:902
List * lcons(void *datum, List *list)
Definition: list.c:474
void list_free_deep(List *list)
Definition: list.c:1519
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:143
void pfree(void *pointer)
Definition: mcxt.c:1175
void * palloc0(Size size)
Definition: mcxt.c:1099
MemoryContext CurrentMemoryContext
Definition: mcxt.c:42
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:218
void * palloc(Size size)
Definition: mcxt.c:1068
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
#define INDEX_MAX_KEYS
const void * data
#define lfirst(lc)
Definition: pg_list.h:169
static int list_length(const List *l)
Definition: pg_list.h:149
#define NIL
Definition: pg_list.h:65
#define linitial(l)
Definition: pg_list.h:174
int fillfactor
Definition: pgbench.c:200
uintptr_t Datum
Definition: postgres.h:411
unsigned int Oid
Definition: postgres_ext.h:31
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:555
#define RelationGetRelationName(relation)
Definition: rel.h:522
#define RelationNeedsWAL(relation)
Definition: rel.h:612
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:507
@ MAIN_FORKNUM
Definition: relpath.h:43
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:579
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:554
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:493
void smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:691
List * bufferEmptyingQueue
Definition: gist_private.h:357
List ** buffersOnLevels
Definition: gist_private.h:368
MemoryContext context
Definition: gist_private.h:341
GISTBuildBuffers * gfbb
Definition: gistbuild.c:100
BlockNumber pages_allocated
Definition: gistbuild.c:108
int ready_num_pages
Definition: gistbuild.c:111
Relation indexrel
Definition: gistbuild.c:84
GISTSTATE * giststate
Definition: gistbuild.c:86
HTAB * parentMap
Definition: gistbuild.c:101
int64 indtuples
Definition: gistbuild.c:92
Size freespace
Definition: gistbuild.c:88
Relation heaprel
Definition: gistbuild.c:85
int64 indtuplesSize
Definition: gistbuild.c:99
BlockNumber pages_written
Definition: gistbuild.c:109
Tuplesortstate * sortstate
Definition: gistbuild.c:106
GistBuildMode buildMode
Definition: gistbuild.c:90
BlockNumber nodeBlocknum
Definition: gist_private.h:300
bool queuedForEmptying
Definition: gist_private.h:307
IndexTuple downlink
Definition: gist_private.h:422
MemoryContext tempCxt
Definition: gist_private.h:78
Page pages[GIST_SORTED_BUILD_PAGE_NUM]
Definition: gistbuild.c:131
struct GistSortedBuildLevelState * parent
Definition: gistbuild.c:130
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
Definition: dynahash.c:220
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
ItemPointerData t_tid
Definition: itup.h:37
Definition: pg_list.h:51
BlockNumber parentblkno
Definition: gistbuild.c:1565
BlockNumber childblkno
Definition: gistbuild.c:1564
gistxlogPage block
Definition: gist_private.h:193
IndexTupleData * list
Definition: gist_private.h:194
struct SplitedPageLayout * next
Definition: gist_private.h:200
Definition: type.h:90
Definition: regguts.h:318
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1747
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Definition: tuplesort.c:2618
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:2196
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:1338
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:1620
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
Definition: tuplesort.c:1883
#define TUPLESORT_NONE
Definition: tuplesort.h:90
XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std)
Definition: xloginsert.c:1083
void log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, BlockNumber *blknos, Page *pages, bool page_std)
Definition: xloginsert.c:1115
void log_newpage_range(Relation rel, ForkNumber forkNum, BlockNumber startblk, BlockNumber endblk, bool page_std)
Definition: xloginsert.c:1210
#define XLR_MAX_BLOCK_ID
Definition: xlogrecord.h:228