/*
 * NOTE: this file is a documentation-browser extraction of
 * src/backend/access/gist/gistbuild.c (PostgreSQL, git master).
 * Rendering artifacts remain below: doxygen line numbers are fused
 * into each line, and some original lines were dropped.
 */
1 /*-------------------------------------------------------------------------
2  *
3  * gistbuild.c
4  * build algorithm for GiST indexes implementation.
5  *
6  * There are two different strategies:
7  *
8  * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
9  * order, and create downlinks and internal pages as we go. This builds
10  * the index from the bottom up, similar to how B-tree index build
11  * works.
12  *
13  * 2. Start with an empty index, and insert all tuples one by one.
14  *
15  * The sorted method is used if the operator classes for all columns have
16  * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
17  *
18  * The second strategy can optionally use buffers at different levels of
19  * the tree to reduce I/O, see "Buffering build algorithm" in the README
20  * for a more detailed explanation. It initially calls insert over and
21  * over, but switches to the buffered algorithm after a certain number of
22  * tuples (unless buffering mode is disabled).
23  *
24  *
25  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
26  * Portions Copyright (c) 1994, Regents of the University of California
27  *
28  * IDENTIFICATION
29  * src/backend/access/gist/gistbuild.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <math.h>
36 
37 #include "access/genam.h"
38 #include "access/gist_private.h"
39 #include "access/gistxlog.h"
40 #include "access/tableam.h"
41 #include "access/xloginsert.h"
42 #include "catalog/index.h"
43 #include "miscadmin.h"
44 #include "optimizer/optimizer.h"
45 #include "storage/bufmgr.h"
46 #include "storage/smgr.h"
47 #include "utils/memutils.h"
48 #include "utils/rel.h"
49 #include "utils/tuplesort.h"
50 
51 /* Step of index tuples for check whether to switch to buffering build mode */
52 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
53 
54 /*
55  * Number of tuples to process in the slow way before switching to buffering
56  * mode, when buffering is explicitly turned on. Also, the number of tuples
57  * to process between readjusting the buffer size parameter, while in
58  * buffering mode.
59  */
60 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
61 
62 /*
63  * Strategy used to build the index. It can change between the
64  * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
65  * that needs to be decided up-front and cannot be changed afterwards.
66  */
/*
 * Build strategy in effect. See the file header comment: the mode may
 * change between the GIST_BUFFERING_* values on the fly, but a sorted
 * build is decided up-front and never changes.
 *
 * NOTE(review): the closing "} <name>;" line was lost in extraction.
 * The typedef name below is reconstructed from the buildstate.buildMode
 * usage in gistbuild() — confirm against upstream before relying on it.
 */
typedef enum
{
	GIST_SORTED_BUILD,			/* bottom-up build by sorting */
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBuildMode;
81 /* Working state for gistbuild and its callback */
82 typedef struct
83 {
87 
88  Size freespace; /* amount of free space to leave on pages */
89 
91 
92  int64 indtuples; /* number of tuples indexed */
93 
94  /*
95  * Extra data structures used during a buffering build. 'gfbb' contains
96  * information related to managing the build buffers. 'parentMap' is a
97  * lookup table of the parent of each internal page.
98  */
99  int64 indtuplesSize; /* total size of all indexed tuples */
102 
103  /*
104  * Extra data structures used during a sorting build.
105  */
106  Tuplesortstate *sortstate; /* state data for tuplesort.c */
107 
110 
113  Page ready_pages[XLR_MAX_BLOCK_ID];
115 
116 #define GIST_SORTED_BUILD_PAGE_NUM 4
117 
118 /*
119  * In sorted build, we use a stack of these structs, one for each level,
120  * to hold an in-memory buffer of last pages at the level.
121  *
122  * Sorting GiST build requires good linearization of the sort opclass. This is
123  * not always the case in multidimensional data. To tackle the anomalies, we
124  * buffer index tuples and apply picksplit that can be multidimension-aware.
125  */
127 {
130  struct GistSortedBuildLevelState *parent; /* Upper level, if any */
133 
134 /* prototypes for private functions */
135 
/* NOTE(review): several prototype lines (original 136, 139-140, 143, 145,
 * 159, 163, 173) were dropped by the extractor; the surviving fragments
 * are kept byte-for-byte below. */
137  Datum *values, bool *isnull,
138  bool tupleIsAlive, void *state);
141  GistSortedBuildLevelState *levelstate,
142  IndexTuple itup);
144  GistSortedBuildLevelState *levelstate);
146 
147 static void gistInitBuffering(GISTBuildState *buildstate);
148 static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
149 static void gistBuildCallback(Relation index,
150  ItemPointer tid,
151  Datum *values,
152  bool *isnull,
153  bool tupleIsAlive,
154  void *state);
155 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
156  IndexTuple itup);
157 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
158  BlockNumber startblkno, int startlevel);
160  Buffer buffer, int level,
161  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
162  BlockNumber parentblk, OffsetNumber downlinkoffnum);
164  BlockNumber childblkno, int level,
165  BlockNumber *parentblkno,
166  OffsetNumber *downlinkoffnum);
167 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
168 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
169 static int gistGetMaxLevel(Relation index);
170 
171 static void gistInitParentMap(GISTBuildState *buildstate);
172 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
174 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
175  Buffer parentbuf);
176 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
177 
179 /*
180  * Main entry point to GiST index build.
181  */
/* NOTE(review): the function signature (original lines 182-183) was lost in
 * extraction. From the return value and the names used below it is
 * gistbuild(heap, index, indexInfo) returning IndexBuildResult * — confirm
 * against upstream. */
184 {
185  IndexBuildResult *result;
186  double reltuples;
187  GISTBuildState buildstate;
/* NOTE(review): line 188 lost — presumably the 'oldcxt' MemoryContext
 * declaration used by MemoryContextSwitchTo(oldcxt) near the end. */
189  int fillfactor;
190  Oid SortSupportFnOids[INDEX_MAX_KEYS];
191  GiSTOptions *options = (GiSTOptions *) index->rd_options;
192 
193  /*
194  * We expect to be called exactly once for any index relation. If that's
195  * not the case, big trouble's what we have.
196  */
/* NOTE(review): line 197 lost — the guard condition for the elog() below
 * (an emptiness check on the index relation). */
198  elog(ERROR, "index \"%s\" already contains data",
/* NOTE(review): line 199 lost — the elog argument (index name). */
200 
201  buildstate.indexrel = index;
202  buildstate.heaprel = heap;
203  buildstate.sortstate = NULL;
204  buildstate.giststate = initGISTstate(index);
205 
206  /*
207  * Create a temporary memory context that is reset once for each tuple
208  * processed. (Note: we don't bother to make this a child of the
209  * giststate's scanCxt, so we have to delete it separately at the end.)
210  */
211  buildstate.giststate->tempCxt = createTempGistContext();
212 
213  /*
214  * Choose build strategy. First check whether the user specified to use
215  * buffering mode. (The use-case for that in the field is somewhat
216  * questionable perhaps, but it's important for testing purposes.)
217  */
218  if (options)
219  {
220  if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
221  buildstate.buildMode = GIST_BUFFERING_STATS;
222  else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
223  buildstate.buildMode = GIST_BUFFERING_DISABLED;
224  else /* must be "auto" */
225  buildstate.buildMode = GIST_BUFFERING_AUTO;
226  }
227  else
228  {
229  buildstate.buildMode = GIST_BUFFERING_AUTO;
230  }
231 
232  /*
233  * Unless buffering mode was forced, see if we can use sorting instead.
234  */
235  if (buildstate.buildMode != GIST_BUFFERING_STATS)
236  {
237  bool hasallsortsupports = true;
/* NOTE(review): line 238 lost — the 'keyscount' declaration used by the
 * loop below (number of key attributes of the index). */
239 
240  for (int i = 0; i < keyscount; i++)
241  {
242  SortSupportFnOids[i] = index_getprocid(index, i + 1,
/* NOTE(review): line 243 lost — the support-proc number argument. */
244  if (!OidIsValid(SortSupportFnOids[i]))
245  {
246  hasallsortsupports = false;
247  break;
248  }
249  }
250  if (hasallsortsupports)
251  buildstate.buildMode = GIST_SORTED_BUILD;
252  }
253 
254  /*
255  * Calculate target amount of free space to leave on pages.
256  */
/* NOTE(review): line 257 lost — the 'fillfactor' assignment (from options
 * or the GiST default) feeding the computation below. */
258  buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
259 
260  /*
261  * Build the index using the chosen strategy.
262  */
263  buildstate.indtuples = 0;
264  buildstate.indtuplesSize = 0;
265 
266  if (buildstate.buildMode == GIST_SORTED_BUILD)
267  {
268  /*
269  * Sort all data, build the index from bottom up.
270  */
271  buildstate.sortstate = tuplesort_begin_index_gist(heap,
272  index,
/* NOTE(review): lines 273 and 275 lost — remaining tuplesort_begin
 * arguments (work memory and flags, presumably). */
274  NULL,
276 
277  /* Scan the table, adding all tuples to the tuplesort */
278  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
/* NOTE(review): line 279 lost — the callback argument (the sorted-build
 * per-tuple callback defined below). */
280  (void *) &buildstate, NULL);
281 
282  /*
283  * Perform the sort and build index pages.
284  */
285  tuplesort_performsort(buildstate.sortstate);
286 
287  gist_indexsortbuild(&buildstate);
288 
289  tuplesort_end(buildstate.sortstate);
290  }
291  else
292  {
293  /*
294  * Initialize an empty index and insert all tuples, possibly using
295  * buffers on intermediate levels.
296  */
297  Buffer buffer;
298  Page page;
299 
300  /* initialize the root page */
301  buffer = gistNewBuffer(index, heap);
/* NOTE(review): lines 302 and 305 lost (likely a root-block assertion
 * and a blank/critical-section line). */
303  page = BufferGetPage(buffer);
304 
306 
307  GISTInitBuffer(buffer, F_LEAF);
308 
309  MarkBufferDirty(buffer);
310  PageSetLSN(page, GistBuildLSN);
311 
312  UnlockReleaseBuffer(buffer);
313 
/* NOTE(review): line 314 lost. */
315 
316  /* Scan the table, inserting all the tuples to the index. */
317  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
/* NOTE(review): line 318 lost — the gistBuildCallback argument. */
319  (void *) &buildstate, NULL);
320 
321  /*
322  * If buffering was used, flush out all the tuples that are still in
323  * the buffers.
324  */
325  if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
326  {
327  elog(DEBUG1, "all tuples processed, emptying buffers");
328  gistEmptyAllBuffers(&buildstate);
329  gistFreeBuildBuffers(buildstate.gfbb);
330  }
331 
332  /*
333  * We didn't write WAL records as we built the index, so if
334  * WAL-logging is required, write all pages to the WAL now.
335  */
336  if (RelationNeedsWAL(index))
337  {
/* NOTE(review): lines 338-339 lost — the WAL-logging call (log_newpage_range
 * over the whole relation, presumably). */
340  true);
341  }
342  }
343 
344  /* okay, all heap tuples are indexed */
345  MemoryContextSwitchTo(oldcxt);
/* NOTE(review): line 346 lost — deletion of the temp context, per the
 * comment above that it must be deleted separately. */
347 
348  freeGISTstate(buildstate.giststate);
349 
350  /*
351  * Return statistics
352  */
353  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
354 
355  result->heap_tuples = reltuples;
356  result->index_tuples = (double) buildstate.indtuples;
357 
358  return result;
359 }
360 
361 /*-------------------------------------------------------------------------
362  * Routines for sorted build
363  *-------------------------------------------------------------------------
364  */
365 
366 /*
367  * Per-tuple callback for table_index_build_scan.
368  */
369 static void
/* NOTE(review): line 370 lost — the callback's name and first parameter
 * (Relation index, per the gistCompressValues(... , index, ...) call
 * below). Compresses the key values and feeds them to the tuplesort. */
371  ItemPointer tid,
372  Datum *values,
373  bool *isnull,
374  bool tupleIsAlive,
375  void *state)
376 {
377  GISTBuildState *buildstate = (GISTBuildState *) state;
378  MemoryContext oldCtx;
379  Datum compressed_values[INDEX_MAX_KEYS];
380 
381  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
382 
383  /* Form an index tuple and point it at the heap tuple */
384  gistCompressValues(buildstate->giststate, index,
385  values, isnull,
386  true, compressed_values);
387 
/* NOTE(review): line 388 lost — the start of the tuplesort put call that
 * takes the arguments listed below. */
389  buildstate->indexrel,
390  tid,
391  compressed_values, isnull);
392 
393  MemoryContextSwitchTo(oldCtx);
394  MemoryContextReset(buildstate->giststate->tempCxt);
395 
396  /* Update tuple count. */
397  buildstate->indtuples += 1;
398 }
399 
400 /*
401  * Build GiST index from bottom up from pre-sorted tuples.
402  */
403 static void
/* NOTE(review): line 404 lost — the signature; the caller at gistbuild()
 * invokes gist_indexsortbuild(&buildstate), so the parameter is
 * GISTBuildState *state. */
405 {
406  IndexTuple itup;
407  GistSortedBuildLevelState *levelstate;
408  Page page;
409 
410  state->pages_allocated = 0;
411  state->pages_written = 0;
412  state->ready_num_pages = 0;
413 
414  /*
415  * Write an empty page as a placeholder for the root page. It will be
416  * replaced with the real root page at the end.
417  */
/* NOTE(review): lines 418-419 lost — allocation of 'page' and the start of
 * the smgrextend() call whose trailing arguments appear below. */
420  page, true);
421  state->pages_allocated++;
422  state->pages_written++;
423 
424  /* Allocate a temporary buffer for the first leaf page batch. */
425  levelstate = palloc0(sizeof(GistSortedBuildLevelState));
426  levelstate->pages[0] = page;
427  levelstate->parent = NULL;
428  gistinitpage(page, F_LEAF);
429 
430  /*
431  * Fill index pages with tuples in the sorted order.
432  */
433  while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
434  {
435  gist_indexsortbuild_levelstate_add(state, levelstate, itup);
436  MemoryContextReset(state->giststate->tempCxt);
437  }
438 
439  /*
440  * Write out the partially full non-root pages.
441  *
442  * Keep in mind that flush can build a new root. If number of pages is > 1
443  * then new root is required.
444  */
445  while (levelstate->parent != NULL || levelstate->current_page != 0)
446  {
/* NOTE(review): lines 447 and 449 lost — the 'parent' declaration and the
 * flush call for this level. */
448 
450  parent = levelstate->parent;
451  for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
452  if (levelstate->pages[i])
453  pfree(levelstate->pages[i]);
454  pfree(levelstate);
455  levelstate = parent;
456  }
457 
/* NOTE(review): line 458 lost — presumably a final flush of buffered
 * ready pages before writing the root. */
459 
460  /* Write out the root */
461  PageSetLSN(levelstate->pages[0], GistBuildLSN);
/* NOTE(review): lines 462-463 lost — checksum + the start of the write of
 * the root block whose trailing arguments appear below. */
464  levelstate->pages[0], true);
465  if (RelationNeedsWAL(state->indexrel))
466  log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO,
467  levelstate->pages[0], true);
468 
469  pfree(levelstate->pages[0]);
470  pfree(levelstate);
471 
472  /*
473  * When we WAL-logged index pages, we must nonetheless fsync index files.
474  * Since we're building outside shared buffers, a CHECKPOINT occurring
475  * during the build has no way to flush the previously written data to
476  * disk (indeed it won't know the index even exists). A crash later on
477  * would replay WAL from the checkpoint, therefore it wouldn't replay our
478  * earlier WAL entries. If we do not fsync those pages here, they might
479  * still not be on disk when the crash occurs.
480  */
481  if (RelationNeedsWAL(state->indexrel))
/* NOTE(review): line 482 lost — the fsync call described by the comment
 * above (an smgr immediate-sync of the main fork). */
483 }
484 
485 /*
486  * Add tuple to a page. If the pages are full, write them out and re-initialize
487  * a new page first.
488  */
489 static void
/* NOTE(review): line 490 lost — the name and first parameter; per the call
 * site above this is gist_indexsortbuild_levelstate_add(GISTBuildState *state, ...). */
491  GistSortedBuildLevelState *levelstate,
492  IndexTuple itup)
493 {
494  Size sizeNeeded;
495 
496  /* Check if tuple can be added to the current page */
497  sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
498  if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
499  {
500  Page newPage;
501  Page old_page = levelstate->pages[levelstate->current_page];
502  uint16 old_page_flags = GistPageGetOpaque(old_page)->flags;
503 
504  if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
505  {
/* NOTE(review): line 506 lost — the flush of this level's page batch when
 * all GIST_SORTED_BUILD_PAGE_NUM slots are full. */
507  }
508  else
509  levelstate->current_page++;
510 
511  if (levelstate->pages[levelstate->current_page] == NULL)
512  levelstate->pages[levelstate->current_page] =
513  palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
514 
515  newPage = levelstate->pages[levelstate->current_page];
516  gistinitpage(newPage, old_page_flags);
517  }
518 
519  gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
520 }
521 
522 static void
/* NOTE(review): line 523 lost — the flush routine's name and first
 * parameter (GISTBuildState *state, per usage below). Splits the buffered
 * tuples of one level into finished pages and pushes downlinks upward. */
524  GistSortedBuildLevelState *levelstate)
525 {
/* NOTE(review): line 526 lost — the 'parent' declaration used near the
 * end of this function. */
527  BlockNumber blkno;
528  MemoryContext oldCtx;
529  IndexTuple union_tuple;
530  SplitedPageLayout *dist;
531  IndexTuple *itvec;
532  int vect_len;
533  bool isleaf = GistPageIsLeaf(levelstate->pages[0]);
534 
/* NOTE(review): line 535 lost. */
536 
537  oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
538 
539  /* Get index tuples from first page */
540  itvec = gistextractpage(levelstate->pages[0], &vect_len);
541  if (levelstate->current_page > 0)
542  {
543  /* Append tuples from each page */
544  for (int i = 1; i < levelstate->current_page + 1; i++)
545  {
546  int len_local;
547  IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
548 
549  itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
550  pfree(itvec_local);
551  }
552 
553  /* Apply picksplit to list of all collected tuples */
554  dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
555  }
556  else
557  {
558  /* Create splitted layout from single page */
559  dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
560  union_tuple = gistunion(state->indexrel, itvec, vect_len,
561  state->giststate);
562  dist->itup = union_tuple;
563  dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
564  dist->block.num = vect_len;
565  }
566 
567  MemoryContextSwitchTo(oldCtx);
568 
569  /* Reset page counter */
570  levelstate->current_page = 0;
571 
572  /* Create pages for all partitions in split result */
573  for (; dist != NULL; dist = dist->next)
574  {
575  char *data;
576  Page target;
577 
578  /* check once per page */
/* NOTE(review): line 579 lost — presumably an interrupt check, per the
 * "check once per page" comment above. */
580 
581  /* Create page and copy data */
582  data = (char *) (dist->list);
/* NOTE(review): line 583 lost — allocation of 'target' before it is
 * initialized below. */
584  gistinitpage(target, isleaf ? F_LEAF : 0);
585  for (int i = 0; i < dist->block.num; i++)
586  {
587  IndexTuple thistup = (IndexTuple) data;
588 
589  if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
590  elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
591 
592  data += IndexTupleSize(thistup);
593  }
594  union_tuple = dist->itup;
595 
596  if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
/* NOTE(review): line 597 lost — the flush of accumulated ready pages when
 * the XLR_MAX_BLOCK_ID batch limit is reached. */
598 
599  /*
600  * The page is now complete. Assign a block number to it, and add it
601  * to the list of finished pages. (We don't write it out immediately,
602  * because we want to WAL-log the pages in batches.)
603  */
604  blkno = state->pages_allocated++;
605  state->ready_blknos[state->ready_num_pages] = blkno;
606  state->ready_pages[state->ready_num_pages] = target;
607  state->ready_num_pages++;
608  ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
609 
610  /*
611  * Set the right link to point to the previous page. This is just for
612  * debugging purposes: GiST only follows the right link if a page is
613  * split concurrently to a scan, and that cannot happen during index
614  * build.
615  *
616  * It's a bit counterintuitive that we set the right link on the new
617  * page to point to the previous page, not the other way around. But
618  * GiST pages are not ordered like B-tree pages are, so as long as the
619  * right-links form a chain through all the pages at the same level,
620  * the order doesn't matter.
621  */
622  if (levelstate->last_blkno)
623  GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
624  levelstate->last_blkno = blkno;
625 
626  /*
627  * Insert the downlink to the parent page. If this was the root,
628  * create a new page as the parent, which becomes the new root.
629  */
630  parent = levelstate->parent;
631  if (parent == NULL)
632  {
/* NOTE(review): line 633 lost — allocation of the new parent levelstate. */
634  parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
635  parent->parent = NULL;
636  gistinitpage(parent->pages[0], 0);
637 
638  levelstate->parent = parent;
639  }
/* NOTE(review): line 640 lost — insertion of 'union_tuple' (the downlink)
 * into the parent level, per the comment above. */
641  }
642 }
643 
644 static void
/* NOTE(review): line 645 lost — the signature (name and parameter
 * GISTBuildState *state, per usage below). Writes out the batch of
 * finished pages in block-number order and WAL-logs them together. */
646 {
647  if (state->ready_num_pages == 0)
648  return;
649 
650  for (int i = 0; i < state->ready_num_pages; i++)
651  {
652  Page page = state->ready_pages[i];
653  BlockNumber blkno = state->ready_blknos[i];
654 
655  /* Currently, the blocks must be buffered in order. */
656  if (blkno != state->pages_written)
657  elog(ERROR, "unexpected block number to flush GiST sorting build");
658 
659  PageSetLSN(page, GistBuildLSN);
660  PageSetChecksumInplace(page, blkno);
661  smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
662  true);
663 
664  state->pages_written++;
665  }
666 
667  if (RelationNeedsWAL(state->indexrel))
668  log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages,
669  state->ready_blknos, state->ready_pages, true);
670 
671  for (int i = 0; i < state->ready_num_pages; i++)
672  pfree(state->ready_pages[i]);
673 
674  state->ready_num_pages = 0;
675 }
676 
677 
678 /*-------------------------------------------------------------------------
679  * Routines for non-sorted build
680  *-------------------------------------------------------------------------
681  */
682 
683 /*
684  * Attempt to switch to buffering mode.
685  *
686  * If there is not enough memory for buffering build, sets bufferingMode
687  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
688  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
689  * GIST_BUFFERING_ACTIVE.
690  */
691 static void
/* NOTE(review): line 692 lost — the signature; per the prototype above it
 * is gistInitBuffering(GISTBuildState *buildstate). */
693 {
694  Relation index = buildstate->indexrel;
695  int pagesPerBuffer;
696  Size pageFreeSpace;
697  Size itupAvgSize,
698  itupMinSize;
699  double avgIndexTuplesPerPage,
700  maxIndexTuplesPerPage;
701  int i;
702  int levelStep;
703 
704  /* Calc space of index page which is available for index tuples */
705  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
706  - sizeof(ItemIdData)
707  - buildstate->freespace;
708 
709  /*
710  * Calculate average size of already inserted index tuples using gathered
711  * statistics.
712  */
713  itupAvgSize = (double) buildstate->indtuplesSize /
714  (double) buildstate->indtuples;
715 
716  /*
717  * Calculate minimal possible size of index tuple by index metadata.
718  * Minimal possible size of varlena is VARHDRSZ.
719  *
720  * XXX: that's not actually true, as a short varlen can be just 2 bytes.
721  * And we should take padding into account here.
722  */
723  itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
724  for (i = 0; i < index->rd_att->natts; i++)
725  {
726  if (TupleDescAttr(index->rd_att, i)->attlen < 0)
727  itupMinSize += VARHDRSZ;
728  else
729  itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
730  }
731 
732  /* Calculate average and maximal number of index tuples which fit to page */
733  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
734  maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
735 
736  /*
737  * We need to calculate two parameters for the buffering algorithm:
738  * levelStep and pagesPerBuffer.
739  *
740  * levelStep determines the size of subtree that we operate on, while
741  * emptying a buffer. A higher value is better, as you need fewer buffer
742  * emptying steps to build the index. However, if you set it too high, the
743  * subtree doesn't fit in cache anymore, and you quickly lose the benefit
744  * of the buffers.
745  *
746  * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
747  * the number of tuples on page (ie. fanout), and M is the amount of
748  * internal memory available. Curiously, they doesn't explain *why* that
749  * setting is optimal. We calculate it by taking the highest levelStep so
750  * that a subtree still fits in cache. For a small B, our way of
751  * calculating levelStep is very close to Arge et al's formula. For a
752  * large B, our formula gives a value that is 2x higher.
753  *
754  * The average size (in pages) of a subtree of depth n can be calculated
755  * as a geometric series:
756  *
757  * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
758  *
759  * where B is the average number of index tuples on page. The subtree is
760  * cached in the shared buffer cache and the OS cache, so we choose
761  * levelStep so that the subtree size is comfortably smaller than
762  * effective_cache_size, with a safety factor of 4.
763  *
764  * The estimate on the average number of index tuples on page is based on
765  * average tuple sizes observed before switching to buffered build, so the
766  * real subtree size can be somewhat larger. Also, it would selfish to
767  * gobble the whole cache for our index build. The safety factor of 4
768  * should account for those effects.
769  *
770  * The other limiting factor for setting levelStep is that while
771  * processing a subtree, we need to hold one page for each buffer at the
772  * next lower buffered level. The max. number of buffers needed for that
773  * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
774  * hopefully maintenance_work_mem is set high enough that you're
775  * constrained by effective_cache_size rather than maintenance_work_mem.
776  *
777  * XXX: the buffer hash table consumes a fair amount of memory too per
778  * buffer, but that is not currently taken into account. That scales on
779  * the total number of buffers used, ie. the index size and on levelStep.
780  * Note that a higher levelStep *reduces* the amount of memory needed for
781  * the hash table.
782  */
783  levelStep = 1;
784  for (;;)
785  {
786  double subtreesize;
787  double maxlowestlevelpages;
788 
789  /* size of an average subtree at this levelStep (in pages). */
790  subtreesize =
791  (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
792  (1 - avgIndexTuplesPerPage);
793 
794  /* max number of pages at the lowest level of a subtree */
795  maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
796 
797  /* subtree must fit in cache (with safety factor of 4) */
798  if (subtreesize > effective_cache_size / 4)
799  break;
800 
801  /* each node in the lowest level of a subtree has one page in memory */
802  if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
803  break;
804 
805  /* Good, we can handle this levelStep. See if we can go one higher. */
806  levelStep++;
807  }
808 
809  /*
810  * We just reached an unacceptable value of levelStep in previous loop.
811  * So, decrease levelStep to get last acceptable value.
812  */
813  levelStep--;
814 
815  /*
816  * If there's not enough cache or maintenance_work_mem, fall back to plain
817  * inserts.
818  */
819  if (levelStep <= 0)
820  {
821  elog(DEBUG1, "failed to switch to buffered GiST build");
822  buildstate->buildMode = GIST_BUFFERING_DISABLED;
823  return;
824  }
825 
826  /*
827  * The second parameter to set is pagesPerBuffer, which determines the
828  * size of each buffer. We adjust pagesPerBuffer also during the build,
829  * which is why this calculation is in a separate function.
830  */
831  pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
832 
833  /* Initialize GISTBuildBuffers with these parameters */
834  buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
/* NOTE(review): line 835 lost — the final gistInitBuildBuffers argument
 * (likely the current max level of the index; see gistGetMaxLevel()). */
836 
837  gistInitParentMap(buildstate);
838 
839  buildstate->buildMode = GIST_BUFFERING_ACTIVE;
840 
841  elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
842  levelStep, pagesPerBuffer);
843 }
844 
845 /*
846  * Calculate pagesPerBuffer parameter for the buffering algorithm.
847  *
848  * Buffer size is chosen so that assuming that tuples are distributed
849  * randomly, emptying half a buffer fills on average one page in every buffer
850  * at the next lower level.
851  */
852 static int
853 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
854 {
855  double pagesPerBuffer;
856  double avgIndexTuplesPerPage;
857  double itupAvgSize;
858  Size pageFreeSpace;
859 
860  /* Calc space of index page which is available for index tuples */
861  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
862  - sizeof(ItemIdData)
863  - buildstate->freespace;
864 
865  /*
866  * Calculate average size of already inserted index tuples using gathered
867  * statistics.
868  */
869  itupAvgSize = (double) buildstate->indtuplesSize /
870  (double) buildstate->indtuples;
871 
872  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
873 
874  /*
875  * Recalculate required size of buffers.
876  */
877  pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
878 
879  return (int) rint(pagesPerBuffer);
880 }
881 
882 /*
883  * Per-tuple callback for table_index_build_scan.
884  */
885 static void
/* NOTE(review): line 886 lost — per the prototype above this is
 * gistBuildCallback(Relation index, ...). Forms an index tuple and either
 * pushes it into the build buffers or inserts it directly. */
887  ItemPointer tid,
888  Datum *values,
889  bool *isnull,
890  bool tupleIsAlive,
891  void *state)
892 {
893  GISTBuildState *buildstate = (GISTBuildState *) state;
894  IndexTuple itup;
895  MemoryContext oldCtx;
896 
897  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
898 
899  /* form an index tuple and point it at the heap tuple */
900  itup = gistFormTuple(buildstate->giststate, index,
901  values, isnull,
902  true);
903  itup->t_tid = *tid;
904 
905  /* Update tuple count and total size. */
906  buildstate->indtuples += 1;
907  buildstate->indtuplesSize += IndexTupleSize(itup);
908 
909  /*
910  * XXX In buffering builds, the tempCxt is also reset down inside
911  * gistProcessEmptyingQueue(). This is not great because it risks
912  * confusion and possible use of dangling pointers (for example, itup
913  * might be already freed when control returns here). It's generally
914  * better that a memory context be "owned" by only one function. However,
915  * currently this isn't causing issues so it doesn't seem worth the amount
916  * of refactoring that would be needed to avoid it.
917  */
918  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
919  {
920  /* We have buffers, so use them. */
921  gistBufferingBuildInsert(buildstate, itup);
922  }
923  else
924  {
925  /*
926  * There's no buffers (yet). Since we already have the index relation
927  * locked, we call gistdoinsert directly.
928  */
929  gistdoinsert(index, itup, buildstate->freespace,
930  buildstate->giststate, buildstate->heaprel, true);
931  }
932 
933  MemoryContextSwitchTo(oldCtx);
934  MemoryContextReset(buildstate->giststate->tempCxt);
935 
936  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
/* NOTE(review): line 937 lost — the second half of this condition
 * (a modulus test against BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET, per
 * that macro's comment at the top of the file). */
938  {
939  /* Adjust the target buffer size now */
940  buildstate->gfbb->pagesPerBuffer =
941  calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
942  }
943 
944  /*
945  * In 'auto' mode, check if the index has grown too large to fit in cache,
946  * and switch to buffering mode if it has.
947  *
948  * To avoid excessive calls to smgrnblocks(), only check this every
949  * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
950  *
951  * In 'stats' state, switch as soon as we have seen enough tuples to have
952  * some idea of the average tuple size.
953  */
954  if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
955  buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
/* NOTE(review): lines 956 and 959 lost — the cache-size comparison against
 * smgrnblocks() and the stats-mode tuple-count threshold, per the comment
 * above. */
957  MAIN_FORKNUM)) ||
958  (buildstate->buildMode == GIST_BUFFERING_STATS &&
960  {
961  /*
962  * Index doesn't fit in effective cache anymore. Try to switch to
963  * buffering build mode.
964  */
965  gistInitBuffering(buildstate);
966  }
967 }
968 
969 /*
970  * Insert function for buffering index build.
971  */
972 static void
974 {
975  /* Insert the tuple to buffers. */
976  gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
977 
978  /* If we filled up (half of a) buffer, process buffer emptying. */
979  gistProcessEmptyingQueue(buildstate);
980 }
981 
982 /*
983  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
984  * page or node buffer, and inserts the tuple there. Returns true if we have
985  * to stop buffer emptying process (because one of child buffers can't take
986  * index tuples anymore).
987  */
988 static bool
990  BlockNumber startblkno, int startlevel)
991 {
992  GISTSTATE *giststate = buildstate->giststate;
993  GISTBuildBuffers *gfbb = buildstate->gfbb;
994  Relation indexrel = buildstate->indexrel;
995  BlockNumber childblkno;
996  Buffer buffer;
997  bool result = false;
998  BlockNumber blkno;
999  int level;
1000  OffsetNumber downlinkoffnum = InvalidOffsetNumber;
1001  BlockNumber parentblkno = InvalidBlockNumber;
1002 
1004 
1005  /*
1006  * Loop until we reach a leaf page (level == 0) or a level with buffers
1007  * (not including the level we start at, because we would otherwise make
1008  * no progress).
1009  */
1010  blkno = startblkno;
1011  level = startlevel;
1012  for (;;)
1013  {
1014  ItemId iid;
1015  IndexTuple idxtuple,
1016  newtup;
1017  Page page;
1018  OffsetNumber childoffnum;
1019 
1020  /* Have we reached a level with buffers? */
1021  if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
1022  break;
1023 
1024  /* Have we reached a leaf page? */
1025  if (level == 0)
1026  break;
1027 
1028  /*
1029  * Nope. Descend down to the next level then. Choose a child to
1030  * descend down to.
1031  */
1032 
1033  buffer = ReadBuffer(indexrel, blkno);
1034  LockBuffer(buffer, GIST_EXCLUSIVE);
1035 
1036  page = (Page) BufferGetPage(buffer);
1037  childoffnum = gistchoose(indexrel, page, itup, giststate);
1038  iid = PageGetItemId(page, childoffnum);
1039  idxtuple = (IndexTuple) PageGetItem(page, iid);
1040  childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1041 
1042  if (level > 1)
1043  gistMemorizeParent(buildstate, childblkno, blkno);
1044 
1045  /*
1046  * Check that the key representing the target child node is consistent
1047  * with the key we're inserting. Update it if it's not.
1048  */
1049  newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
1050  if (newtup)
1051  {
1052  blkno = gistbufferinginserttuples(buildstate,
1053  buffer,
1054  level,
1055  &newtup,
1056  1,
1057  childoffnum,
1060  /* gistbufferinginserttuples() released the buffer */
1061  }
1062  else
1063  UnlockReleaseBuffer(buffer);
1064 
1065  /* Descend to the child */
1066  parentblkno = blkno;
1067  blkno = childblkno;
1068  downlinkoffnum = childoffnum;
1069  Assert(level > 0);
1070  level--;
1071  }
1072 
1073  if (LEVEL_HAS_BUFFERS(level, gfbb))
1074  {
1075  /*
1076  * We've reached level with buffers. Place the index tuple to the
1077  * buffer, and add the buffer to the emptying queue if it overflows.
1078  */
1079  GISTNodeBuffer *childNodeBuffer;
1080 
1081  /* Find the buffer or create a new one */
1082  childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
1083 
1084  /* Add index tuple to it */
1085  gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
1086 
1087  if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
1088  result = true;
1089  }
1090  else
1091  {
1092  /*
1093  * We've reached a leaf page. Place the tuple here.
1094  */
1095  Assert(level == 0);
1096  buffer = ReadBuffer(indexrel, blkno);
1097  LockBuffer(buffer, GIST_EXCLUSIVE);
1098  gistbufferinginserttuples(buildstate, buffer, level,
1099  &itup, 1, InvalidOffsetNumber,
1100  parentblkno, downlinkoffnum);
1101  /* gistbufferinginserttuples() released the buffer */
1102  }
1103 
1104  return result;
1105 }
1106 
1107 /*
1108  * Insert tuples to a given page.
1109  *
1110  * This is analogous with gistinserttuples() in the regular insertion code.
1111  *
1112  * Returns the block number of the page where the (first) new or updated tuple
1113  * was inserted. Usually that's the original page, but might be a sibling page
1114  * if the original page was split.
1115  *
1116  * Caller should hold a lock on 'buffer' on entry. This function will unlock
1117  * and unpin it.
1118  */
1119 static BlockNumber
1120 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
1121  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
1122  BlockNumber parentblk, OffsetNumber downlinkoffnum)
1123 {
1124  GISTBuildBuffers *gfbb = buildstate->gfbb;
1125  List *splitinfo;
1126  bool is_split;
1127  BlockNumber placed_to_blk = InvalidBlockNumber;
1128 
1129  is_split = gistplacetopage(buildstate->indexrel,
1130  buildstate->freespace,
1131  buildstate->giststate,
1132  buffer,
1133  itup, ntup, oldoffnum, &placed_to_blk,
1134  InvalidBuffer,
1135  &splitinfo,
1136  false,
1137  buildstate->heaprel, true);
1138 
1139  /*
1140  * If this is a root split, update the root path item kept in memory. This
1141  * ensures that all path stacks are always complete, including all parent
1142  * nodes up to the root. That simplifies the algorithm to re-find correct
1143  * parent.
1144  */
1145  if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
1146  {
1147  Page page = BufferGetPage(buffer);
1148  OffsetNumber off;
1149  OffsetNumber maxoff;
1150 
1151  Assert(level == gfbb->rootlevel);
1152  gfbb->rootlevel++;
1153 
1154  elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
1155 
1156  /*
1157  * All the downlinks on the old root page are now on one of the child
1158  * pages. Visit all the new child pages to memorize the parents of the
1159  * grandchildren.
1160  */
1161  if (gfbb->rootlevel > 1)
1162  {
1163  maxoff = PageGetMaxOffsetNumber(page);
1164  for (off = FirstOffsetNumber; off <= maxoff; off++)
1165  {
1166  ItemId iid = PageGetItemId(page, off);
1167  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1168  BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1169  Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
1170 
1171  LockBuffer(childbuf, GIST_SHARE);
1172  gistMemorizeAllDownlinks(buildstate, childbuf);
1173  UnlockReleaseBuffer(childbuf);
1174 
1175  /*
1176  * Also remember that the parent of the new child page is the
1177  * root block.
1178  */
1179  gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
1180  }
1181  }
1182  }
1183 
1184  if (splitinfo)
1185  {
1186  /*
1187  * Insert the downlinks to the parent. This is analogous with
1188  * gistfinishsplit() in the regular insertion code, but the locking is
1189  * simpler, and we have to maintain the buffers on internal nodes and
1190  * the parent map.
1191  */
1192  IndexTuple *downlinks;
1193  int ndownlinks,
1194  i;
1195  Buffer parentBuffer;
1196  ListCell *lc;
1197 
1198  /* Parent may have changed since we memorized this path. */
1199  parentBuffer =
1200  gistBufferingFindCorrectParent(buildstate,
1201  BufferGetBlockNumber(buffer),
1202  level,
1203  &parentblk,
1204  &downlinkoffnum);
1205 
1206  /*
1207  * If there's a buffer associated with this page, that needs to be
1208  * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
1209  * downlinks in 'splitinfo', to make sure they're consistent not only
1210  * with the tuples already on the pages, but also the tuples in the
1211  * buffers that will eventually be inserted to them.
1212  */
1214  buildstate->giststate,
1215  buildstate->indexrel,
1216  level,
1217  buffer, splitinfo);
1218 
1219  /* Create an array of all the downlink tuples */
1220  ndownlinks = list_length(splitinfo);
1221  downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
1222  i = 0;
1223  foreach(lc, splitinfo)
1224  {
1225  GISTPageSplitInfo *splitinfo = lfirst(lc);
1226 
1227  /*
1228  * Remember the parent of each new child page in our parent map.
1229  * This assumes that the downlinks fit on the parent page. If the
1230  * parent page is split, too, when we recurse up to insert the
1231  * downlinks, the recursive gistbufferinginserttuples() call will
1232  * update the map again.
1233  */
1234  if (level > 0)
1235  gistMemorizeParent(buildstate,
1236  BufferGetBlockNumber(splitinfo->buf),
1237  BufferGetBlockNumber(parentBuffer));
1238 
1239  /*
1240  * Also update the parent map for all the downlinks that got moved
1241  * to a different page. (actually this also loops through the
1242  * downlinks that stayed on the original page, but it does no
1243  * harm).
1244  */
1245  if (level > 1)
1246  gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
1247 
1248  /*
1249  * Since there's no concurrent access, we can release the lower
1250  * level buffers immediately. This includes the original page.
1251  */
1252  UnlockReleaseBuffer(splitinfo->buf);
1253  downlinks[i++] = splitinfo->downlink;
1254  }
1255 
1256  /* Insert them into parent. */
1257  gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
1258  downlinks, ndownlinks, downlinkoffnum,
1260 
1261  list_free_deep(splitinfo); /* we don't need this anymore */
1262  }
1263  else
1264  UnlockReleaseBuffer(buffer);
1265 
1266  return placed_to_blk;
1267 }
1268 
1269 /*
1270  * Find the downlink pointing to a child page.
1271  *
1272  * 'childblkno' indicates the child page to find the parent for. 'level' is
1273  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
1274  * point to a location where the downlink used to be - we will check that
1275  * location first, and save some cycles if it hasn't moved. The function
1276  * returns a buffer containing the downlink, exclusively-locked, and
1277  * *parentblkno and *downlinkoffnum are set to the real location of the
1278  * downlink.
1279  *
1280  * If the child page is a leaf (level == 0), the caller must supply a correct
1281  * parentblkno. Otherwise we use the parent map hash table to find the parent
1282  * block.
1283  *
1284  * This function serves the same purpose as gistFindCorrectParent() during
1285  * normal index inserts, but this is simpler because we don't need to deal
1286  * with concurrent inserts.
1287  */
1288 static Buffer
1290  BlockNumber childblkno, int level,
1291  BlockNumber *parentblkno,
1292  OffsetNumber *downlinkoffnum)
1293 {
1295  Buffer buffer;
1296  Page page;
1297  OffsetNumber maxoff;
1298  OffsetNumber off;
1299 
1300  if (level > 0)
1301  parent = gistGetParent(buildstate, childblkno);
1302  else
1303  {
1304  /*
1305  * For a leaf page, the caller must supply a correct parent block
1306  * number.
1307  */
1308  if (*parentblkno == InvalidBlockNumber)
1309  elog(ERROR, "no parent buffer provided of child %u", childblkno);
1310  parent = *parentblkno;
1311  }
1312 
1313  buffer = ReadBuffer(buildstate->indexrel, parent);
1314  page = BufferGetPage(buffer);
1315  LockBuffer(buffer, GIST_EXCLUSIVE);
1316  gistcheckpage(buildstate->indexrel, buffer);
1317  maxoff = PageGetMaxOffsetNumber(page);
1318 
1319  /* Check if it was not moved */
1320  if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
1321  *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
1322  {
1323  ItemId iid = PageGetItemId(page, *downlinkoffnum);
1324  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1325 
1326  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1327  {
1328  /* Still there */
1329  return buffer;
1330  }
1331  }
1332 
1333  /*
1334  * Downlink was not at the offset where it used to be. Scan the page to
1335  * find it. During normal gist insertions, it might've moved to another
1336  * page, to the right, but during a buffering build, we keep track of the
1337  * parent of each page in the lookup table so we should always know what
1338  * page it's on.
1339  */
1340  for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
1341  {
1342  ItemId iid = PageGetItemId(page, off);
1343  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1344 
1345  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1346  {
1347  /* yes!!, found it */
1348  *downlinkoffnum = off;
1349  return buffer;
1350  }
1351  }
1352 
1353  elog(ERROR, "failed to re-find parent for block %u", childblkno);
1354  return InvalidBuffer; /* keep compiler quiet */
1355 }
1356 
1357 /*
1358  * Process buffers emptying stack. Emptying of one buffer can cause emptying
1359  * of other buffers. This function iterates until this cascading emptying
1360  * process finished, e.g. until buffers emptying stack is empty.
1361  */
1362 static void
1364 {
1365  GISTBuildBuffers *gfbb = buildstate->gfbb;
1366 
1367  /* Iterate while we have elements in buffers emptying stack. */
1368  while (gfbb->bufferEmptyingQueue != NIL)
1369  {
1370  GISTNodeBuffer *emptyingNodeBuffer;
1371 
1372  /* Get node buffer from emptying stack. */
1373  emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
1375  emptyingNodeBuffer->queuedForEmptying = false;
1376 
1377  /*
1378  * We are going to load last pages of buffers where emptying will be
1379  * to. So let's unload any previously loaded buffers.
1380  */
1381  gistUnloadNodeBuffers(gfbb);
1382 
1383  /*
1384  * Pop tuples from the buffer and run them down to the buffers at
1385  * lower level, or leaf pages. We continue until one of the lower
1386  * level buffers fills up, or this buffer runs empty.
1387  *
1388  * In Arge et al's paper, the buffer emptying is stopped after
1389  * processing 1/2 node buffer worth of tuples, to avoid overfilling
1390  * any of the lower level buffers. However, it's more efficient to
1391  * keep going until one of the lower level buffers actually fills up,
1392  * so that's what we do. This doesn't need to be exact, if a buffer
1393  * overfills by a few tuples, there's no harm done.
1394  */
1395  while (true)
1396  {
1397  IndexTuple itup;
1398 
1399  /* Get next index tuple from the buffer */
1400  if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
1401  break;
1402 
1403  /*
1404  * Run it down to the underlying node buffer or leaf page.
1405  *
1406  * Note: it's possible that the buffer we're emptying splits as a
1407  * result of this call. If that happens, our emptyingNodeBuffer
1408  * points to the left half of the split. After split, it's very
1409  * likely that the new left buffer is no longer over the half-full
1410  * threshold, but we might as well keep flushing tuples from it
1411  * until we fill a lower-level buffer.
1412  */
1413  if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
1414  {
1415  /*
1416  * A lower level buffer filled up. Stop emptying this buffer,
1417  * to avoid overflowing the lower level buffer.
1418  */
1419  break;
1420  }
1421 
1422  /* Free all the memory allocated during index tuple processing */
1423  MemoryContextReset(buildstate->giststate->tempCxt);
1424  }
1425  }
1426 }
1427 
1428 /*
1429  * Empty all node buffers, from top to bottom. This is done at the end of
1430  * index build to flush all remaining tuples to the index.
1431  *
1432  * Note: This destroys the buffersOnLevels lists, so the buffers should not
1433  * be inserted to after this call.
1434  */
1435 static void
1437 {
1438  GISTBuildBuffers *gfbb = buildstate->gfbb;
1439  MemoryContext oldCtx;
1440  int i;
1441 
1442  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1443 
1444  /*
1445  * Iterate through the levels from top to bottom.
1446  */
1447  for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1448  {
1449  /*
1450  * Empty all buffers on this level. Note that new buffers can pop up
1451  * in the list during the processing, as a result of page splits, so a
1452  * simple walk through the list won't work. We remove buffers from the
1453  * list when we see them empty; a buffer can't become non-empty once
1454  * it's been fully emptied.
1455  */
1456  while (gfbb->buffersOnLevels[i] != NIL)
1457  {
1458  GISTNodeBuffer *nodeBuffer;
1459 
1460  nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1461 
1462  if (nodeBuffer->blocksCount != 0)
1463  {
1464  /*
1465  * Add this buffer to the emptying queue, and proceed to empty
1466  * the queue.
1467  */
1468  if (!nodeBuffer->queuedForEmptying)
1469  {
1471  nodeBuffer->queuedForEmptying = true;
1472  gfbb->bufferEmptyingQueue =
1473  lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1474  MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1475  }
1476  gistProcessEmptyingQueue(buildstate);
1477  }
1478  else
1479  gfbb->buffersOnLevels[i] =
1481  }
1482  elog(DEBUG2, "emptied all buffers at level %d", i);
1483  }
1484  MemoryContextSwitchTo(oldCtx);
1485 }
1486 
1487 /*
1488  * Get the depth of the GiST index.
1489  */
1490 static int
1492 {
1493  int maxLevel;
1494  BlockNumber blkno;
1495 
1496  /*
1497  * Traverse down the tree, starting from the root, until we hit the leaf
1498  * level.
1499  */
1500  maxLevel = 0;
1501  blkno = GIST_ROOT_BLKNO;
1502  while (true)
1503  {
1504  Buffer buffer;
1505  Page page;
1506  IndexTuple itup;
1507 
1508  buffer = ReadBuffer(index, blkno);
1509 
1510  /*
1511  * There's no concurrent access during index build, so locking is just
1512  * pro forma.
1513  */
1514  LockBuffer(buffer, GIST_SHARE);
1515  page = (Page) BufferGetPage(buffer);
1516 
1517  if (GistPageIsLeaf(page))
1518  {
1519  /* We hit the bottom, so we're done. */
1520  UnlockReleaseBuffer(buffer);
1521  break;
1522  }
1523 
1524  /*
1525  * Pick the first downlink on the page, and follow it. It doesn't
1526  * matter which downlink we choose, the tree has the same depth
1527  * everywhere, so we just pick the first one.
1528  */
1529  itup = (IndexTuple) PageGetItem(page,
1531  blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1532  UnlockReleaseBuffer(buffer);
1533 
1534  /*
1535  * We're going down on the tree. It means that there is yet one more
1536  * level in the tree.
1537  */
1538  maxLevel++;
1539  }
1540  return maxLevel;
1541 }
1542 
1543 
1544 /*
1545  * Routines for managing the parent map.
1546  *
1547  * Whenever a page is split, we need to insert the downlinks into the parent.
1548  * We need to somehow find the parent page to do that. In normal insertions,
1549  * we keep a stack of nodes visited when we descend the tree. However, in
1550  * buffering build, we can start descending the tree from any internal node,
1551  * when we empty a buffer by cascading tuples to its children. So we don't
1552  * have a full stack up to the root available at that time.
1553  *
1554  * So instead, we maintain a hash table to track the parent of every internal
1555  * page. We don't need to track the parents of leaf nodes, however. Whenever
1556  * we insert to a leaf, we've just descended down from its parent, so we know
1557  * its immediate parent already. This helps a lot to limit the memory used
1558  * by this hash table.
1559  *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of each new child page needs to be recorded, and the entries
 * for all pages whose downlinks were moved to a different page by the split
 * need to be updated as well.
1564  *
1565  * We also update the parent map whenever we descend the tree. That might seem
1566  * unnecessary, because we maintain the map whenever a downlink is moved or
1567  * created, but it is needed because we switch to buffering mode after
1568  * creating a tree with regular index inserts. Any pages created before
1569  * switching to buffering mode will not be present in the parent map initially,
1570  * but will be added there the first time we visit them.
1571  */
1572 
1573 typedef struct
1574 {
1575  BlockNumber childblkno; /* hash key */
1577 } ParentMapEntry;
1578 
1579 static void
1581 {
1582  HASHCTL hashCtl;
1583 
1584  hashCtl.keysize = sizeof(BlockNumber);
1585  hashCtl.entrysize = sizeof(ParentMapEntry);
1586  hashCtl.hcxt = CurrentMemoryContext;
1587  buildstate->parentMap = hash_create("gistbuild parent map",
1588  1024,
1589  &hashCtl,
1591 }
1592 
1593 static void
1595 {
1596  ParentMapEntry *entry;
1597  bool found;
1598 
1599  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1600  &child,
1601  HASH_ENTER,
1602  &found);
1603  entry->parentblkno = parent;
1604 }
1605 
1606 /*
1607  * Scan all downlinks on a page, and memorize their parent.
1608  */
1609 static void
1611 {
1612  OffsetNumber maxoff;
1613  OffsetNumber off;
1614  BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1615  Page page = BufferGetPage(parentbuf);
1616 
1617  Assert(!GistPageIsLeaf(page));
1618 
1619  maxoff = PageGetMaxOffsetNumber(page);
1620  for (off = FirstOffsetNumber; off <= maxoff; off++)
1621  {
1622  ItemId iid = PageGetItemId(page, off);
1623  IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1624  BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1625 
1626  gistMemorizeParent(buildstate, childblkno, parentblkno);
1627  }
1628 }
1629 
1630 static BlockNumber
1632 {
1633  ParentMapEntry *entry;
1634  bool found;
1635 
1636  /* Find node buffer in hash table */
1637  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1638  &child,
1639  HASH_FIND,
1640  &found);
1641  if (!found)
1642  elog(ERROR, "could not find parent of block %u in lookup table", child);
1643 
1644  return entry->parentblkno;
1645 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static Datum values[MAXATTR]
Definition: bootstrap.c:156
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3290
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4497
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2111
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4715
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:708
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:227
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:355
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:907
Pointer Page
Definition: bufpage.h:78
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:351
#define SizeOfPageHeaderData
Definition: bufpage.h:213
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:240
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:369
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:468
unsigned short uint16
Definition: c.h:489
#define MAXALIGN(LEN)
Definition: c.h:795
#define VARHDRSZ
Definition: c.h:676
#define OidIsValid(objectId)
Definition: c.h:759
size_t Size
Definition: c.h:589
int effective_cache_size
Definition: costsize.c:129
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:18
void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate, Relation heapRel, bool is_build)
Definition: gist.c:633
bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright, Relation heapRel, bool is_build)
Definition: gist.c:224
SplitedPageLayout * gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate)
Definition: gist.c:1417
GISTSTATE * initGISTstate(Relation index)
Definition: gist.c:1504
MemoryContext createTempGistContext(void)
Definition: gist.c:121
void freeGISTstate(GISTSTATE *giststate)
Definition: gist.c:1632
#define F_LEAF
Definition: gist.h:46
#define GIST_SORTSUPPORT_PROC
Definition: gist.h:40
struct GISTPageOpaqueData GISTPageOpaqueData
#define GistPageIsLeaf(page)
Definition: gist.h:167
#define GistPageGetOpaque(page)
Definition: gist.h:165
#define GistBuildLSN
Definition: gist.h:67
#define LEVEL_HAS_BUFFERS(nlevel, gfbb)
Definition: gist_private.h:319
@ GIST_OPTION_BUFFERING_OFF
Definition: gist_private.h:388
@ GIST_OPTION_BUFFERING_ON
Definition: gist_private.h:387
#define GIST_ROOT_BLKNO
Definition: gist_private.h:262
#define GIST_DEFAULT_FILLFACTOR
Definition: gist_private.h:480
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb)
Definition: gist_private.h:332
#define GIST_EXCLUSIVE
Definition: gist_private.h:43
#define GIST_SHARE
Definition: gist_private.h:42
GistBuildMode
Definition: gistbuild.c:68
@ GIST_BUFFERING_AUTO
Definition: gistbuild.c:72
@ GIST_SORTED_BUILD
Definition: gistbuild.c:69
@ GIST_BUFFERING_ACTIVE
Definition: gistbuild.c:78
@ GIST_BUFFERING_STATS
Definition: gistbuild.c:75
@ GIST_BUFFERING_DISABLED
Definition: gistbuild.c:70
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber parentblk, OffsetNumber downlinkoffnum)
Definition: gistbuild.c:1120
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
Definition: gistbuild.c:1610
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, BlockNumber startblkno, int startlevel)
Definition: gistbuild.c:989
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate)
Definition: gistbuild.c:523
#define BUFFERING_MODE_SWITCH_CHECK_STEP
Definition: gistbuild.c:52
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, BlockNumber childblkno, int level, BlockNumber *parentblkno, OffsetNumber *downlinkoffnum)
Definition: gistbuild.c:1289
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
Definition: gistbuild.c:853
static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, GistSortedBuildLevelState *levelstate, IndexTuple itup)
Definition: gistbuild.c:490
static void gist_indexsortbuild(GISTBuildState *state)
Definition: gistbuild.c:404
static void gistInitParentMap(GISTBuildState *buildstate)
Definition: gistbuild.c:1580
static void gistBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:886
static void gistSortedBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:370
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
Definition: gistbuild.c:1594
#define GIST_SORTED_BUILD_PAGE_NUM
Definition: gistbuild.c:116
static void gistEmptyAllBuffers(GISTBuildState *buildstate)
Definition: gistbuild.c:1436
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET
Definition: gistbuild.c:60
static void gistInitBuffering(GISTBuildState *buildstate)
Definition: gistbuild.c:692
static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
Definition: gistbuild.c:645
static void gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
Definition: gistbuild.c:973
IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: gistbuild.c:183
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child)
Definition: gistbuild.c:1631
static int gistGetMaxLevel(Relation index)
Definition: gistbuild.c:1491
struct GistSortedBuildLevelState GistSortedBuildLevelState
static void gistProcessEmptyingQueue(GISTBuildState *buildstate)
Definition: gistbuild.c:1363
void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple itup)
void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, Relation r, int level, Buffer buffer, List *splitinfo)
GISTBuildBuffers * gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
void gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
GISTNodeBuffer * gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, BlockNumber nodeBlocknum, int level)
bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple *itup)
void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf)
Definition: gistutil.c:575
Buffer gistNewBuffer(Relation r, Relation heaprel)
Definition: gistutil.c:824
void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
Definition: gistutil.c:34
IndexTuple * gistextractpage(Page page, int *len)
Definition: gistutil.c:95
IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
Definition: gistutil.c:316
OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate)
Definition: gistutil.c:374
IndexTuple * gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
Definition: gistutil.c:114
void gistinitpage(Page page, uint32 f)
Definition: gistutil.c:757
IndexTuple gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
Definition: gistutil.c:219
void GISTInitBuffer(Buffer b, uint32 f)
Definition: gistutil.c:773
IndexTupleData * gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
Definition: gistutil.c:127
void gistCompressValues(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf, Datum *compatt)
Definition: gistutil.c:596
void gistcheckpage(Relation rel, Buffer buf)
Definition: gistutil.c:785
int maintenance_work_mem
Definition: globals.c:127
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:777
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
Pointer Item
Definition: item.h:17
struct ItemIdData ItemIdData
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
Assert(fmt[strlen(fmt) - 1] !='\n')
List * list_delete_first(List *list)
Definition: list.c:942
List * lcons(void *datum, List *list)
Definition: list.c:494
void list_free_deep(List *list)
Definition: list.c:1559
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:330
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
MemoryContext CurrentMemoryContext
Definition: mcxt.c:135
void * palloc_aligned(Size size, Size alignto, int flags)
Definition: mcxt.c:1446
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:403
void * palloc(Size size)
Definition: mcxt.c:1226
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:138
#define INDEX_MAX_KEYS
#define PG_IO_ALIGN_SIZE
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define linitial(l)
Definition: pg_list.h:178
int fillfactor
Definition: pgbench.c:197
uintptr_t Datum
Definition: postgres.h:64
unsigned int Oid
Definition: postgres_ext.h:31
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:572
#define RelationGetRelationName(relation)
Definition: rel.h:538
#define RelationNeedsWAL(relation)
Definition: rel.h:629
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:523
@ MAIN_FORKNUM
Definition: relpath.h:50
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:608
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:583
void smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:720
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:497
List * bufferEmptyingQueue
Definition: gist_private.h:357
List ** buffersOnLevels
Definition: gist_private.h:368
MemoryContext context
Definition: gist_private.h:341
GISTBuildBuffers * gfbb
Definition: gistbuild.c:100
BlockNumber pages_allocated
Definition: gistbuild.c:108
int ready_num_pages
Definition: gistbuild.c:111
Relation indexrel
Definition: gistbuild.c:84
GISTSTATE * giststate
Definition: gistbuild.c:86
HTAB * parentMap
Definition: gistbuild.c:101
int64 indtuples
Definition: gistbuild.c:92
Size freespace
Definition: gistbuild.c:88
Relation heaprel
Definition: gistbuild.c:85
int64 indtuplesSize
Definition: gistbuild.c:99
BlockNumber pages_written
Definition: gistbuild.c:109
Tuplesortstate * sortstate
Definition: gistbuild.c:106
GistBuildMode buildMode
Definition: gistbuild.c:90
BlockNumber nodeBlocknum
Definition: gist_private.h:300
bool queuedForEmptying
Definition: gist_private.h:307
IndexTuple downlink
Definition: gist_private.h:422
MemoryContext tempCxt
Definition: gist_private.h:78
Page pages[GIST_SORTED_BUILD_PAGE_NUM]
Definition: gistbuild.c:131
struct GistSortedBuildLevelState * parent
Definition: gistbuild.c:130
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
Definition: dynahash.c:220
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
ItemPointerData t_tid
Definition: itup.h:37
Definition: pg_list.h:54
BlockNumber parentblkno
Definition: gistbuild.c:1576
BlockNumber childblkno
Definition: gistbuild.c:1575
gistxlogPage block
Definition: gist_private.h:193
IndexTupleData * list
Definition: gist_private.h:194
struct SplitedPageLayout * next
Definition: gist_private.h:200
Definition: type.h:95
Definition: regguts.h:323
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1772
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1385
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:972
#define TUPLESORT_NONE
Definition: tuplesort.h:92
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
XLogRecPtr log_newpage(RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blkno, Page page, bool page_std)
Definition: xloginsert.c:1131
void log_newpages(RelFileLocator *rlocator, ForkNumber forknum, int num_pages, BlockNumber *blknos, Page *pages, bool page_std)
Definition: xloginsert.c:1163
void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std)
Definition: xloginsert.c:1258
#define XLR_MAX_BLOCK_ID
Definition: xlogrecord.h:239