/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *    build algorithm for the GiST index implementation.
 *
 * There are two different strategies:
 *
 * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
 *    order, and create downlinks and internal pages as we go. This builds
 *    the index from the bottom up, similar to how B-tree index build
 *    works.
 *
 * 2. Start with an empty index, and insert all tuples one by one.
 *
 * The sorted method is used if the operator classes for all columns have
 * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
 *
 * The second strategy can optionally use buffers at different levels of
 * the tree to reduce I/O, see "Buffering build algorithm" in the README
 * for a more detailed explanation. It initially calls insert over and
 * over, but switches to the buffered algorithm after a certain number of
 * tuples (unless buffering mode is disabled).
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/bulk_write.h"

#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

/* Interval, in index tuples, between checks whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
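
/*
 * How these two constants are used by gistBuildCallback() below: in 'auto'
 * mode, every BUFFERING_MODE_SWITCH_CHECK_STEP'th tuple we check whether the
 * index has outgrown effective_cache_size; in 'stats' mode, we switch to
 * buffering once BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET tuples have been
 * seen, enough to estimate the average index tuple size.
 */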

/*
 * Strategy used to build the index. It can change between the
 * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
 * that needs to be decided up-front and cannot be changed afterwards.
 */
typedef enum
{
    GIST_SORTED_BUILD,          /* bottom-up build by sorting */
    GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                 * switch */
    GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
                                 * buffering build mode if the index grows too
                                 * big */
    GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
                                 * before switching to the buffering build
                                 * mode */
    GIST_BUFFERING_ACTIVE,      /* in buffering build mode */
} GistBuildMode;

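/*
 * Transitions between these modes, as implemented below: GIST_BUFFERING_AUTO
 * and GIST_BUFFERING_STATS both switch to GIST_BUFFERING_ACTIVE via
 * gistInitBuffering(), which instead falls back to GIST_BUFFERING_DISABLED
 * if there is not enough memory for a buffering build. GIST_SORTED_BUILD is
 * decided up-front and never changes.
 */
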
/* Working state for gistbuild and its callback */
typedef struct
{
    Relation    indexrel;
    Relation    heaprel;
    GISTSTATE  *giststate;

    Size        freespace;      /* amount of free space to leave on pages */

    GistBuildMode buildMode;

    int64       indtuples;      /* number of tuples indexed */

    /*
     * Extra data structures used during a buffering build. 'gfbb' contains
     * information related to managing the build buffers. 'parentMap' is a
     * lookup table of the parent of each internal page.
     */
    int64       indtuplesSize;  /* total size of all indexed tuples */
    GISTBuildBuffers *gfbb;
    HTAB       *parentMap;

    /*
     * Extra data structures used during a sorting build.
     */
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */

    BlockNumber pages_allocated;

    BulkWriteState *bulkstate;
} GISTBuildState;

#define GIST_SORTED_BUILD_PAGE_NUM 4

/*
 * In a sorted build, we use a stack of these structs, one for each level,
 * to hold an in-memory buffer of the last pages at that level.
 *
 * Sorted GiST build requires good linearization of the sort opclass. This is
 * not always the case in multidimensional data. To tackle the anomalies, we
 * buffer index tuples and apply picksplit that can be multidimension-aware.
 */
typedef struct GistSortedBuildLevelState
{
    int         current_page;
    BlockNumber last_blkno;
    struct GistSortedBuildLevelState *parent;   /* Upper level, if any */
    Page        pages[GIST_SORTED_BUILD_PAGE_NUM];
} GistSortedBuildLevelState;

/* prototypes for private functions */

static void gistSortedBuildCallback(Relation index, ItemPointer tid,
                                    Datum *values, bool *isnull,
                                    bool tupleIsAlive, void *state);
static void gist_indexsortbuild(GISTBuildState *state);
static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
                                               GistSortedBuildLevelState *levelstate,
                                               IndexTuple itup);
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
                                                 GistSortedBuildLevelState *levelstate);

static void gistInitBuffering(GISTBuildState *buildstate);
static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
                              ItemPointer tid,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                                     IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                            BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
                                             Buffer buffer, int level,
                                             IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                                             BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                                             BlockNumber childblkno, int level,
                                             BlockNumber *parentblkno,
                                             OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int  gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
                               BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
                                     Buffer parentbuf);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);


/*
 * Main entry point to GiST index build.
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;
    Oid         SortSupportFnOids[INDEX_MAX_KEYS];
    GiSTOptions *options = (GiSTOptions *) index->rd_options;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    buildstate.indexrel = index;
    buildstate.heaprel = heap;
    buildstate.sortstate = NULL;
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed. (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /*
     * Choose build strategy. First check whether the user specified to use
     * buffering mode. (The use-case for that in the field is somewhat
     * questionable perhaps, but it's important for testing purposes.)
     */
    if (options)
    {
        if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
            buildstate.buildMode = GIST_BUFFERING_STATS;
        else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
            buildstate.buildMode = GIST_BUFFERING_DISABLED;
        else                    /* must be "auto" */
            buildstate.buildMode = GIST_BUFFERING_AUTO;
    }
    else
    {
        buildstate.buildMode = GIST_BUFFERING_AUTO;
    }

    /*
     * Unless buffering mode was forced, see if we can use sorting instead.
     */
    if (buildstate.buildMode != GIST_BUFFERING_STATS)
    {
        bool        hasallsortsupports = true;
        int         keyscount = IndexRelationGetNumberOfKeyAttributes(index);

        for (int i = 0; i < keyscount; i++)
        {
            SortSupportFnOids[i] = index_getprocid(index, i + 1,
                                                   GIST_SORTSUPPORT_PROC);
            if (!OidIsValid(SortSupportFnOids[i]))
            {
                hasallsortsupports = false;
                break;
            }
        }
        if (hasallsortsupports)
            buildstate.buildMode = GIST_SORTED_BUILD;
    }

    /*
     * Calculate target amount of free space to leave on pages.
     */
    fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
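
    /*
     * For example, with the default 8 kB BLCKSZ and the default GiST
     * fillfactor of 90, this leaves 8192 * (100 - 90) / 100 = 819 bytes of
     * each page free for future insertions.
     */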

    /*
     * Build the index using the chosen strategy.
     */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    if (buildstate.buildMode == GIST_SORTED_BUILD)
    {
        /*
         * Sort all data, build the index from bottom up.
         */
        buildstate.sortstate = tuplesort_begin_index_gist(heap,
                                                          index,
                                                          maintenance_work_mem,
                                                          NULL,
                                                          TUPLESORT_NONE);

        /* Scan the table, adding all tuples to the tuplesort */
        reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                           gistSortedBuildCallback,
                                           &buildstate, NULL);

        /*
         * Perform the sort and build index pages.
         */
        tuplesort_performsort(buildstate.sortstate);

        gist_indexsortbuild(&buildstate);

        tuplesort_end(buildstate.sortstate);
    }
    else
    {
        /*
         * Initialize an empty index and insert all tuples, possibly using
         * buffers on intermediate levels.
         */
        Buffer      buffer;
        Page        page;

        /* initialize the root page */
        buffer = gistNewBuffer(index, heap);
        Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
        page = BufferGetPage(buffer);

        START_CRIT_SECTION();

        GISTInitBuffer(buffer, F_LEAF);

        MarkBufferDirty(buffer);
        PageSetLSN(page, GistBuildLSN);

        UnlockReleaseBuffer(buffer);

        END_CRIT_SECTION();

        /* Scan the table, inserting all the tuples to the index. */
        reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                           gistBuildCallback,
                                           &buildstate, NULL);

        /*
         * If buffering was used, flush out all the tuples that are still in
         * the buffers.
         */
        if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
        {
            elog(DEBUG1, "all tuples processed, emptying buffers");
            gistEmptyAllBuffers(&buildstate);
            gistFreeBuildBuffers(buildstate.gfbb);
        }

        /*
         * We didn't write WAL records as we built the index, so if
         * WAL-logging is required, write all pages to the WAL now.
         */
        if (RelationNeedsWAL(index))
        {
            log_newpage_range(index, MAIN_FORKNUM,
                              0, RelationGetNumberOfBlocks(index),
                              true);
        }
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    return result;
}

/*-------------------------------------------------------------------------
 * Routines for sorted build
 *-------------------------------------------------------------------------
 */

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistSortedBuildCallback(Relation index,
                        ItemPointer tid,
                        Datum *values,
                        bool *isnull,
                        bool tupleIsAlive,
                        void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    MemoryContext oldCtx;
    Datum       compressed_values[INDEX_MAX_KEYS];

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* Form an index tuple and point it at the heap tuple */
    gistCompressValues(buildstate->giststate, index,
                       values, isnull,
                       true, compressed_values);

    tuplesort_putindextuplevalues(buildstate->sortstate,
                                  buildstate->indexrel,
                                  tid,
                                  compressed_values, isnull);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    /* Update tuple count. */
    buildstate->indtuples += 1;
}

/*
 * Build GiST index from bottom up from pre-sorted tuples.
 */
static void
gist_indexsortbuild(GISTBuildState *state)
{
    IndexTuple  itup;
    GistSortedBuildLevelState *levelstate;
    BulkWriteBuffer rootbuf;

    /* Reserve block 0 for the root page */
    state->pages_allocated = 1;

    state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);

    /* Allocate a temporary buffer for the first leaf page batch. */
    levelstate = palloc0(sizeof(GistSortedBuildLevelState));
    levelstate->pages[0] = palloc(BLCKSZ);
    levelstate->parent = NULL;
    gistinitpage(levelstate->pages[0], F_LEAF);

    /*
     * Fill index pages with tuples in the sorted order.
     */
    while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
    {
        gist_indexsortbuild_levelstate_add(state, levelstate, itup);
        MemoryContextReset(state->giststate->tempCxt);
    }

    /*
     * Write out the partially full non-root pages.
     *
     * Keep in mind that the flush can build a new root: if the number of
     * pages is > 1, then a new root is required.
     */
    while (levelstate->parent != NULL || levelstate->current_page != 0)
    {
        GistSortedBuildLevelState *parent;

        gist_indexsortbuild_levelstate_flush(state, levelstate);
        parent = levelstate->parent;
        for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
            if (levelstate->pages[i])
                pfree(levelstate->pages[i]);
        pfree(levelstate);
        levelstate = parent;
    }

    /* Write out the root */
    PageSetLSN(levelstate->pages[0], GistBuildLSN);
    rootbuf = smgr_bulk_get_buf(state->bulkstate);
    memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
    smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);

    pfree(levelstate);

    smgr_bulk_finish(state->bulkstate);
}

/*
 * Add a tuple to the current page. If all the buffered pages are full, write
 * them out and re-initialize a new page first.
 */
static void
gist_indexsortbuild_levelstate_add(GISTBuildState *state,
                                   GistSortedBuildLevelState *levelstate,
                                   IndexTuple itup)
{
    Size        sizeNeeded;

    /* Check if tuple can be added to the current page */
    sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
    if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
    {
        Page        newPage;
        Page        old_page = levelstate->pages[levelstate->current_page];
        uint16      old_page_flags = GistPageGetOpaque(old_page)->flags;

        if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
        {
            gist_indexsortbuild_levelstate_flush(state, levelstate);
        }
        else
            levelstate->current_page++;

        if (levelstate->pages[levelstate->current_page] == NULL)
            levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);

        newPage = levelstate->pages[levelstate->current_page];
        gistinitpage(newPage, old_page_flags);
    }

    gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
}

static void
gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
                                     GistSortedBuildLevelState *levelstate)
{
    GistSortedBuildLevelState *parent;
    BlockNumber blkno;
    MemoryContext oldCtx;
    IndexTuple  union_tuple;
    SplitPageLayout *dist;
    IndexTuple *itvec;
    int         vect_len;
    bool        isleaf = GistPageIsLeaf(levelstate->pages[0]);

    CHECK_FOR_INTERRUPTS();

    oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);

    /* Get index tuples from first page */
    itvec = gistextractpage(levelstate->pages[0], &vect_len);
    if (levelstate->current_page > 0)
    {
        /* Append tuples from each page */
        for (int i = 1; i < levelstate->current_page + 1; i++)
        {
            int         len_local;
            IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);

            itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
            pfree(itvec_local);
        }

        /* Apply picksplit to list of all collected tuples */
        dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
    }
    else
    {
        /* Create split layout from single page */
        dist = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout));
        union_tuple = gistunion(state->indexrel, itvec, vect_len,
                                state->giststate);
        dist->itup = union_tuple;
        dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
        dist->block.num = vect_len;
    }

    MemoryContextSwitchTo(oldCtx);

    /* Reset page counter */
    levelstate->current_page = 0;

    /* Create pages for all partitions in split result */
    for (; dist != NULL; dist = dist->next)
    {
        char       *data;
        BulkWriteBuffer buf;
        Page        target;

        /* check once per page */
        CHECK_FOR_INTERRUPTS();

        /* Create page and copy data */
        data = (char *) (dist->list);
        buf = smgr_bulk_get_buf(state->bulkstate);
        target = (Page) buf;
        gistinitpage(target, isleaf ? F_LEAF : 0);
        for (int i = 0; i < dist->block.num; i++)
        {
            IndexTuple  thistup = (IndexTuple) data;

            if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));

            data += IndexTupleSize(thistup);
        }
        union_tuple = dist->itup;

        /*
         * Set the right link to point to the previous page. This is just for
         * debugging purposes: GiST only follows the right link if a page is
         * split concurrently to a scan, and that cannot happen during index
         * build.
         *
         * It's a bit counterintuitive that we set the right link on the new
         * page to point to the previous page, not the other way around. But
         * GiST pages are not ordered like B-tree pages are, so as long as the
         * right-links form a chain through all the pages at the same level,
         * the order doesn't matter.
         */
        if (levelstate->last_blkno)
            GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;

        /*
         * The page is now complete. Assign a block number to it, and pass it
         * to the bulk writer.
         */
        blkno = state->pages_allocated++;
        PageSetLSN(target, GistBuildLSN);
        smgr_bulk_write(state->bulkstate, blkno, buf, true);
        ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
        levelstate->last_blkno = blkno;

        /*
         * Insert the downlink to the parent page. If this was the root,
         * create a new page as the parent, which becomes the new root.
         */
        parent = levelstate->parent;
        if (parent == NULL)
        {
            parent = palloc0(sizeof(GistSortedBuildLevelState));
            parent->pages[0] = palloc(BLCKSZ);
            parent->parent = NULL;
            gistinitpage(parent->pages[0], 0);

            levelstate->parent = parent;
        }
        gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
    }
}


/*-------------------------------------------------------------------------
 * Routines for non-sorted build
 *-------------------------------------------------------------------------
 */

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
    Relation    index = buildstate->indexrel;
    int         pagesPerBuffer;
    Size        pageFreeSpace;
    Size        itupAvgSize,
                itupMinSize;
    double      avgIndexTuplesPerPage,
                maxIndexTuplesPerPage;
    int         i;
    int         levelStep;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate average size of already inserted index tuples using gathered
     * statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    /*
     * Calculate the minimal possible size of an index tuple from the index
     * metadata. The minimal possible size of a varlena is VARHDRSZ.
     *
     * XXX: that's not actually true, as a short varlena can be just 2 bytes.
     * And we should take padding into account here.
     */
    itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
    for (i = 0; i < index->rd_att->natts; i++)
    {
        if (TupleDescAttr(index->rd_att, i)->attlen < 0)
            itupMinSize += VARHDRSZ;
        else
            itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
    }

    /* Calculate average and maximal number of index tuples which fit to page */
    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
    maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

    /*
     * We need to calculate two parameters for the buffering algorithm:
     * levelStep and pagesPerBuffer.
     *
     * levelStep determines the size of subtree that we operate on, while
     * emptying a buffer. A higher value is better, as you need fewer buffer
     * emptying steps to build the index. However, if you set it too high, the
     * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     * of the buffers.
     *
     * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     * the number of tuples on a page (i.e. fanout), and M is the amount of
     * internal memory available. Curiously, they don't explain *why* that
     * setting is optimal. We calculate it by taking the highest levelStep so
     * that a subtree still fits in cache. For a small B, our way of
     * calculating levelStep is very close to Arge et al's formula. For a
     * large B, our formula gives a value that is 2x higher.
     *
     * The average size (in pages) of a subtree of depth n can be calculated
     * as a geometric series:
     *
     * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     *
     * where B is the average number of index tuples on page. The subtree is
     * cached in the shared buffer cache and the OS cache, so we choose
     * levelStep so that the subtree size is comfortably smaller than
     * effective_cache_size, with a safety factor of 4.
     *
     * The estimate on the average number of index tuples on page is based on
     * average tuple sizes observed before switching to buffered build, so the
     * real subtree size can be somewhat larger. Also, it would be selfish to
     * gobble the whole cache for our index build. The safety factor of 4
     * should account for those effects.
     *
     * The other limiting factor for setting levelStep is that while
     * processing a subtree, we need to hold one page for each buffer at the
     * next lower buffered level. The max. number of buffers needed for that
     * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     * hopefully maintenance_work_mem is set high enough that you're
     * constrained by effective_cache_size rather than maintenance_work_mem.
     *
     * XXX: the buffer hash table consumes a fair amount of memory too per
     * buffer, but that is not currently taken into account. That scales on
     * the total number of buffers used, i.e. the index size and on levelStep.
     * Note that a higher levelStep *reduces* the amount of memory needed for
     * the hash table.
     */
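    /*
     * Worked example (with made-up round numbers): if B =
     * avgIndexTuplesPerPage = 100, a subtree of depth levelStep = 2 has
     * (1 - 100^3) / (1 - 100) = 10101 pages, i.e. about 79 MB with 8 kB
     * pages. That levelStep is acceptable only if effective_cache_size / 4
     * exceeds 10101 pages and the up-to 100^2 = 10000 lowest-level pages of
     * the subtree fit in maintenance_work_mem.
     */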
    levelStep = 1;
    for (;;)
    {
        double      subtreesize;
        double      maxlowestlevelpages;

        /* size of an average subtree at this levelStep (in pages). */
        subtreesize =
            (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgIndexTuplesPerPage);

        /* max number of pages at the lowest level of a subtree */
        maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

        /* subtree must fit in cache (with safety factor of 4) */
        if (subtreesize > effective_cache_size / 4)
            break;

        /* each node in the lowest level of a subtree has one page in memory */
        if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
            break;

        /* Good, we can handle this levelStep. See if we can go one higher. */
        levelStep++;
    }

    /*
     * The loop above ended when it reached an unacceptable value of
     * levelStep, so step back to the last acceptable value.
     */
    levelStep--;

    /*
     * If there's not enough cache or maintenance_work_mem, fall back to plain
     * inserts.
     */
    if (levelStep <= 0)
    {
        elog(DEBUG1, "failed to switch to buffered GiST build");
        buildstate->buildMode = GIST_BUFFERING_DISABLED;
        return;
    }

    /*
     * The second parameter to set is pagesPerBuffer, which determines the
     * size of each buffer. We adjust pagesPerBuffer also during the build,
     * which is why this calculation is in a separate function.
     */
    pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

    /* Initialize GISTBuildBuffers with these parameters */
    buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                            gistGetMaxLevel(index));

    gistInitParentMap(buildstate);

    buildstate->buildMode = GIST_BUFFERING_ACTIVE;

    elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
         levelStep, pagesPerBuffer);
}

/*
 * Calculate the pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that, assuming tuples are distributed randomly,
 * emptying half a buffer fills on average one page in every buffer at the
 * next lower level.
 */
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
    double      pagesPerBuffer;
    double      avgIndexTuplesPerPage;
    double      itupAvgSize;
    Size        pageFreeSpace;

    /* Calculate the space on an index page that is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate average size of already inserted index tuples using gathered
     * statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

    /*
     * Recalculate required size of buffers.
     */
    pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
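
    /*
     * To illustrate with the same made-up numbers as above: with
     * avgIndexTuplesPerPage = 100 and levelStep = 2, pagesPerBuffer =
     * 2 * 100^2 = 20000 pages. Emptying half of such a buffer runs 10000
     * pages' worth of tuples down to the roughly 100^2 = 10000 buffers or
     * pages two levels below, i.e. about one page's worth for each of them.
     */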

    return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
                  ItemPointer tid,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    IndexTuple  itup;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* form an index tuple and point it at the heap tuple */
    itup = gistFormTuple(buildstate->giststate, index,
                         values, isnull,
                         true);
    itup->t_tid = *tid;

    /* Update tuple count and total size. */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    /*
     * XXX In buffering builds, the tempCxt is also reset down inside
     * gistProcessEmptyingQueue(). This is not great because it risks
     * confusion and possible use of dangling pointers (for example, itup
     * might be already freed when control returns here). It's generally
     * better that a memory context be "owned" by only one function. However,
     * currently this isn't causing issues so it doesn't seem worth the amount
     * of refactoring that would be needed to avoid it.
     */
    if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
    {
        /* We have buffers, so use them. */
        gistBufferingBuildInsert(buildstate, itup);
    }
    else
    {
        /*
         * There are no buffers (yet). Since we already have the index
         * relation locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
                     buildstate->giststate, buildstate->heaprel, true);
    }

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
        buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
    {
        /* Adjust the target buffer size now */
        buildstate->gfbb->pagesPerBuffer =
            calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
    }

    /*
     * In 'auto' mode, check if the index has grown too large to fit in cache,
     * and switch to buffering mode if it has.
     *
     * To avoid excessive calls to smgrnblocks(), only check this every
     * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     *
     * In 'stats' state, switch as soon as we have seen enough tuples to have
     * some idea of the average tuple size.
     */
    if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
         buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
         effective_cache_size < smgrnblocks(RelationGetSmgr(index),
                                            MAIN_FORKNUM)) ||
        (buildstate->buildMode == GIST_BUFFERING_STATS &&
         buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
    {
        /*
         * Index doesn't fit in effective cache anymore. Try to switch to
         * buffering build mode.
         */
        gistInitBuffering(buildstate);
    }
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
    /* Insert the tuple to buffers. */
    gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

    /* If we filled up (half of a) buffer, process buffer emptying. */
    gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't take any more index tuples).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                BlockNumber startblkno, int startlevel)
{
    GISTSTATE  *giststate = buildstate->giststate;
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    Relation    indexrel = buildstate->indexrel;
    BlockNumber childblkno;
    Buffer      buffer;
    bool        result = false;
    BlockNumber blkno;
    int         level;
    OffsetNumber downlinkoffnum = InvalidOffsetNumber;
    BlockNumber parentblkno = InvalidBlockNumber;

    CHECK_FOR_INTERRUPTS();

    /*
     * Loop until we reach a leaf page (level == 0) or a level with buffers
     * (not including the level we start at, because we would otherwise make
     * no progress).
     */
    blkno = startblkno;
    level = startlevel;
    for (;;)
    {
        ItemId      iid;
        IndexTuple  idxtuple,
                    newtup;
        Page        page;
        OffsetNumber childoffnum;

        /* Have we reached a level with buffers? */
        if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
            break;

        /* Have we reached a leaf page? */
        if (level == 0)
            break;

        /*
         * Nope. Descend down to the next level then. Choose a child to
         * descend down to.
         */

        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);

        page = (Page) BufferGetPage(buffer);
        childoffnum = gistchoose(indexrel, page, itup, giststate);
        iid = PageGetItemId(page, childoffnum);
        idxtuple = (IndexTuple) PageGetItem(page, iid);
        childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        if (level > 1)
            gistMemorizeParent(buildstate, childblkno, blkno);

        /*
         * Check that the key representing the target child node is consistent
         * with the key we're inserting. Update it if it's not.
         */
        newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
        if (newtup)
        {
            blkno = gistbufferinginserttuples(buildstate,
                                              buffer,
                                              level,
                                              &newtup,
                                              1,
                                              childoffnum,
                                              InvalidBlockNumber,
                                              InvalidOffsetNumber);
            /* gistbufferinginserttuples() released the buffer */
        }
        else
            UnlockReleaseBuffer(buffer);

        /* Descend to the child */
        parentblkno = blkno;
        blkno = childblkno;
        downlinkoffnum = childoffnum;
        Assert(level > 0);
        level--;
    }

    if (LEVEL_HAS_BUFFERS(level, gfbb))
    {
        /*
         * We've reached a level with buffers. Place the index tuple in the
         * buffer, and add the buffer to the emptying queue if it overflows.
         */
        GISTNodeBuffer *childNodeBuffer;

        /* Find the buffer or create a new one */
        childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

        /* Add index tuple to it */
        gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

        if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
            result = true;
    }
    else
    {
        /*
         * We've reached a leaf page. Place the tuple here.
         */
        Assert(level == 0);
        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);
        gistbufferinginserttuples(buildstate, buffer, level,
                                  &itup, 1, InvalidOffsetNumber,
                                  parentblkno, downlinkoffnum);
        /* gistbufferinginserttuples() released the buffer */
    }

    return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous to gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated tuple
 * was inserted. Usually that's the original page, but it might be a sibling
 * page if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
                          IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                          BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    List       *splitinfo;
    bool        is_split;
    BlockNumber placed_to_blk = InvalidBlockNumber;

    is_split = gistplacetopage(buildstate->indexrel,
                               buildstate->freespace,
                               buildstate->giststate,
                               buffer,
                               itup, ntup, oldoffnum, &placed_to_blk,
                               InvalidBuffer,
                               &splitinfo,
                               false,
                               buildstate->heaprel, true);

    /*
     * If this is a root split, update the root path item kept in memory. This
     * ensures that all path stacks are always complete, including all parent
     * nodes up to the root. That simplifies the algorithm to re-find the
     * correct parent.
     */
    if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    {
        Page        page = BufferGetPage(buffer);
        OffsetNumber off;
        OffsetNumber maxoff;

        Assert(level == gfbb->rootlevel);
        gfbb->rootlevel++;

        elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

        /*
         * All the downlinks on the old root page are now on one of the child
         * pages. Visit all the new child pages to memorize the parents of the
         * grandchildren.
         */
        if (gfbb->rootlevel > 1)
        {
            maxoff = PageGetMaxOffsetNumber(page);
            for (off = FirstOffsetNumber; off <= maxoff; off++)
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
                BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
                Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);

                LockBuffer(childbuf, GIST_SHARE);
                gistMemorizeAllDownlinks(buildstate, childbuf);
                UnlockReleaseBuffer(childbuf);

                /*
                 * Also remember that the parent of the new child page is the
                 * root block.
                 */
                gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
            }
        }
    }

    if (splitinfo)
    {
        /*
         * Insert the downlinks to the parent. This is analogous to
         * gistfinishsplit() in the regular insertion code, but the locking is
         * simpler, and we have to maintain the buffers on internal nodes and
         * the parent map.
         */
        IndexTuple *downlinks;
        int         ndownlinks,
                    i;
        Buffer      parentBuffer;
        ListCell   *lc;

        /* Parent may have changed since we memorized this path. */
        parentBuffer =
            gistBufferingFindCorrectParent(buildstate,
                                           BufferGetBlockNumber(buffer),
                                           level,
                                           &parentblk,
                                           &downlinkoffnum);

        /*
         * If there's a buffer associated with this page, that needs to be
         * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
         * downlinks in 'splitinfo', to make sure they're consistent not only
         * with the tuples already on the pages, but also the tuples in the
         * buffers that will eventually be inserted to them.
         */
        gistRelocateBuildBuffersOnSplit(gfbb,
                                        buildstate->giststate,
                                        buildstate->indexrel,
                                        level,
                                        buffer, splitinfo);

        /* Create an array of all the downlink tuples */
        ndownlinks = list_length(splitinfo);
        downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
        i = 0;
        foreach(lc, splitinfo)
        {
            GISTPageSplitInfo *splitinfo = lfirst(lc);

            /*
             * Remember the parent of each new child page in our parent map.
             * This assumes that the downlinks fit on the parent page. If the
             * parent page is split, too, when we recurse up to insert the
             * downlinks, the recursive gistbufferinginserttuples() call will
             * update the map again.
             */
            if (level > 0)
                gistMemorizeParent(buildstate,
                                   BufferGetBlockNumber(splitinfo->buf),
                                   BufferGetBlockNumber(parentBuffer));

            /*
             * Also update the parent map for all the downlinks that got moved
             * to a different page. (actually this also loops through the
             * downlinks that stayed on the original page, but it does no
             * harm).
             */
            if (level > 1)
                gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

            /*
             * Since there's no concurrent access, we can release the lower
             * level buffers immediately. This includes the original page.
             */
            UnlockReleaseBuffer(splitinfo->buf);
            downlinks[i++] = splitinfo->downlink;
        }

        /* Insert them into parent. */
        gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
                                  downlinks, ndownlinks, downlinkoffnum,
                                  InvalidBlockNumber, InvalidOffsetNumber);

        list_free_deep(splitinfo);  /* we don't need this anymore */
    }
    else
        UnlockReleaseBuffer(buffer);

    return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                               BlockNumber childblkno, int level,
                               BlockNumber *parentblkno,
                               OffsetNumber *downlinkoffnum)
{
    BlockNumber parent;
    Buffer      buffer;
    Page        page;
    OffsetNumber maxoff;
    OffsetNumber off;

    if (level > 0)
        parent = gistGetParent(buildstate, childblkno);
    else
    {
        /*
         * For a leaf page, the caller must supply a correct parent block
         * number.
         */
        if (*parentblkno == InvalidBlockNumber)
            elog(ERROR, "no parent buffer provided of child %u", childblkno);
        parent = *parentblkno;
    }

    buffer = ReadBuffer(buildstate->indexrel, parent);
    page = BufferGetPage(buffer);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistcheckpage(buildstate->indexrel, buffer);
    maxoff = PageGetMaxOffsetNumber(page);

    /* Check if it was not moved */
    if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
        *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    {
        ItemId      iid = PageGetItemId(page, *downlinkoffnum);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* Still there */
            return buffer;
        }
    }

    /*
     * The downlink was not at the offset where it used to be. Scan the page
     * to find it. During normal gist insertions, it might've moved to another
     * page, to the right, but during a buffering build, we keep track of the
     * parent of each page in the lookup table so we should always know what
     * page it's on.
     */
    for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

        if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
        {
            /* yes!!, found it */
            *downlinkoffnum = off;
            return buffer;
        }
    }

    elog(ERROR, "failed to re-find parent for block %u", childblkno);
    return InvalidBuffer;       /* keep compiler quiet */
}

/*
 * Process the buffer emptying stack. Emptying of one buffer can cause
 * emptying of other buffers. This function iterates until this cascading
 * emptying process has finished, i.e. until the buffer emptying stack is
 * empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;

    /* Iterate while we have elements in the buffer emptying stack. */
    while (gfbb->bufferEmptyingQueue != NIL)
    {
        GISTNodeBuffer *emptyingNodeBuffer;

        /* Get a node buffer from the emptying stack. */
        emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
        gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
        emptyingNodeBuffer->queuedForEmptying = false;

        /*
         * We are going to load the last pages of the buffers that the
         * emptying will write into, so unload any previously loaded buffers
         * first.
         */
        gistUnloadNodeBuffers(gfbb);

        /*
         * Pop tuples from the buffer and run them down to the buffers at the
         * lower level, or leaf pages. We continue until one of the lower
         * level buffers fills up, or this buffer runs empty.
         *
         * In Arge et al's paper, the buffer emptying is stopped after
         * processing 1/2 node buffer worth of tuples, to avoid overfilling
         * any of the lower level buffers. However, it's more efficient to
         * keep going until one of the lower level buffers actually fills up,
         * so that's what we do. This doesn't need to be exact, if a buffer
         * overfills by a few tuples, there's no harm done.
         */
        while (true)
        {
            IndexTuple  itup;

            /* Get next index tuple from the buffer */
            if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
                break;

            /*
             * Run it down to the underlying node buffer or leaf page.
             *
             * Note: it's possible that the buffer we're emptying splits as a
             * result of this call. If that happens, our emptyingNodeBuffer
             * points to the left half of the split. After the split, it's
             * very likely that the new left buffer is no longer over the
             * half-full threshold, but we might as well keep flushing tuples
             * from it until we fill a lower-level buffer.
             */
            if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
            {
                /*
                 * A lower level buffer filled up. Stop emptying this buffer,
                 * to avoid overflowing the lower level buffer.
                 */
                break;
            }

            /* Free all the memory allocated during index tuple processing */
            MemoryContextReset(buildstate->giststate->tempCxt);
        }
    }
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: this destroys the buffersOnLevels lists, so no more tuples should be
 * inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    MemoryContext oldCtx;
    int         i;

    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /*
     * Iterate through the levels from top to bottom.
     */
    for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    {
        /*
         * Empty all buffers on this level. Note that new buffers can pop up
         * in the list during the processing, as a result of page splits, so a
         * simple walk through the list won't work. We remove buffers from the
         * list when we see them empty; a buffer can't become non-empty once
         * it's been fully emptied.
         */
        while (gfbb->buffersOnLevels[i] != NIL)
        {
            GISTNodeBuffer *nodeBuffer;

            nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

            if (nodeBuffer->blocksCount != 0)
            {
                /*
                 * Add this buffer to the emptying queue, and proceed to empty
                 * the queue.
                 */
                if (!nodeBuffer->queuedForEmptying)
                {
                    MemoryContextSwitchTo(gfbb->context);
                    nodeBuffer->queuedForEmptying = true;
                    gfbb->bufferEmptyingQueue =
                        lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
                    MemoryContextSwitchTo(buildstate->giststate->tempCxt);
                }
                gistProcessEmptyingQueue(buildstate);
            }
            else
                gfbb->buffersOnLevels[i] =
                    list_delete_first(gfbb->buffersOnLevels[i]);
        }
        elog(DEBUG2, "emptied all buffers at level %d", i);
    }
    MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
 */
static int
gistGetMaxLevel(Relation index)
{
    int         maxLevel;
    BlockNumber blkno;

    /*
     * Traverse down the tree, starting from the root, until we hit the leaf
     * level.
     */
    maxLevel = 0;
    blkno = GIST_ROOT_BLKNO;
    while (true)
    {
        Buffer      buffer;
        Page        page;
        IndexTuple  itup;

        buffer = ReadBuffer(index, blkno);

        /*
         * There's no concurrent access during index build, so locking is just
         * pro forma.
         */
        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (GistPageIsLeaf(page))
        {
            /* We hit the bottom, so we're done. */
            UnlockReleaseBuffer(buffer);
            break;
        }

        /*
         * Pick the first downlink on the page, and follow it. It doesn't
         * matter which downlink we choose, the tree has the same depth
         * everywhere, so we just pick the first one.
         */
        itup = (IndexTuple) PageGetItem(page,
                                        PageGetItemId(page, FirstOffsetNumber));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        UnlockReleaseBuffer(buffer);

        /*
         * We're going down the tree, which means there is yet one more level
         * below this one.
         */
        maxLevel++;
    }
    return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to the new page need to be
 * updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
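
/*
 * For example, when gistProcessItup() descends from internal page P to
 * internal child C, it records (C -> P) via gistMemorizeParent(); if a later
 * split moves C's downlink to a new page P2, gistbufferinginserttuples()
 * overwrites the entry with (C -> P2), and gistBufferingFindCorrectParent()
 * looks the entry up again before inserting downlinks.
 */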

typedef struct
{
    BlockNumber childblkno;     /* hash key */
    BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
    HASHCTL     hashCtl;

    hashCtl.keysize = sizeof(BlockNumber);
    hashCtl.entrysize = sizeof(ParentMapEntry);
    hashCtl.hcxt = CurrentMemoryContext;
    buildstate->parentMap = hash_create("gistbuild parent map",
                                        1024,
                                        &hashCtl,
                                        HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
    ParentMapEntry *entry;
    bool        found;

    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           &child,
                                           HASH_ENTER,
                                           &found);
    entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
    OffsetNumber maxoff;
    OffsetNumber off;
    BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    Page        page = BufferGetPage(parentbuf);

    Assert(!GistPageIsLeaf(page));

    maxoff = PageGetMaxOffsetNumber(page);
    for (off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId      iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
        BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        gistMemorizeParent(buildstate, childblkno, parentblkno);
    }
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
    ParentMapEntry *entry;
    bool        found;

    /* Find node buffer in hash table */
    entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                                           &child,
                                           HASH_FIND,
                                           &found);
    if (!found)
        elog(ERROR, "could not find parent of block %u in lookup table", child);

    return entry->parentblkno;
}
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static Datum values[MAXATTR]
Definition: bootstrap.c:151
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4941
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2532
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5158
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:746
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:896
Pointer Page
Definition: bufpage.h:81
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:354
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:372
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
#define MAXALIGN(LEN)
Definition: c.h:765
#define VARHDRSZ
Definition: c.h:646
#define Assert(condition)
Definition: c.h:812
int64_t int64
Definition: c.h:482
uint16_t uint16
Definition: c.h:484
#define OidIsValid(objectId)
Definition: c.h:729
size_t Size
Definition: c.h:559
int effective_cache_size
Definition: costsize.c:139
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate, Relation heapRel, bool is_build)
Definition: gist.c:634
bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright, Relation heapRel, bool is_build)
Definition: gist.c:225
GISTSTATE * initGISTstate(Relation index)
Definition: gist.c:1525
MemoryContext createTempGistContext(void)
Definition: gist.c:123
void freeGISTstate(GISTSTATE *giststate)
Definition: gist.c:1653
SplitPageLayout * gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate)
Definition: gist.c:1438
#define F_LEAF
Definition: gist.h:48
#define GIST_SORTSUPPORT_PROC
Definition: gist.h:41
struct GISTPageOpaqueData GISTPageOpaqueData
#define GistPageIsLeaf(page)
Definition: gist.h:169
#define GistPageGetOpaque(page)
Definition: gist.h:167
#define GistBuildLSN
Definition: gist.h:69
#define LEVEL_HAS_BUFFERS(nlevel, gfbb)
Definition: gist_private.h:319
@ GIST_OPTION_BUFFERING_OFF
Definition: gist_private.h:388
@ GIST_OPTION_BUFFERING_ON
Definition: gist_private.h:387
#define GIST_ROOT_BLKNO
Definition: gist_private.h:262
#define GIST_DEFAULT_FILLFACTOR
Definition: gist_private.h:480
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb)
Definition: gist_private.h:332
#define GIST_EXCLUSIVE
Definition: gist_private.h:43
#define GIST_SHARE
Definition: gist_private.h:42
GistBuildMode
Definition: gistbuild.c:68
@ GIST_BUFFERING_AUTO
Definition: gistbuild.c:72
@ GIST_SORTED_BUILD
Definition: gistbuild.c:69
@ GIST_BUFFERING_ACTIVE
Definition: gistbuild.c:78
@ GIST_BUFFERING_STATS
Definition: gistbuild.c:75
@ GIST_BUFFERING_DISABLED
Definition: gistbuild.c:70
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber parentblk, OffsetNumber downlinkoffnum)
Definition: gistbuild.c:1054
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
Definition: gistbuild.c:1544
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, BlockNumber startblkno, int startlevel)
Definition: gistbuild.c:923
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate)
Definition: gistbuild.c:493
#define BUFFERING_MODE_SWITCH_CHECK_STEP
Definition: gistbuild.c:52
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, BlockNumber childblkno, int level, BlockNumber *parentblkno, OffsetNumber *downlinkoffnum)
Definition: gistbuild.c:1223
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
Definition: gistbuild.c:787
static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, GistSortedBuildLevelState *levelstate, IndexTuple itup)
Definition: gistbuild.c:461
static void gist_indexsortbuild(GISTBuildState *state)
Definition: gistbuild.c:400
static void gistInitParentMap(GISTBuildState *buildstate)
Definition: gistbuild.c:1514
static void gistBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:820
static void gistSortedBuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
Definition: gistbuild.c:366
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
Definition: gistbuild.c:1528
#define GIST_SORTED_BUILD_PAGE_NUM
Definition: gistbuild.c:113
static void gistEmptyAllBuffers(GISTBuildState *buildstate)
Definition: gistbuild.c:1370
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET
Definition: gistbuild.c:60
static void gistInitBuffering(GISTBuildState *buildstate)
Definition: gistbuild.c:626
static void gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
Definition: gistbuild.c:907
IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: gistbuild.c:179
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child)
Definition: gistbuild.c:1565
static int gistGetMaxLevel(Relation index)
Definition: gistbuild.c:1425
struct GistSortedBuildLevelState GistSortedBuildLevelState
static void gistProcessEmptyingQueue(GISTBuildState *buildstate)
Definition: gistbuild.c:1297
void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple itup)
void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, Relation r, int level, Buffer buffer, List *splitinfo)
GISTBuildBuffers * gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
void gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
GISTNodeBuffer * gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, BlockNumber nodeBlocknum, int level)
bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple *itup)
void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
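
A hedged sketch of how these node-buffer routines combine when one buffer is drained, paraphrasing the inner loop of gistProcessEmptyingQueue(): tuples are popped one at a time and re-routed into the subtree below, which may in turn fill lower-level buffers. drain_node_buffer is an illustrative name.

#include "postgres.h"
#include "access/gist_private.h"

static void
drain_node_buffer(GISTBuildState *buildstate, GISTNodeBuffer *nodeBuffer)
{
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    IndexTuple  itup;

    while (gistPopItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
    {
        /* Re-insert the tuple into the subtree under this buffer's page. */
        if (!gistProcessItup(buildstate, itup,
                             nodeBuffer->nodeBlocknum, nodeBuffer->level))
            break;              /* stop if a buffer filled up (paraphrased) */

        /* Reclaim per-tuple allocations between iterations. */
        MemoryContextReset(buildstate->giststate->tempCxt);
    }
}
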
Buffer gistNewBuffer(Relation r, Relation heaprel)
Definition: gistutil.c:824
void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
Definition: gistutil.c:34
IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, const Datum *attdata, const bool *isnull, bool isleaf)
Definition: gistutil.c:575
IndexTuple * gistextractpage(Page page, int *len)
Definition: gistutil.c:95
IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
Definition: gistutil.c:316
OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate)
Definition: gistutil.c:374
void gistCompressValues(GISTSTATE *giststate, Relation r, const Datum *attdata, const bool *isnull, bool isleaf, Datum *compatt)
Definition: gistutil.c:596
IndexTuple * gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
Definition: gistutil.c:114
void gistinitpage(Page page, uint32 f)
Definition: gistutil.c:757
IndexTuple gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
Definition: gistutil.c:219
void GISTInitBuffer(Buffer b, uint32 f)
Definition: gistutil.c:773
IndexTupleData * gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
Definition: gistutil.c:127
void gistcheckpage(Relation rel, Buffer buf)
Definition: gistutil.c:785
int maintenance_work_mem
Definition: globals.c:132
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
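
These dynahash flags and actions are what the buffering build uses for its child-to-parent lookup table; a sketch paraphrasing gistInitParentMap()/gistMemorizeParent()/gistGetParent() (the ParentMapEntry struct and helper names here mirror, but are not copied from, the file-local code):

#include "postgres.h"
#include "storage/block.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"

typedef struct
{
    BlockNumber childblkno;     /* hash key */
    BlockNumber parentblkno;
} ParentMapEntry;

static HTAB *
create_parent_map(void)
{
    HASHCTL hashCtl;

    hashCtl.keysize = sizeof(BlockNumber);
    hashCtl.entrysize = sizeof(ParentMapEntry);
    hashCtl.hcxt = CurrentMemoryContext;
    return hash_create("gistbuild parent map",
                       1024,    /* initial size estimate */
                       &hashCtl,
                       HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
memorize_parent(HTAB *parentMap, BlockNumber child, BlockNumber parent)
{
    bool            found;
    ParentMapEntry *entry;

    entry = (ParentMapEntry *) hash_search(parentMap, &child,
                                           HASH_ENTER, &found);
    entry->parentblkno = parent;
}

static BlockNumber
lookup_parent(HTAB *parentMap, BlockNumber child)
{
    bool            found;
    ParentMapEntry *entry;

    entry = (ParentMapEntry *) hash_search(parentMap, &child,
                                           HASH_FIND, &found);
    if (!found)
        elog(ERROR, "could not find parent of block %u", child);
    return entry->parentblkno;
}
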
RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:828

Pointer Item
Definition: item.h:17
struct ItemIdData ItemIdData
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexTupleSize(itup)
Definition: itup.h:70
List * list_delete_first(List *list)
Definition: list.c:943
List * lcons(void *datum, List *list)
Definition: list.c:495
void list_free_deep(List *list)
Definition: list.c:1560
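
These List primitives implement the buffer-emptying queue: buffers are prepended with lcons() when they fill up and consumed from the front with linitial() plus list_delete_first(). A sketch under those assumptions (helper names are illustrative; cf. gistProcessEmptyingQueue() and the queueing code in gistbuildbuffers.c):

#include "postgres.h"
#include "access/gist_private.h"
#include "nodes/pg_list.h"

/* Queue a node buffer for emptying, unless it is already queued. */
static void
queue_for_emptying(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
{
    if (!nodeBuffer->queuedForEmptying)
    {
        /* The queue must live in the long-lived buffers context. */
        MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);

        nodeBuffer->queuedForEmptying = true;
        gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
                                          gfbb->bufferEmptyingQueue);
        MemoryContextSwitchTo(oldcxt);
    }
}

/* Pop the next buffer to empty, or NULL if the queue is empty. */
static GISTNodeBuffer *
next_buffer_to_empty(GISTBuildBuffers *gfbb)
{
    GISTNodeBuffer *nodeBuffer;

    if (gfbb->bufferEmptyingQueue == NIL)
        return NULL;
    nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
    gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
    nodeBuffer->queuedForEmptying = false;
    return nodeBuffer;
}
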
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
void * palloc(Size size)
Definition: mcxt.c:1317
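
These memory-context routines carry the per-tuple reset pattern of the build callbacks. Schematically, inside gistBuildCallback() (a paraphrase, not the verbatim code), all per-tuple work happens in the short-lived tempCxt so that allocations made by opclass support functions never accumulate:

{
    MemoryContext oldCtx =
        MemoryContextSwitchTo(buildstate->giststate->tempCxt);
    IndexTuple  itup;

    /* Form the index tuple in the temporary context. */
    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = *tid;

    /* ... gistBufferingBuildInsert() or a plain insert happens here ... */

    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);
}
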
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
#define FirstOffsetNumber
Definition: off.h:27
#define INDEX_MAX_KEYS
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define linitial(l)
Definition: pg_list.h:178
uintptr_t Datum
Definition: postgres.h:64
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContext MemoryContextSwitchTo(MemoryContext context)
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
@ MAIN_FORKNUM
Definition: relpath.h:58
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:677
List * bufferEmptyingQueue
Definition: gist_private.h:357
List ** buffersOnLevels
Definition: gist_private.h:368
MemoryContext context
Definition: gist_private.h:341
GISTBuildBuffers * gfbb
Definition: gistbuild.c:100
BlockNumber pages_allocated
Definition: gistbuild.c:108
Relation indexrel
Definition: gistbuild.c:84
GISTSTATE * giststate
Definition: gistbuild.c:86
HTAB * parentMap
Definition: gistbuild.c:101
int64 indtuples
Definition: gistbuild.c:92
BulkWriteState * bulkstate
Definition: gistbuild.c:110
Size freespace
Definition: gistbuild.c:88
Relation heaprel
Definition: gistbuild.c:85
int64 indtuplesSize
Definition: gistbuild.c:99
Tuplesortstate * sortstate
Definition: gistbuild.c:106
GistBuildMode buildMode
Definition: gistbuild.c:90
BlockNumber nodeBlocknum
Definition: gist_private.h:300
bool queuedForEmptying
Definition: gist_private.h:307
IndexTuple downlink
Definition: gist_private.h:422
MemoryContext tempCxt
Definition: gist_private.h:78
Page pages[GIST_SORTED_BUILD_PAGE_NUM]
Definition: gistbuild.c:128
struct GistSortedBuildLevelState * parent
Definition: gistbuild.c:127
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
MemoryContext hcxt
Definition: hsearch.h:86
HTAB
Definition: dynahash.c:220
double heap_tuples
Definition: genam.h:34
double index_tuples
Definition: genam.h:35
ItemPointerData t_tid
Definition: itup.h:37
List
Definition: pg_list.h:54
BlockNumber parentblkno
Definition: gistbuild.c:1510
BlockNumber childblkno
Definition: gistbuild.c:1509
gistxlogPage block
Definition: gist_private.h:193
struct SplitPageLayout * next
Definition: gist_private.h:200
IndexTuple itup
Definition: gist_private.h:196
IndexTupleData * list
Definition: gist_private.h:194
static double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1784
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1363
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:951
#define TUPLESORT_NONE
Definition: tuplesort.h:93
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, const Datum *values, const bool *isnull)
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
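
Together these tuplesort entry points form the sorted-build control flow. A minimal sketch paraphrasing the GIST_SORTED_BUILD path of gistbuild() (sorted_build_sketch is an illustrative name; the scan callback that feeds the sort is gistSortedBuildCallback(), and the page-packing step is gist_indexsortbuild()):

#include "postgres.h"
#include "access/itup.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

static void
sorted_build_sketch(Relation heap, Relation index)
{
    Tuplesortstate *sortstate;
    IndexTuple  itup;

    sortstate = tuplesort_begin_index_gist(heap, index,
                                           maintenance_work_mem,
                                           NULL,    /* no parallel coordination */
                                           TUPLESORT_NONE);

    /*
     * ... table_index_build_scan() runs here, with a callback that calls
     * tuplesort_putindextuplevalues() for every heap tuple ...
     */

    tuplesort_performsort(sortstate);

    while ((itup = tuplesort_getindextuple(sortstate, true)) != NULL)
    {
        /* pack itup into the current leaf page; cf. gist_indexsortbuild() */
        CHECK_FOR_INTERRUPTS();
    }

    tuplesort_end(sortstate);
}
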
void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std)
Definition: xloginsert.c:1270
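
Finally, a note on WAL: the insert-based build paths stamp pages with the fake GistBuildLSN and skip per-insert WAL records; if the relation needs WAL at all, the finished index is logged in one sweep at the end of gistbuild(), roughly:

/* Paraphrase of the final step of gistbuild() for the non-sorted paths. */
if (RelationNeedsWAL(index))
    log_newpage_range(index, MAIN_FORKNUM,
                      0, RelationGetNumberOfBlocks(index),
                      true);    /* pages have standard layout */
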