nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/relscan.h"
23 #include "access/xlog.h"
24 #include "catalog/index.h"
25 #include "commands/vacuum.h"
26 #include "pgstat.h"
27 #include "storage/condition_variable.h"
28 #include "storage/indexfsm.h"
29 #include "storage/ipc.h"
30 #include "storage/lmgr.h"
31 #include "storage/smgr.h"
32 #include "tcop/tcopprot.h" /* pgrminclude ignore */
33 #include "utils/builtins.h"
34 #include "utils/index_selfuncs.h"
35 #include "utils/memutils.h"
36 
37 
38 /* Working state for btbuild and its callback */
39 typedef struct
40 {
41  bool isUnique;
42  bool haveDead;
43  Relation heapRel;
44  BTSpool *spool;
45 
46  /*
47  * spool2 is needed only when the index is a unique index. Dead tuples are
48  * put into spool2 instead of spool in order to avoid uniqueness check.
49  */
50  BTSpool *spool2;
51  double indtuples;
52 } BTBuildState;
53 
54 /* Working state needed by btvacuumpage */
55 typedef struct
56 {
57  IndexVacuumInfo *info;
58  IndexBulkDeleteResult *stats;
59  IndexBulkDeleteCallback callback;
60  void *callback_state;
61  BTCycleId cycleid;
62  BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
63  BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
64  BlockNumber totFreePages; /* true total # of free pages */
65  MemoryContext pagedelcontext;
66 } BTVacState;
67 
68 /*
69  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
70  *
71  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
72  * a new page; others must wait.
73  *
74  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
75  * to a new page; some process can start doing that.
76  *
77  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
78  * We reach this state once for every distinct combination of array keys.
79  */
80 typedef enum
81 {
82  BTPARALLEL_NOT_INITIALIZED,
83  BTPARALLEL_ADVANCING,
84  BTPARALLEL_IDLE,
85  BTPARALLEL_DONE
86 } BTPS_State;
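/*
 * A minimal sketch (hypothetical worker loop, not taken from this file) of
 * how the parallel-scan helpers below drive these states when scanning
 * forward; "next" and "succ" are illustrative variable names only, and the
 * real callers live in nbtsearch.c:
 *
 *    BlockNumber next;
 *
 *    while (_bt_parallel_seize(scan, &next))   -- NOT_INITIALIZED/IDLE -> ADVANCING
 *    {
 *        if (next == P_NONE)
 *        {
 *            _bt_parallel_done(scan);          -- ADVANCING -> DONE
 *            break;
 *        }
 *        ... read block "next", note its right sibling "succ" ...
 *        _bt_parallel_release(scan, succ);     -- ADVANCING -> IDLE
 *    }
 */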
87 
88 /*
89  * BTParallelScanDescData contains btree specific shared information required
90  * for parallel scan.
91  */
92 typedef struct BTParallelScanDescData
93 {
94  BlockNumber btps_scanPage; /* latest or next page to be scanned */
95  BTPS_State btps_pageStatus;/* indicates whether next page is available
96  * for scan. see above for possible states of
97  * parallel scan. */
98  int btps_arrayKeyCount; /* count indicating number of array
99  * scan keys processed by parallel
100  * scan */
101  slock_t btps_mutex; /* protects above variables */
102  ConditionVariable btps_cv; /* used to synchronize parallel scan */
103 } BTParallelScanDescData;
104 
105 typedef struct BTParallelScanDescData *BTParallelScanDesc;
106 
107 
108 static void btbuildCallback(Relation index,
109  HeapTuple htup,
110  Datum *values,
111  bool *isnull,
112  bool tupleIsAlive,
113  void *state);
114 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
115  IndexBulkDeleteCallback callback, void *callback_state,
116  BTCycleId cycleid);
117 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
118  BlockNumber orig_blkno);
119 
120 
121 /*
122  * Btree handler function: return IndexAmRoutine with access method parameters
123  * and callbacks.
124  */
125 Datum
126 bthandler(PG_FUNCTION_ARGS)
127 {
128  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
129 
130  amroutine->amstrategies = BTMaxStrategyNumber;
131  amroutine->amsupport = BTNProcs;
132  amroutine->amcanorder = true;
133  amroutine->amcanorderbyop = false;
134  amroutine->amcanbackward = true;
135  amroutine->amcanunique = true;
136  amroutine->amcanmulticol = true;
137  amroutine->amoptionalkey = true;
138  amroutine->amsearcharray = true;
139  amroutine->amsearchnulls = true;
140  amroutine->amstorage = false;
141  amroutine->amclusterable = true;
142  amroutine->ampredlocks = true;
143  amroutine->amcanparallel = true;
144  amroutine->amkeytype = InvalidOid;
145 
146  amroutine->ambuild = btbuild;
147  amroutine->ambuildempty = btbuildempty;
148  amroutine->aminsert = btinsert;
149  amroutine->ambulkdelete = btbulkdelete;
150  amroutine->amvacuumcleanup = btvacuumcleanup;
151  amroutine->amcanreturn = btcanreturn;
152  amroutine->amcostestimate = btcostestimate;
153  amroutine->amoptions = btoptions;
154  amroutine->amproperty = btproperty;
155  amroutine->amvalidate = btvalidate;
156  amroutine->ambeginscan = btbeginscan;
157  amroutine->amrescan = btrescan;
158  amroutine->amgettuple = btgettuple;
159  amroutine->amgetbitmap = btgetbitmap;
160  amroutine->amendscan = btendscan;
161  amroutine->ammarkpos = btmarkpos;
162  amroutine->amrestrpos = btrestrpos;
163  amroutine->amestimateparallelscan = btestimateparallelscan;
164  amroutine->aminitparallelscan = btinitparallelscan;
165  amroutine->amparallelrescan = btparallelrescan;
166 
167  PG_RETURN_POINTER(amroutine);
168 }
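/*
 * A minimal sketch, for illustration only, of how core code reaches this
 * handler: pg_am.amhandler for the "btree" AM points at bthandler(), and
 * callers obtain the callback table roughly the way GetIndexAmRoutine()
 * does (amhandlerOid is a placeholder name here):
 *
 *    Datum datum = OidFunctionCall0(amhandlerOid);
 *    IndexAmRoutine *amroutine = (IndexAmRoutine *) DatumGetPointer(datum);
 *
 * All subsequent index access goes through the function pointers filled in
 * above (ambuild, aminsert, amgettuple, ...).
 */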
169 
170 /*
171  * btbuild() -- build a new btree index.
172  */
173 IndexBuildResult *
174 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
175 {
176  IndexBuildResult *result;
177  double reltuples;
178  BTBuildState buildstate;
179 
180  buildstate.isUnique = indexInfo->ii_Unique;
181  buildstate.haveDead = false;
182  buildstate.heapRel = heap;
183  buildstate.spool = NULL;
184  buildstate.spool2 = NULL;
185  buildstate.indtuples = 0;
186 
187 #ifdef BTREE_BUILD_STATS
188  if (log_btree_build_stats)
189  ResetUsage();
190 #endif /* BTREE_BUILD_STATS */
191 
192  /*
193  * We expect to be called exactly once for any index relation. If that's
194  * not the case, big trouble's what we have.
195  */
196  if (RelationGetNumberOfBlocks(index) != 0)
197  elog(ERROR, "index \"%s\" already contains data",
198  RelationGetRelationName(index));
199 
200  buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
201 
202  /*
203  * If building a unique index, put dead tuples in a second spool to keep
204  * them out of the uniqueness check.
205  */
206  if (indexInfo->ii_Unique)
207  buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
208 
209  /* do the heap scan */
210  reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
211  btbuildCallback, (void *) &buildstate);
212 
213  /* okay, all heap tuples are indexed */
214  if (buildstate.spool2 && !buildstate.haveDead)
215  {
216  /* spool2 turns out to be unnecessary */
217  _bt_spooldestroy(buildstate.spool2);
218  buildstate.spool2 = NULL;
219  }
220 
221  /*
222  * Finish the build by (1) completing the sort of the spool file, (2)
223  * inserting the sorted tuples into btree pages and (3) building the upper
224  * levels.
225  */
226  _bt_leafbuild(buildstate.spool, buildstate.spool2);
227  _bt_spooldestroy(buildstate.spool);
228  if (buildstate.spool2)
229  _bt_spooldestroy(buildstate.spool2);
230 
231 #ifdef BTREE_BUILD_STATS
232  if (log_btree_build_stats)
233  {
234  ShowUsage("BTREE BUILD STATS");
235  ResetUsage();
236  }
237 #endif /* BTREE_BUILD_STATS */
238 
239  /*
240  * Return statistics
241  */
242  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
243 
244  result->heap_tuples = reltuples;
245  result->index_tuples = buildstate.indtuples;
246 
247  return result;
248 }
249 
250 /*
251  * Per-tuple callback from IndexBuildHeapScan
252  */
253 static void
254 btbuildCallback(Relation index,
255  HeapTuple htup,
256  Datum *values,
257  bool *isnull,
258  bool tupleIsAlive,
259  void *state)
260 {
261  BTBuildState *buildstate = (BTBuildState *) state;
262 
263  /*
264  * insert the index tuple into the appropriate spool file for subsequent
265  * processing
266  */
267  if (tupleIsAlive || buildstate->spool2 == NULL)
268  _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
269  else
270  {
271  /* dead tuples are put into spool2 */
272  buildstate->haveDead = true;
273  _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
274  }
275 
276  buildstate->indtuples += 1;
277 }
278 
279 /*
280  * btbuildempty() -- build an empty btree index in the initialization fork
281  */
282 void
283 btbuildempty(Relation index)
284 {
285  Page metapage;
286 
287  /* Construct metapage. */
288  metapage = (Page) palloc(BLCKSZ);
289  _bt_initmetapage(metapage, P_NONE, 0);
290 
291  /*
292  * Write the page and log it. It might seem that an immediate sync
293  * would be sufficient to guarantee that the file exists on disk, but
294  * recovery itself might remove it while replaying, for example, an
295  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we
296  * need this even when wal_level=minimal.
297  */
298  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
299  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
300  (char *) metapage, true);
301  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
302  BTREE_METAPAGE, metapage, false);
303 
304  /*
305  * An immediate sync is required even if we xlog'd the page, because the
306  * write did not go through shared_buffers and therefore a concurrent
307  * checkpoint may have moved the redo pointer past our xlog record.
308  */
309  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
310 }
311 
312 /*
313  * btinsert() -- insert an index tuple into a btree.
314  *
315  * Descend the tree recursively, find the appropriate location for our
316  * new tuple, and put it there.
317  */
318 bool
319 btinsert(Relation rel, Datum *values, bool *isnull,
320  ItemPointer ht_ctid, Relation heapRel,
321  IndexUniqueCheck checkUnique,
322  IndexInfo *indexInfo)
323 {
324  bool result;
325  IndexTuple itup;
326 
327  /* generate an index tuple */
328  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
329  itup->t_tid = *ht_ctid;
330 
331  result = _bt_doinsert(rel, itup, checkUnique, heapRel);
332 
333  pfree(itup);
334 
335  return result;
336 }
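/*
 * For orientation only: btinsert() is not called directly; the executor goes
 * through the indexam layer, roughly (simplified sketch, argument names are
 * illustrative):
 *
 *    index_insert(indexRel, values, isnull, &heapTuple->t_self,
 *                 heapRel, checkUnique, indexInfo);
 *
 * which dispatches through aminsert, i.e. to btinsert() for btree indexes.
 */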
337 
338 /*
339  * btgettuple() -- Get the next tuple in the scan.
340  */
341 bool
342 btgettuple(IndexScanDesc scan, ScanDirection dir)
343 {
344  BTScanOpaque so = (BTScanOpaque) scan->opaque;
345  bool res;
346 
347  /* btree indexes are never lossy */
348  scan->xs_recheck = false;
349 
350  /*
351  * If we have any array keys, initialize them during first call for a
352  * scan. We can't do this in btrescan because we don't know the scan
353  * direction at that time.
354  */
355  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
356  {
357  /* punt if we have any unsatisfiable array keys */
358  if (so->numArrayKeys < 0)
359  return false;
360 
361  _bt_start_array_keys(scan, dir);
362  }
363 
364  /* This loop handles advancing to the next array elements, if any */
365  do
366  {
367  /*
368  * If we've already initialized this scan, we can just advance it in
369  * the appropriate direction. If we haven't done so yet, we call
370  * _bt_first() to get the first item in the scan.
371  */
372  if (!BTScanPosIsValid(so->currPos))
373  res = _bt_first(scan, dir);
374  else
375  {
376  /*
377  * Check to see if we should kill the previously-fetched tuple.
378  */
379  if (scan->kill_prior_tuple)
380  {
381  /*
382  * Yes, remember it for later. (We'll deal with all such
383  * tuples at once right before leaving the index page.) The
384  * test for numKilled overrun is not just paranoia: if the
385  * caller reverses direction in the indexscan then the same
386  * item might get entered multiple times. It's not worth
387  * trying to optimize that, so we don't detect it, but instead
388  * just forget any excess entries.
389  */
390  if (so->killedItems == NULL)
391  so->killedItems = (int *)
392  palloc(MaxIndexTuplesPerPage * sizeof(int));
393  if (so->numKilled < MaxIndexTuplesPerPage)
394  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
395  }
396 
397  /*
398  * Now continue the scan.
399  */
400  res = _bt_next(scan, dir);
401  }
402 
403  /* If we have a tuple, return it ... */
404  if (res)
405  break;
406  /* ... otherwise see if we have more array keys to deal with */
407  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
408 
409  return res;
410 }
411 
412 /*
413  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
414  */
415 int64
416 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
417 {
418  BTScanOpaque so = (BTScanOpaque) scan->opaque;
419  int64 ntids = 0;
420  ItemPointer heapTid;
421 
422  /*
423  * If we have any array keys, initialize them.
424  */
425  if (so->numArrayKeys)
426  {
427  /* punt if we have any unsatisfiable array keys */
428  if (so->numArrayKeys < 0)
429  return ntids;
430 
431  _bt_start_array_keys(scan, ForwardScanDirection);
432  }
433 
434  /* This loop handles advancing to the next array elements, if any */
435  do
436  {
437  /* Fetch the first page & tuple */
438  if (_bt_first(scan, ForwardScanDirection))
439  {
440  /* Save tuple ID, and continue scanning */
441  heapTid = &scan->xs_ctup.t_self;
442  tbm_add_tuples(tbm, heapTid, 1, false);
443  ntids++;
444 
445  for (;;)
446  {
447  /*
448  * Advance to next tuple within page. This is the same as the
449  * easy case in _bt_next().
450  */
451  if (++so->currPos.itemIndex > so->currPos.lastItem)
452  {
453  /* let _bt_next do the heavy lifting */
454  if (!_bt_next(scan, ForwardScanDirection))
455  break;
456  }
457 
458  /* Save tuple ID, and continue scanning */
459  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
460  tbm_add_tuples(tbm, heapTid, 1, false);
461  ntids++;
462  }
463  }
464  /* Now see if we have more array keys to deal with */
465  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
466 
467  return ntids;
468 }
469 
470 /*
471  * btbeginscan() -- start a scan on a btree index
472  */
473 IndexScanDesc
474 btbeginscan(Relation rel, int nkeys, int norderbys)
475 {
476  IndexScanDesc scan;
477  BTScanOpaque so;
478 
479  /* no order by operators allowed */
480  Assert(norderbys == 0);
481 
482  /* get the scan */
483  scan = RelationGetIndexScan(rel, nkeys, norderbys);
484 
485  /* allocate private workspace */
486  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
487  BTScanPosInvalidate(so->currPos);
488  BTScanPosInvalidate(so->markPos);
489  if (scan->numberOfKeys > 0)
490  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
491  else
492  so->keyData = NULL;
493 
494  so->arrayKeyData = NULL; /* assume no array keys for now */
495  so->numArrayKeys = 0;
496  so->arrayKeys = NULL;
497  so->arrayContext = NULL;
498 
499  so->killedItems = NULL; /* until needed */
500  so->numKilled = 0;
501 
502  /*
503  * We don't know yet whether the scan will be index-only, so we do not
504  * allocate the tuple workspace arrays until btrescan. However, we set up
505  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
506  */
507  so->currTuples = so->markTuples = NULL;
508 
509  scan->xs_itupdesc = RelationGetDescr(rel);
510 
511  scan->opaque = so;
512 
513  return scan;
514 }
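/*
 * A simplified sketch, for orientation only, of the call sequence the
 * indexam layer uses around the scan routines defined here (the real
 * drivers live in indexam.c and the executor):
 *
 *    scan = index_beginscan(heapRel, indexRel, snapshot, nkeys, 0);
 *    index_rescan(scan, scankeys, nkeys, NULL, 0);
 *    while (index_getnext(scan, ForwardScanDirection) != NULL)
 *        ... process the returned heap tuple ...
 *    index_endscan(scan);
 *
 * index_beginscan/index_rescan/index_getnext/index_endscan reach
 * btbeginscan, btrescan, btgettuple and btendscan respectively.
 */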
515 
516 /*
517  * btrescan() -- rescan an index relation
518  */
519 void
520 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
521  ScanKey orderbys, int norderbys)
522 {
523  BTScanOpaque so = (BTScanOpaque) scan->opaque;
524 
525  /* we aren't holding any read locks, but gotta drop the pins */
526  if (BTScanPosIsValid(so->currPos))
527  {
528  /* Before leaving current page, deal with any killed items */
529  if (so->numKilled > 0)
530  _bt_killitems(scan);
531  BTScanPosUnpinIfPinned(so->currPos);
532  BTScanPosInvalidate(so->currPos);
533  }
534 
535  so->markItemIndex = -1;
536  so->arrayKeyCount = 0;
537  BTScanPosUnpinIfPinned(so->markPos);
538  BTScanPosInvalidate(so->markPos);
539 
540  /*
541  * Allocate tuple workspace arrays, if needed for an index-only scan and
542  * not already done in a previous rescan call. To save on palloc
543  * overhead, both workspaces are allocated as one palloc block; only this
544  * function and btendscan know that.
545  *
546  * NOTE: this data structure also makes it safe to return data from a
547  * "name" column, even though btree name_ops uses an underlying storage
548  * datatype of cstring. The risk there is that "name" is supposed to be
549  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
550  * However, since we only return data out of tuples sitting in the
551  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
552  * data out of the markTuples array --- running off the end of memory for
553  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
554  * adding special-case treatment for name_ops elsewhere.
555  */
556  if (scan->xs_want_itup && so->currTuples == NULL)
557  {
558  so->currTuples = (char *) palloc(BLCKSZ * 2);
559  so->markTuples = so->currTuples + BLCKSZ;
560  }
561 
562  /*
563  * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
564  * - vadim 05/05/97
565  */
566  if (scankey && scan->numberOfKeys > 0)
567  memmove(scan->keyData,
568  scankey,
569  scan->numberOfKeys * sizeof(ScanKeyData));
570  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
571 
572  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
573  _bt_preprocess_array_keys(scan);
574 }
575 
576 /*
577  * btendscan() -- close down a scan
578  */
579 void
580 btendscan(IndexScanDesc scan)
581 {
582  BTScanOpaque so = (BTScanOpaque) scan->opaque;
583 
584  /* we aren't holding any read locks, but gotta drop the pins */
585  if (BTScanPosIsValid(so->currPos))
586  {
587  /* Before leaving current page, deal with any killed items */
588  if (so->numKilled > 0)
589  _bt_killitems(scan);
590  BTScanPosUnpinIfPinned(so->currPos);
591  }
592 
593  so->markItemIndex = -1;
594  BTScanPosUnpinIfPinned(so->markPos);
595 
596  /* No need to invalidate positions, the RAM is about to be freed. */
597 
598  /* Release storage */
599  if (so->keyData != NULL)
600  pfree(so->keyData);
601  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
602  if (so->arrayContext != NULL)
603  MemoryContextDelete(so->arrayContext);
604  if (so->killedItems != NULL)
605  pfree(so->killedItems);
606  if (so->currTuples != NULL)
607  pfree(so->currTuples);
608  /* so->markTuples should not be pfree'd, see btrescan */
609  pfree(so);
610 }
611 
612 /*
613  * btmarkpos() -- save current scan position
614  */
615 void
616 btmarkpos(IndexScanDesc scan)
617 {
618  BTScanOpaque so = (BTScanOpaque) scan->opaque;
619 
620  /* There may be an old mark with a pin (but no lock). */
621  BTScanPosUnpinIfPinned(so->markPos);
622 
623  /*
624  * Just record the current itemIndex. If we later step to next page
625  * before releasing the marked position, _bt_steppage makes a full copy of
626  * the currPos struct in markPos. If (as often happens) the mark is moved
627  * before we leave the page, we don't have to do that work.
628  */
629  if (BTScanPosIsValid(so->currPos))
630  so->markItemIndex = so->currPos.itemIndex;
631  else
632  {
633  BTScanPosInvalidate(so->markPos);
634  so->markItemIndex = -1;
635  }
636 
637  /* Also record the current positions of any array keys */
638  if (so->numArrayKeys)
639  _bt_mark_array_keys(scan);
640 }
641 
642 /*
643  * btrestrpos() -- restore scan to last saved position
644  */
645 void
646 btrestrpos(IndexScanDesc scan)
647 {
648  BTScanOpaque so = (BTScanOpaque) scan->opaque;
649 
650  /* Restore the marked positions of any array keys */
651  if (so->numArrayKeys)
652  _bt_restore_array_keys(scan);
653 
654  if (so->markItemIndex >= 0)
655  {
656  /*
657  * The scan has never moved to a new page since the last mark. Just
658  * restore the itemIndex.
659  *
660  * NB: In this case we can't count on anything in so->markPos to be
661  * accurate.
662  */
663  so->currPos.itemIndex = so->markItemIndex;
664  }
665  else
666  {
667  /*
668  * The scan moved to a new page after last mark or restore, and we are
669  * now restoring to the marked page. We aren't holding any read
670  * locks, but if we're still holding the pin for the current position,
671  * we must drop it.
672  */
673  if (BTScanPosIsValid(so->currPos))
674  {
675  /* Before leaving current page, deal with any killed items */
676  if (so->numKilled > 0)
677  _bt_killitems(scan);
678  BTScanPosUnpinIfPinned(so->currPos);
679  }
680 
681  if (BTScanPosIsValid(so->markPos))
682  {
683  /* bump pin on mark buffer for assignment to current buffer */
684  if (BTScanPosIsPinned(so->markPos))
685  IncrBufferRefCount(so->markPos.buf);
686  memcpy(&so->currPos, &so->markPos,
687  offsetof(BTScanPosData, items[1]) +
688  so->markPos.lastItem * sizeof(BTScanPosItem));
689  if (so->currTuples)
690  memcpy(so->currTuples, so->markTuples,
691  so->markPos.nextTupleOffset);
692  }
693  else
694  BTScanPosInvalidate(so->currPos);
695  }
696 }
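/*
 * For orientation: ammarkpos/amrestrpos are exercised through the indexam
 * wrappers, e.g. by merge join (simplified sketch):
 *
 *    index_markpos(scan);        -- reaches btmarkpos()
 *    ... scan forward some more ...
 *    index_restrpos(scan);       -- reaches btrestrpos()
 */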
697 
698 /*
699  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
700  */
701 Size
702 btestimateparallelscan(void)
703 {
704  return sizeof(BTParallelScanDescData);
705 }
706 
707 /*
708  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
709  */
710 void
711 btinitparallelscan(void *target)
712 {
713  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
714 
715  SpinLockInit(&bt_target->btps_mutex);
716  bt_target->btps_scanPage = InvalidBlockNumber;
717  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
718  bt_target->btps_arrayKeyCount = 0;
719  ConditionVariableInit(&bt_target->btps_cv);
720 }
721 
722 /*
723  * btparallelrescan() -- reset parallel scan
724  */
725 void
726 btparallelrescan(IndexScanDesc scan)
727 {
728  BTParallelScanDesc btscan;
729  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
730 
731  Assert(parallel_scan);
732 
733  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
734  parallel_scan->ps_offset);
735 
736  /*
737  * In theory, we don't need to acquire the spinlock here, because there
738  * shouldn't be any other workers running at this point, but we do so for
739  * consistency.
740  */
741  SpinLockAcquire(&btscan->btps_mutex);
742  btscan->btps_scanPage = InvalidBlockNumber;
743  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
744  btscan->btps_arrayKeyCount = 0;
745  SpinLockRelease(&btscan->btps_mutex);
746 }
747 
748 /*
749  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
750  * page. Other scans must wait until we call bt_parallel_release() or
751  * bt_parallel_done().
752  *
753  * The return value is true if we successfully seized the scan and false
754  * if we did not. The latter case occurs if no pages remain for the current
755  * set of scankeys.
756  *
757  * If the return value is true, *pageno returns the next or current page
758  * of the scan (depending on the scan direction). An invalid block number
759  * means the scan hasn't yet started, and P_NONE means we've reached the end.
760  * The first time a participating process reaches the last page, it will return
761  * true and set *pageno to P_NONE; after that, further attempts to seize the
762  * scan will return false.
763  *
764  * Callers should ignore the value of pageno if the return value is false.
765  */
766 bool
767 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
768 {
769  BTScanOpaque so = (BTScanOpaque) scan->opaque;
770  BTPS_State pageStatus;
771  bool exit_loop = false;
772  bool status = true;
773  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
774  BTParallelScanDesc btscan;
775 
776  *pageno = P_NONE;
777 
778  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
779  parallel_scan->ps_offset);
780 
781  while (1)
782  {
783  SpinLockAcquire(&btscan->btps_mutex);
784  pageStatus = btscan->btps_pageStatus;
785 
786  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
787  {
788  /* Parallel scan has already advanced to a new set of scankeys. */
789  status = false;
790  }
791  else if (pageStatus == BTPARALLEL_DONE)
792  {
793  /*
794  * We're done with this set of scankeys. This may be the end, or
795  * there could be more sets to try.
796  */
797  status = false;
798  }
799  else if (pageStatus != BTPARALLEL_ADVANCING)
800  {
801  /*
802  * We have successfully seized control of the scan for the purpose
803  * of advancing it to a new page!
804  */
805  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
806  *pageno = btscan->btps_scanPage;
807  exit_loop = true;
808  }
809  SpinLockRelease(&btscan->btps_mutex);
810  if (exit_loop || !status)
811  break;
812  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
813  }
814  ConditionVariableCancelSleep();
815 
816  return status;
817 }
818 
819 /*
820  * _bt_parallel_release() -- Complete the process of advancing the scan to a
821  * new page. We now have the new value btps_scanPage; some other backend
822  * can now begin advancing the scan.
823  */
824 void
825 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
826 {
827  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
828  BTParallelScanDesc btscan;
829 
830  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
831  parallel_scan->ps_offset);
832 
833  SpinLockAcquire(&btscan->btps_mutex);
834  btscan->btps_scanPage = scan_page;
835  btscan->btps_pageStatus = BTPARALLEL_IDLE;
836  SpinLockRelease(&btscan->btps_mutex);
837  ConditionVariableSignal(&btscan->btps_cv);
838 }
839 
840 /*
841  * _bt_parallel_done() -- Mark the parallel scan as complete.
842  *
843  * When there are no pages left to scan, this function should be called to
844  * notify other workers. Otherwise, they might wait forever for the scan to
845  * advance to the next page.
846  */
847 void
848 _bt_parallel_done(IndexScanDesc scan)
849 {
850  BTScanOpaque so = (BTScanOpaque) scan->opaque;
851  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
852  BTParallelScanDesc btscan;
853  bool status_changed = false;
854 
855  /* Do nothing, for non-parallel scans */
856  if (parallel_scan == NULL)
857  return;
858 
859  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
860  parallel_scan->ps_offset);
861 
862  /*
863  * Mark the parallel scan as done for this combination of scan keys,
864  * unless some other process already did so. See also
865  * _bt_advance_array_keys.
866  */
867  SpinLockAcquire(&btscan->btps_mutex);
868  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
869  btscan->btps_pageStatus != BTPARALLEL_DONE)
870  {
871  btscan->btps_pageStatus = BTPARALLEL_DONE;
872  status_changed = true;
873  }
874  SpinLockRelease(&btscan->btps_mutex);
875 
876  /* wake up all the workers associated with this parallel scan */
877  if (status_changed)
878  ConditionVariableBroadcast(&btscan->btps_cv);
879 }
880 
881 /*
882  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
883  * keys.
884  *
885  * Updates the count of array keys processed for both local and parallel
886  * scans.
887  */
888 void
889 _bt_parallel_advance_array_keys(IndexScanDesc scan)
890 {
891  BTScanOpaque so = (BTScanOpaque) scan->opaque;
892  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
893  BTParallelScanDesc btscan;
894 
895  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
896  parallel_scan->ps_offset);
897 
898  so->arrayKeyCount++;
899  SpinLockAcquire(&btscan->btps_mutex);
900  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
901  {
902  btscan->btps_scanPage = InvalidBlockNumber;
903  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
904  btscan->btps_arrayKeyCount++;
905  }
906  SpinLockRelease(&btscan->btps_mutex);
907 }
908 
909 /*
910  * Bulk deletion of all index entries pointing to a set of heap tuples.
911  * The set of target tuples is specified via a callback routine that tells
912  * whether any given heap tuple (identified by ItemPointer) is being deleted.
913  *
914  * Result: a palloc'd struct containing statistical info for VACUUM displays.
915  */
916 IndexBulkDeleteResult *
917 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
918  IndexBulkDeleteCallback callback, void *callback_state)
919 {
920  Relation rel = info->index;
921  BTCycleId cycleid;
922 
923  /* allocate stats if first time through, else re-use existing struct */
924  if (stats == NULL)
925  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
926 
927  /* Establish the vacuum cycle ID to use for this scan */
928  /* The ENSURE stuff ensures we clean up shared memory on failure */
929  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
930  {
931  cycleid = _bt_start_vacuum(rel);
932 
933  btvacuumscan(info, stats, callback, callback_state, cycleid);
934  }
935  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
936  _bt_end_vacuum(rel);
937 
938  return stats;
939 }
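/*
 * Shape of the callback expected by btbulkdelete() above, as a hedged
 * illustration; the callback actually used by VACUUM is lazy_tid_reaped()
 * in vacuumlazy.c, and the names below are hypothetical:
 *
 *    static bool
 *    example_tid_reaped(ItemPointer itemptr, void *state)
 *    {
 *        ExampleDeadTids *dead = (ExampleDeadTids *) state;
 *
 *        -- return true if this heap TID was removed by the heap pass,
 *        -- which tells btvacuumpage() to delete the index entry
 *        return example_tid_is_dead(dead, itemptr);
 *    }
 */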
940 
941 /*
942  * Post-VACUUM cleanup.
943  *
944  * Result: a palloc'd struct containing statistical info for VACUUM displays.
945  */
946 IndexBulkDeleteResult *
947 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
948 {
949  /* No-op in ANALYZE ONLY mode */
950  if (info->analyze_only)
951  return stats;
952 
953  /*
954  * If btbulkdelete was called, we need not do anything, just return the
955  * stats from the latest btbulkdelete call. If it wasn't called, we must
956  * still do a pass over the index, to recycle any newly-recyclable pages
957  * and to obtain index statistics.
958  *
959  * Since we aren't going to actually delete any leaf items, there's no
960  * need to go through all the vacuum-cycle-ID pushups.
961  */
962  if (stats == NULL)
963  {
964  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
965  btvacuumscan(info, stats, NULL, NULL, 0);
966  }
967 
968  /* Finally, vacuum the FSM */
969  IndexFreeSpaceMapVacuum(info->index);
970 
971  /*
972  * It's quite possible for us to be fooled by concurrent page splits into
973  * double-counting some index tuples, so disbelieve any total that exceeds
974  * the underlying heap's count ... if we know that accurately. Otherwise
975  * this might just make matters worse.
976  */
977  if (!info->estimated_count)
978  {
979  if (stats->num_index_tuples > info->num_heap_tuples)
980  stats->num_index_tuples = info->num_heap_tuples;
981  }
982 
983  return stats;
984 }
985 
986 /*
987  * btvacuumscan --- scan the index for VACUUMing purposes
988  *
989  * This combines the functions of looking for leaf tuples that are deletable
990  * according to the vacuum callback, looking for empty pages that can be
991  * deleted, and looking for old deleted pages that can be recycled. Both
992  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
993  * btbulkdelete call occurred).
994  *
995  * The caller is responsible for initially allocating/zeroing a stats struct
996  * and for obtaining a vacuum cycle ID if necessary.
997  */
998 static void
999 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1000  IndexBulkDeleteCallback callback, void *callback_state,
1001  BTCycleId cycleid)
1002 {
1003  Relation rel = info->index;
1004  BTVacState vstate;
1005  BlockNumber num_pages;
1006  BlockNumber blkno;
1007  bool needLock;
1008 
1009  /*
1010  * Reset counts that will be incremented during the scan; needed in case
1011  * of multiple scans during a single VACUUM command
1012  */
1013  stats->estimated_count = false;
1014  stats->num_index_tuples = 0;
1015  stats->pages_deleted = 0;
1016 
1017  /* Set up info to pass down to btvacuumpage */
1018  vstate.info = info;
1019  vstate.stats = stats;
1020  vstate.callback = callback;
1021  vstate.callback_state = callback_state;
1022  vstate.cycleid = cycleid;
1023  vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
1024  vstate.lastBlockLocked = BTREE_METAPAGE;
1025  vstate.totFreePages = 0;
1026 
1027  /* Create a temporary memory context to run _bt_pagedel in */
1028  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1029  "_bt_pagedel",
1030  ALLOCSET_DEFAULT_SIZES);
1031 
1032  /*
1033  * The outer loop iterates over all index pages except the metapage, in
1034  * physical order (we hope the kernel will cooperate in providing
1035  * read-ahead for speed). It is critical that we visit all leaf pages,
1036  * including ones added after we start the scan, else we might fail to
1037  * delete some deletable tuples. Hence, we must repeatedly check the
1038  * relation length. We must acquire the relation-extension lock while
1039  * doing so to avoid a race condition: if someone else is extending the
1040  * relation, there is a window where bufmgr/smgr have created a new
1041  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1042  * we manage to scan such a page here, we'll improperly assume it can be
1043  * recycled. Taking the lock synchronizes things enough to prevent a
1044  * problem: either num_pages won't include the new page, or _bt_getbuf
1045  * already has write lock on the buffer and it will be fully initialized
1046  * before we can examine it. (See also vacuumlazy.c, which has the same
1047  * issue.) Also, we need not worry if a page is added immediately after
1048  * we look; the page splitting code already has write-lock on the left
1049  * page before it adds a right page, so we must already have processed any
1050  * tuples due to be moved into such a page.
1051  *
1052  * We can skip locking for new or temp relations, however, since no one
1053  * else could be accessing them.
1054  */
1055  needLock = !RELATION_IS_LOCAL(rel);
1056 
1057  blkno = BTREE_METAPAGE + 1;
1058  for (;;)
1059  {
1060  /* Get the current relation length */
1061  if (needLock)
1062  LockRelationForExtension(rel, ExclusiveLock);
1063  num_pages = RelationGetNumberOfBlocks(rel);
1064  if (needLock)
1065  UnlockRelationForExtension(rel, ExclusiveLock);
1066 
1067  /* Quit if we've scanned the whole relation */
1068  if (blkno >= num_pages)
1069  break;
1070  /* Iterate over pages, then loop back to recheck length */
1071  for (; blkno < num_pages; blkno++)
1072  {
1073  btvacuumpage(&vstate, blkno, blkno);
1074  }
1075  }
1076 
1077  /*
1078  * Check to see if we need to issue one final WAL record for this index,
1079  * which may be needed for correctness on a hot standby node when non-MVCC
1080  * index scans could take place.
1081  *
1082  * If the WAL is replayed in hot standby, the replay process needs to get
1083  * cleanup locks on all index leaf pages, just as we've been doing here.
1084  * However, we won't issue any WAL records about pages that have no items
1085  * to be deleted. For pages between pages we've vacuumed, the replay code
1086  * will take locks under the direction of the lastBlockVacuumed fields in
1087  * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1088  * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1089  * against the last leaf page in the index, if that one wasn't vacuumed.
1090  */
1091  if (XLogStandbyInfoActive() &&
1092  vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1093  {
1094  Buffer buf;
1095 
1096  /*
1097  * The page should be valid, but we can't use _bt_getbuf() because we
1098  * want to use a nondefault buffer access strategy. Since we aren't
1099  * going to delete any items, getting cleanup lock again is probably
1100  * overkill, but for consistency do that anyway.
1101  */
1102  buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1103  RBM_NORMAL, info->strategy);
1104  LockBufferForCleanup(buf);
1105  _bt_checkpage(rel, buf);
1106  _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1107  _bt_relbuf(rel, buf);
1108  }
1109 
1110  MemoryContextDelete(vstate.pagedelcontext);
1111 
1112  /* update statistics */
1113  stats->num_pages = num_pages;
1114  stats->pages_free = vstate.totFreePages;
1115 }
1116 
1117 /*
1118  * btvacuumpage --- VACUUM one page
1119  *
1120  * This processes a single page for btvacuumscan(). In some cases we
1121  * must go back and re-examine previously-scanned pages; this routine
1122  * recurses when necessary to handle that case.
1123  *
1124  * blkno is the page to process. orig_blkno is the highest block number
1125  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1126  * are recursing to re-examine a previous page).
1127  */
1128 static void
1129 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1130 {
1131  IndexVacuumInfo *info = vstate->info;
1132  IndexBulkDeleteResult *stats = vstate->stats;
1133  IndexBulkDeleteCallback callback = vstate->callback;
1134  void *callback_state = vstate->callback_state;
1135  Relation rel = info->index;
1136  bool delete_now;
1137  BlockNumber recurse_to;
1138  Buffer buf;
1139  Page page;
1140  BTPageOpaque opaque = NULL;
1141 
1142 restart:
1143  delete_now = false;
1144  recurse_to = P_NONE;
1145 
1146  /* call vacuum_delay_point while not holding any buffer lock */
1147  vacuum_delay_point();
1148 
1149  /*
1150  * We can't use _bt_getbuf() here because it always applies
1151  * _bt_checkpage(), which will barf on an all-zero page. We want to
1152  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1153  * buffer access strategy.
1154  */
1155  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1156  info->strategy);
1157  LockBuffer(buf, BT_READ);
1158  page = BufferGetPage(buf);
1159  if (!PageIsNew(page))
1160  {
1161  _bt_checkpage(rel, buf);
1162  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1163  }
1164 
1165  /*
1166  * If we are recursing, the only case we want to do anything with is a
1167  * live leaf page having the current vacuum cycle ID. Any other state
1168  * implies we already saw the page (eg, deleted it as being empty).
1169  */
1170  if (blkno != orig_blkno)
1171  {
1172  if (_bt_page_recyclable(page) ||
1173  P_IGNORE(opaque) ||
1174  !P_ISLEAF(opaque) ||
1175  opaque->btpo_cycleid != vstate->cycleid)
1176  {
1177  _bt_relbuf(rel, buf);
1178  return;
1179  }
1180  }
1181 
1182  /* Page is valid, see what to do with it */
1183  if (_bt_page_recyclable(page))
1184  {
1185  /* Okay to recycle this page */
1186  RecordFreeIndexPage(rel, blkno);
1187  vstate->totFreePages++;
1188  stats->pages_deleted++;
1189  }
1190  else if (P_ISDELETED(opaque))
1191  {
1192  /* Already deleted, but can't recycle yet */
1193  stats->pages_deleted++;
1194  }
1195  else if (P_ISHALFDEAD(opaque))
1196  {
1197  /* Half-dead, try to delete */
1198  delete_now = true;
1199  }
1200  else if (P_ISLEAF(opaque))
1201  {
1202  OffsetNumber deletable[MaxOffsetNumber];
1203  int ndeletable;
1204  OffsetNumber offnum,
1205  minoff,
1206  maxoff;
1207 
1208  /*
1209  * Trade in the initial read lock for a super-exclusive write lock on
1210  * this page. We must get such a lock on every leaf page over the
1211  * course of the vacuum scan, whether or not it actually contains any
1212  * deletable tuples --- see nbtree/README.
1213  */
1214  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1215  LockBufferForCleanup(buf);
1216 
1217  /*
1218  * Remember highest leaf page number we've taken cleanup lock on; see
1219  * notes in btvacuumscan
1220  */
1221  if (blkno > vstate->lastBlockLocked)
1222  vstate->lastBlockLocked = blkno;
1223 
1224  /*
1225  * Check whether we need to recurse back to earlier pages. What we
1226  * are concerned about is a page split that happened since we started
1227  * the vacuum scan. If the split moved some tuples to a lower page
1228  * then we might have missed 'em. If so, set up for tail recursion.
1229  * (Must do this before possibly clearing btpo_cycleid below!)
1230  */
1231  if (vstate->cycleid != 0 &&
1232  opaque->btpo_cycleid == vstate->cycleid &&
1233  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1234  !P_RIGHTMOST(opaque) &&
1235  opaque->btpo_next < orig_blkno)
1236  recurse_to = opaque->btpo_next;
1237 
1238  /*
1239  * Scan over all items to see which ones need deleted according to the
1240  * callback function.
1241  */
1242  ndeletable = 0;
1243  minoff = P_FIRSTDATAKEY(opaque);
1244  maxoff = PageGetMaxOffsetNumber(page);
1245  if (callback)
1246  {
1247  for (offnum = minoff;
1248  offnum <= maxoff;
1249  offnum = OffsetNumberNext(offnum))
1250  {
1251  IndexTuple itup;
1252  ItemPointer htup;
1253 
1254  itup = (IndexTuple) PageGetItem(page,
1255  PageGetItemId(page, offnum));
1256  htup = &(itup->t_tid);
1257 
1258  /*
1259  * During Hot Standby we currently assume that
1260  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1261  * only true as long as the callback function depends only
1262  * upon whether the index tuple refers to heap tuples removed
1263  * in the initial heap scan. When vacuum starts it derives a
1264  * value of OldestXmin. Backends taking later snapshots could
1265  * have a RecentGlobalXmin with a later xid than the vacuum's
1266  * OldestXmin, so it is possible that row versions deleted
1267  * after OldestXmin could be marked as killed by other
1268  * backends. The callback function *could* look at the index
1269  * tuple state in isolation and decide to delete the index
1270  * tuple, though currently it does not. If it ever did, we
1271  * would need to reconsider whether XLOG_BTREE_VACUUM records
1272  * should cause conflicts. If they did cause conflicts they
1273  * would be fairly harsh conflicts, since we haven't yet
1274  * worked out a way to pass a useful value for
1275  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1276  * applies to *any* type of index that marks index tuples as
1277  * killed.
1278  */
1279  if (callback(htup, callback_state))
1280  deletable[ndeletable++] = offnum;
1281  }
1282  }
1283 
1284  /*
1285  * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1286  * call per page, so as to minimize WAL traffic.
1287  */
1288  if (ndeletable > 0)
1289  {
1290  /*
1291  * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1292  * all information to the replay code to allow it to get a cleanup
1293  * lock on all pages between the previous lastBlockVacuumed and
1294  * this page. This ensures that WAL replay locks all leaf pages at
1295  * some point, which is important should non-MVCC scans be
1296  * requested. This is currently unused on standby, but we record
1297  * it anyway, so that the WAL contains the required information.
1298  *
1299  * Since we can visit leaf pages out-of-order when recursing,
1300  * replay might end up locking such pages an extra time, but it
1301  * doesn't seem worth the amount of bookkeeping it'd take to avoid
1302  * that.
1303  */
1304  _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1305  vstate->lastBlockVacuumed);
1306 
1307  /*
1308  * Remember highest leaf page number we've issued a
1309  * XLOG_BTREE_VACUUM WAL record for.
1310  */
1311  if (blkno > vstate->lastBlockVacuumed)
1312  vstate->lastBlockVacuumed = blkno;
1313 
1314  stats->tuples_removed += ndeletable;
1315  /* must recompute maxoff */
1316  maxoff = PageGetMaxOffsetNumber(page);
1317  }
1318  else
1319  {
1320  /*
1321  * If the page has been split during this vacuum cycle, it seems
1322  * worth expending a write to clear btpo_cycleid even if we don't
1323  * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1324  * care of this.) This ensures we won't process the page again.
1325  *
1326  * We treat this like a hint-bit update because there's no need to
1327  * WAL-log it.
1328  */
1329  if (vstate->cycleid != 0 &&
1330  opaque->btpo_cycleid == vstate->cycleid)
1331  {
1332  opaque->btpo_cycleid = 0;
1333  MarkBufferDirtyHint(buf, true);
1334  }
1335  }
1336 
1337  /*
1338  * If it's now empty, try to delete; else count the live tuples. We
1339  * don't delete when recursing, though, to avoid putting entries into
1340  * freePages out-of-order (doesn't seem worth any extra code to handle
1341  * the case).
1342  */
1343  if (minoff > maxoff)
1344  delete_now = (blkno == orig_blkno);
1345  else
1346  stats->num_index_tuples += maxoff - minoff + 1;
1347  }
1348 
1349  if (delete_now)
1350  {
1351  MemoryContext oldcontext;
1352  int ndel;
1353 
1354  /* Run pagedel in a temp context to avoid memory leakage */
1355  MemoryContextReset(vstate->pagedelcontext);
1356  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1357 
1358  ndel = _bt_pagedel(rel, buf);
1359 
1360  /* count only this page, else may double-count parent */
1361  if (ndel)
1362  stats->pages_deleted++;
1363 
1364  MemoryContextSwitchTo(oldcontext);
1365  /* pagedel released buffer, so we shouldn't */
1366  }
1367  else
1368  _bt_relbuf(rel, buf);
1369 
1370  /*
1371  * This is really tail recursion, but if the compiler is too stupid to
1372  * optimize it as such, we'd eat an uncomfortably large amount of stack
1373  * space per recursion level (due to the deletable[] array). A failure is
1374  * improbable since the number of levels isn't likely to be large ... but
1375  * just in case, let's hand-optimize into a loop.
1376  */
1377  if (recurse_to != P_NONE)
1378  {
1379  blkno = recurse_to;
1380  goto restart;
1381  }
1382 }
1383 
1384 /*
1385  * btcanreturn() -- Check whether btree indexes support index-only scans.
1386  *
1387  * btrees always do, so this is trivial.
1388  */
1389 bool
1390 btcanreturn(Relation index, int attno)
1391 {
1392  return true;
1393 }