nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "nodes/execnodes.h"
28 #include "pgstat.h"
29 #include "postmaster/autovacuum.h"
30 #include "storage/condition_variable.h"
31 #include "storage/indexfsm.h"
32 #include "storage/ipc.h"
33 #include "storage/lmgr.h"
34 #include "storage/smgr.h"
35 #include "utils/builtins.h"
36 #include "utils/index_selfuncs.h"
37 #include "utils/memutils.h"
38 
39 
40 /* Working state needed by btvacuumpage */
41 typedef struct
42 {
43  IndexVacuumInfo *info;
44  IndexBulkDeleteResult *stats;
45  IndexBulkDeleteCallback callback;
46  void *callback_state;
47  BTCycleId cycleid;
48  BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
49  BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
50  BlockNumber totFreePages; /* true total # of free pages */
51  TransactionId oldestBtpoXact;
52  MemoryContext pagedelcontext;
53 } BTVacState;
54 
55 /*
56  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
57  *
58  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
59  * a new page; others must wait.
60  *
61  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
62  * to a new page; some process can start doing that.
63  *
64  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
65  * We reach this state once for every distinct combination of array keys.
66  */
67 typedef enum
68 {
69  BTPARALLEL_NOT_INITIALIZED,
70  BTPARALLEL_ADVANCING,
71  BTPARALLEL_IDLE,
72  BTPARALLEL_DONE
73 } BTPS_State;
74 
75 /*
76  * BTParallelScanDescData contains btree specific shared information required
77  * for parallel scan.
78  */
79 typedef struct BTParallelScanDescData
80 {
81  BlockNumber btps_scanPage; /* latest or next page to be scanned */
82  BTPS_State btps_pageStatus; /* indicates whether next page is
83  * available for scan. see above for
84  * possible states of parallel scan. */
85  int btps_arrayKeyCount; /* count indicating number of array scan
86  * keys processed by parallel scan */
87  slock_t btps_mutex; /* protects above variables */
88  ConditionVariable btps_cv; /* used to synchronize parallel scan */
89 } BTParallelScanDescData;
90 
91 typedef struct BTParallelScanDescData *BTParallelScanDesc;
92 
93 
94 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
95  IndexBulkDeleteCallback callback, void *callback_state,
96  BTCycleId cycleid, TransactionId *oldestBtpoXact);
97 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
98  BlockNumber orig_blkno);
99 
100 
101 /*
102  * Btree handler function: return IndexAmRoutine with access method parameters
103  * and callbacks.
104  */
105 Datum
106 bthandler(PG_FUNCTION_ARGS)
107 {
108  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
109 
110  amroutine->amstrategies = BTMaxStrategyNumber;
111  amroutine->amsupport = BTNProcs;
112  amroutine->amcanorder = true;
113  amroutine->amcanorderbyop = false;
114  amroutine->amcanbackward = true;
115  amroutine->amcanunique = true;
116  amroutine->amcanmulticol = true;
117  amroutine->amoptionalkey = true;
118  amroutine->amsearcharray = true;
119  amroutine->amsearchnulls = true;
120  amroutine->amstorage = false;
121  amroutine->amclusterable = true;
122  amroutine->ampredlocks = true;
123  amroutine->amcanparallel = true;
124  amroutine->amcaninclude = true;
125  amroutine->amkeytype = InvalidOid;
126 
127  amroutine->ambuild = btbuild;
128  amroutine->ambuildempty = btbuildempty;
129  amroutine->aminsert = btinsert;
130  amroutine->ambulkdelete = btbulkdelete;
131  amroutine->amvacuumcleanup = btvacuumcleanup;
132  amroutine->amcanreturn = btcanreturn;
133  amroutine->amcostestimate = btcostestimate;
134  amroutine->amoptions = btoptions;
135  amroutine->amproperty = btproperty;
136  amroutine->amvalidate = btvalidate;
137  amroutine->ambeginscan = btbeginscan;
138  amroutine->amrescan = btrescan;
139  amroutine->amgettuple = btgettuple;
140  amroutine->amgetbitmap = btgetbitmap;
141  amroutine->amendscan = btendscan;
142  amroutine->ammarkpos = btmarkpos;
143  amroutine->amrestrpos = btrestrpos;
144  amroutine->amestimateparallelscan = btestimateparallelscan;
145  amroutine->aminitparallelscan = btinitparallelscan;
146  amroutine->amparallelrescan = btparallelrescan;
147 
148  PG_RETURN_POINTER(amroutine);
149 }
150 
151 /*
152  * btbuildempty() -- build an empty btree index in the initialization fork
153  */
154 void
155 btbuildempty(Relation index)
156 {
157  Page metapage;
158 
159  /* Construct metapage. */
160  metapage = (Page) palloc(BLCKSZ);
161  _bt_initmetapage(metapage, P_NONE, 0);
162 
163  /*
164  * Write the page and log it. It might seem that an immediate sync would
165  * be sufficient to guarantee that the file exists on disk, but recovery
166  * itself might remove it while replaying, for example, an
167  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
168  * this even when wal_level=minimal.
169  */
170  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
171  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
172  (char *) metapage, true);
173  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
174  BTREE_METAPAGE, metapage, true);
175 
176  /*
177  * An immediate sync is required even if we xlog'd the page, because the
178  * write did not go through shared_buffers and therefore a concurrent
179  * checkpoint may have moved the redo pointer past our xlog record.
180  */
181  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
182 }
183 
184 /*
185  * btinsert() -- insert an index tuple into a btree.
186  *
187  * Descend the tree recursively, find the appropriate location for our
188  * new tuple, and put it there.
189  */
190 bool
191 btinsert(Relation rel, Datum *values, bool *isnull,
192  ItemPointer ht_ctid, Relation heapRel,
193  IndexUniqueCheck checkUnique,
194  IndexInfo *indexInfo)
195 {
196  bool result;
197  IndexTuple itup;
198 
199  /* generate an index tuple */
200  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
201  itup->t_tid = *ht_ctid;
202 
203  result = _bt_doinsert(rel, itup, checkUnique, heapRel);
204 
205  pfree(itup);
206 
207  return result;
208 }
209 
210 /*
211  * btgettuple() -- Get the next tuple in the scan.
212  */
213 bool
214 btgettuple(IndexScanDesc scan, ScanDirection dir)
215 {
216  BTScanOpaque so = (BTScanOpaque) scan->opaque;
217  bool res;
218 
219  /* btree indexes are never lossy */
220  scan->xs_recheck = false;
221 
222  /*
223  * If we have any array keys, initialize them during first call for a
224  * scan. We can't do this in btrescan because we don't know the scan
225  * direction at that time.
226  */
227  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
228  {
229  /* punt if we have any unsatisfiable array keys */
230  if (so->numArrayKeys < 0)
231  return false;
232 
233  _bt_start_array_keys(scan, dir);
234  }
235 
236  /* This loop handles advancing to the next array elements, if any */
237  do
238  {
239  /*
240  * If we've already initialized this scan, we can just advance it in
241  * the appropriate direction. If we haven't done so yet, we call
242  * _bt_first() to get the first item in the scan.
243  */
244  if (!BTScanPosIsValid(so->currPos))
245  res = _bt_first(scan, dir);
246  else
247  {
248  /*
249  * Check to see if we should kill the previously-fetched tuple.
250  */
251  if (scan->kill_prior_tuple)
252  {
253  /*
254  * Yes, remember it for later. (We'll deal with all such
255  * tuples at once right before leaving the index page.) The
256  * test for numKilled overrun is not just paranoia: if the
257  * caller reverses direction in the indexscan then the same
258  * item might get entered multiple times. It's not worth
259  * trying to optimize that, so we don't detect it, but instead
260  * just forget any excess entries.
261  */
262  if (so->killedItems == NULL)
263  so->killedItems = (int *)
264  palloc(MaxIndexTuplesPerPage * sizeof(int));
265  if (so->numKilled < MaxIndexTuplesPerPage)
266  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267  }
268 
269  /*
270  * Now continue the scan.
271  */
272  res = _bt_next(scan, dir);
273  }
274 
275  /* If we have a tuple, return it ... */
276  if (res)
277  break;
278  /* ... otherwise see if we have more array keys to deal with */
279  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
280 
281  return res;
282 }
283 
284 /*
285  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
286  */
287 int64
288 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
289 {
290  BTScanOpaque so = (BTScanOpaque) scan->opaque;
291  int64 ntids = 0;
292  ItemPointer heapTid;
293 
294  /*
295  * If we have any array keys, initialize them.
296  */
297  if (so->numArrayKeys)
298  {
299  /* punt if we have any unsatisfiable array keys */
300  if (so->numArrayKeys < 0)
301  return ntids;
302 
303  _bt_start_array_keys(scan, ForwardScanDirection);
304  }
305 
306  /* This loop handles advancing to the next array elements, if any */
307  do
308  {
309  /* Fetch the first page & tuple */
310  if (_bt_first(scan, ForwardScanDirection))
311  {
312  /* Save tuple ID, and continue scanning */
313  heapTid = &scan->xs_ctup.t_self;
314  tbm_add_tuples(tbm, heapTid, 1, false);
315  ntids++;
316 
317  for (;;)
318  {
319  /*
320  * Advance to next tuple within page. This is the same as the
321  * easy case in _bt_next().
322  */
323  if (++so->currPos.itemIndex > so->currPos.lastItem)
324  {
325  /* let _bt_next do the heavy lifting */
326  if (!_bt_next(scan, ForwardScanDirection))
327  break;
328  }
329 
330  /* Save tuple ID, and continue scanning */
331  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
332  tbm_add_tuples(tbm, heapTid, 1, false);
333  ntids++;
334  }
335  }
336  /* Now see if we have more array keys to deal with */
337  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
338 
339  return ntids;
340 }
341 
342 /*
343  * btbeginscan() -- start a scan on a btree index
344  */
345 IndexScanDesc
346 btbeginscan(Relation rel, int nkeys, int norderbys)
347 {
348  IndexScanDesc scan;
349  BTScanOpaque so;
350 
351  /* no order by operators allowed */
352  Assert(norderbys == 0);
353 
354  /* get the scan */
355  scan = RelationGetIndexScan(rel, nkeys, norderbys);
356 
357  /* allocate private workspace */
358  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
359  BTScanPosInvalidate(so->currPos);
360  BTScanPosInvalidate(so->markPos);
361  if (scan->numberOfKeys > 0)
362  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
363  else
364  so->keyData = NULL;
365 
366  so->arrayKeyData = NULL; /* assume no array keys for now */
367  so->numArrayKeys = 0;
368  so->arrayKeys = NULL;
369  so->arrayContext = NULL;
370 
371  so->killedItems = NULL; /* until needed */
372  so->numKilled = 0;
373 
374  /*
375  * We don't know yet whether the scan will be index-only, so we do not
376  * allocate the tuple workspace arrays until btrescan. However, we set up
377  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
378  */
379  so->currTuples = so->markTuples = NULL;
380 
381  scan->xs_itupdesc = RelationGetDescr(rel);
382 
383  scan->opaque = so;
384 
385  return scan;
386 }
387 
388 /*
389  * btrescan() -- rescan an index relation
390  */
391 void
392 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
393  ScanKey orderbys, int norderbys)
394 {
395  BTScanOpaque so = (BTScanOpaque) scan->opaque;
396 
397  /* we aren't holding any read locks, but gotta drop the pins */
398  if (BTScanPosIsValid(so->currPos))
399  {
400  /* Before leaving current page, deal with any killed items */
401  if (so->numKilled > 0)
402  _bt_killitems(scan);
403  BTScanPosUnpinIfPinned(so->currPos);
404  BTScanPosInvalidate(so->currPos);
405  }
406 
407  so->markItemIndex = -1;
408  so->arrayKeyCount = 0;
409  BTScanPosUnpinIfPinned(so->markPos);
410  BTScanPosInvalidate(so->markPos);
411 
412  /*
413  * Allocate tuple workspace arrays, if needed for an index-only scan and
414  * not already done in a previous rescan call. To save on palloc
415  * overhead, both workspaces are allocated as one palloc block; only this
416  * function and btendscan know that.
417  *
418  * NOTE: this data structure also makes it safe to return data from a
419  * "name" column, even though btree name_ops uses an underlying storage
420  * datatype of cstring. The risk there is that "name" is supposed to be
421  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
422  * However, since we only return data out of tuples sitting in the
423  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
424  * data out of the markTuples array --- running off the end of memory for
425  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
426  * adding special-case treatment for name_ops elsewhere.
427  */
428  if (scan->xs_want_itup && so->currTuples == NULL)
429  {
430  so->currTuples = (char *) palloc(BLCKSZ * 2);
431  so->markTuples = so->currTuples + BLCKSZ;
432  }
433 
434  /*
435  * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
436  * - vadim 05/05/97
437  */
438  if (scankey && scan->numberOfKeys > 0)
439  memmove(scan->keyData,
440  scankey,
441  scan->numberOfKeys * sizeof(ScanKeyData));
442  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
443 
444  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
445  _bt_preprocess_array_keys(scan);
446 }
447 
448 /*
449  * btendscan() -- close down a scan
450  */
451 void
452 btendscan(IndexScanDesc scan)
453 {
454  BTScanOpaque so = (BTScanOpaque) scan->opaque;
455 
456  /* we aren't holding any read locks, but gotta drop the pins */
457  if (BTScanPosIsValid(so->currPos))
458  {
459  /* Before leaving current page, deal with any killed items */
460  if (so->numKilled > 0)
461  _bt_killitems(scan);
462  BTScanPosUnpinIfPinned(so->currPos);
463  }
464 
465  so->markItemIndex = -1;
466  BTScanPosUnpinIfPinned(so->markPos);
467 
468  /* No need to invalidate positions, the RAM is about to be freed. */
469 
470  /* Release storage */
471  if (so->keyData != NULL)
472  pfree(so->keyData);
473  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
474  if (so->arrayContext != NULL)
475  MemoryContextDelete(so->arrayContext);
476  if (so->killedItems != NULL)
477  pfree(so->killedItems);
478  if (so->currTuples != NULL)
479  pfree(so->currTuples);
480  /* so->markTuples should not be pfree'd, see btrescan */
481  pfree(so);
482 }
483 
484 /*
485  * btmarkpos() -- save current scan position
486  */
487 void
488 btmarkpos(IndexScanDesc scan)
489 {
490  BTScanOpaque so = (BTScanOpaque) scan->opaque;
491 
492  /* There may be an old mark with a pin (but no lock). */
493  BTScanPosUnpinIfPinned(so->markPos);
494 
495  /*
496  * Just record the current itemIndex. If we later step to next page
497  * before releasing the marked position, _bt_steppage makes a full copy of
498  * the currPos struct in markPos. If (as often happens) the mark is moved
499  * before we leave the page, we don't have to do that work.
500  */
501  if (BTScanPosIsValid(so->currPos))
502  so->markItemIndex = so->currPos.itemIndex;
503  else
504  {
505  BTScanPosInvalidate(so->markPos);
506  so->markItemIndex = -1;
507  }
508 
509  /* Also record the current positions of any array keys */
510  if (so->numArrayKeys)
511  _bt_mark_array_keys(scan);
512 }
513 
514 /*
515  * btrestrpos() -- restore scan to last saved position
516  */
517 void
518 btrestrpos(IndexScanDesc scan)
519 {
520  BTScanOpaque so = (BTScanOpaque) scan->opaque;
521 
522  /* Restore the marked positions of any array keys */
523  if (so->numArrayKeys)
524  _bt_restore_array_keys(scan);
525 
526  if (so->markItemIndex >= 0)
527  {
528  /*
529  * The scan has never moved to a new page since the last mark. Just
530  * restore the itemIndex.
531  *
532  * NB: In this case we can't count on anything in so->markPos to be
533  * accurate.
534  */
535  so->currPos.itemIndex = so->markItemIndex;
536  }
537  else
538  {
539  /*
540  * The scan moved to a new page after last mark or restore, and we are
541  * now restoring to the marked page. We aren't holding any read
542  * locks, but if we're still holding the pin for the current position,
543  * we must drop it.
544  */
545  if (BTScanPosIsValid(so->currPos))
546  {
547  /* Before leaving current page, deal with any killed items */
548  if (so->numKilled > 0)
549  _bt_killitems(scan);
550  BTScanPosUnpinIfPinned(so->currPos);
551  }
552 
553  if (BTScanPosIsValid(so->markPos))
554  {
555  /* bump pin on mark buffer for assignment to current buffer */
556  if (BTScanPosIsPinned(so->markPos))
557  IncrBufferRefCount(so->markPos.buf);
558  memcpy(&so->currPos, &so->markPos,
559  offsetof(BTScanPosData, items[1]) +
560  so->markPos.lastItem * sizeof(BTScanPosItem));
561  if (so->currTuples)
562  memcpy(so->currTuples, so->markTuples,
563  so->markPos.nextTupleOffset);
564  }
565  else
566  BTScanPosInvalidate(so->currPos);
567  }
568 }
569 
570 /*
571  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
572  */
573 Size
574 btestimateparallelscan(void)
575 {
576  return sizeof(BTParallelScanDescData);
577 }
578 
579 /*
580  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
581  */
582 void
583 btinitparallelscan(void *target)
584 {
585  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
586 
587  SpinLockInit(&bt_target->btps_mutex);
588  bt_target->btps_scanPage = InvalidBlockNumber;
589  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
590  bt_target->btps_arrayKeyCount = 0;
591  ConditionVariableInit(&bt_target->btps_cv);
592 }
593 
594 /*
595  * btparallelrescan() -- reset parallel scan
596  */
597 void
598 btparallelrescan(IndexScanDesc scan)
599 {
600  BTParallelScanDesc btscan;
601  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
602 
603  Assert(parallel_scan);
604 
605  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
606  parallel_scan->ps_offset);
607 
608  /*
609  * In theory, we don't need to acquire the spinlock here, because there
610  * shouldn't be any other workers running at this point, but we do so for
611  * consistency.
612  */
613  SpinLockAcquire(&btscan->btps_mutex);
614  btscan->btps_scanPage = InvalidBlockNumber;
615  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
616  btscan->btps_arrayKeyCount = 0;
617  SpinLockRelease(&btscan->btps_mutex);
618 }
619 
620 /*
621  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
622  * page. Other scans must wait until we call _bt_parallel_release() or
623  * _bt_parallel_done().
624  *
625  * The return value is true if we successfully seized the scan and false
626  * if we did not. The latter case occurs if no pages remain for the current
627  * set of scankeys.
628  *
629  * If the return value is true, *pageno returns the next or current page
630  * of the scan (depending on the scan direction). An invalid block number
631  * means the scan hasn't yet started, and P_NONE means we've reached the end.
632  * The first time a participating process reaches the last page, it will return
633  * true and set *pageno to P_NONE; after that, further attempts to seize the
634  * scan will return false.
635  *
636  * Callers should ignore the value of pageno if the return value is false.
637  */
638 bool
639 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
640 {
641  BTScanOpaque so = (BTScanOpaque) scan->opaque;
642  BTPS_State pageStatus;
643  bool exit_loop = false;
644  bool status = true;
645  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
646  BTParallelScanDesc btscan;
647 
648  *pageno = P_NONE;
649 
650  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
651  parallel_scan->ps_offset);
652 
653  while (1)
654  {
655  SpinLockAcquire(&btscan->btps_mutex);
656  pageStatus = btscan->btps_pageStatus;
657 
658  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
659  {
660  /* Parallel scan has already advanced to a new set of scankeys. */
661  status = false;
662  }
663  else if (pageStatus == BTPARALLEL_DONE)
664  {
665  /*
666  * We're done with this set of scankeys. This may be the end, or
667  * there could be more sets to try.
668  */
669  status = false;
670  }
671  else if (pageStatus != BTPARALLEL_ADVANCING)
672  {
673  /*
674  * We have successfully seized control of the scan for the purpose
675  * of advancing it to a new page!
676  */
677  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
678  *pageno = btscan->btps_scanPage;
679  exit_loop = true;
680  }
681  SpinLockRelease(&btscan->btps_mutex);
682  if (exit_loop || !status)
683  break;
684  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
685  }
686  ConditionVariableCancelSleep();
687 
688  return status;
689 }
690 
691 /*
692  * _bt_parallel_release() -- Complete the process of advancing the scan to a
693  * new page. We now have the new value btps_scanPage; some other backend
694  * can now begin advancing the scan.
695  */
696 void
697 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
698 {
699  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
700  BTParallelScanDesc btscan;
701 
702  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
703  parallel_scan->ps_offset);
704 
705  SpinLockAcquire(&btscan->btps_mutex);
706  btscan->btps_scanPage = scan_page;
707  btscan->btps_pageStatus = BTPARALLEL_IDLE;
708  SpinLockRelease(&btscan->btps_mutex);
709  ConditionVariableSignal(&btscan->btps_cv);
710 }
711 
712 /*
713  * _bt_parallel_done() -- Mark the parallel scan as complete.
714  *
715  * When there are no pages left to scan, this function should be called to
716  * notify other workers. Otherwise, they might wait forever for the scan to
717  * advance to the next page.
718  */
719 void
720 _bt_parallel_done(IndexScanDesc scan)
721 {
722  BTScanOpaque so = (BTScanOpaque) scan->opaque;
723  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
724  BTParallelScanDesc btscan;
725  bool status_changed = false;
726 
727  /* Do nothing, for non-parallel scans */
728  if (parallel_scan == NULL)
729  return;
730 
731  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
732  parallel_scan->ps_offset);
733 
734  /*
735  * Mark the parallel scan as done for this combination of scan keys,
736  * unless some other process already did so. See also
737  * _bt_advance_array_keys.
738  */
739  SpinLockAcquire(&btscan->btps_mutex);
740  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
741  btscan->btps_pageStatus != BTPARALLEL_DONE)
742  {
743  btscan->btps_pageStatus = BTPARALLEL_DONE;
744  status_changed = true;
745  }
746  SpinLockRelease(&btscan->btps_mutex);
747 
748  /* wake up all the workers associated with this parallel scan */
749  if (status_changed)
750  ConditionVariableBroadcast(&btscan->btps_cv);
751 }
752 
753 /*
754  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
755  * keys.
756  *
757  * Updates the count of array keys processed for both local and parallel
758  * scans.
759  */
760 void
761 _bt_parallel_advance_array_keys(IndexScanDesc scan)
762 {
763  BTScanOpaque so = (BTScanOpaque) scan->opaque;
764  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
765  BTParallelScanDesc btscan;
766 
767  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
768  parallel_scan->ps_offset);
769 
770  so->arrayKeyCount++;
771  SpinLockAcquire(&btscan->btps_mutex);
772  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
773  {
774  btscan->btps_scanPage = InvalidBlockNumber;
775  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
776  btscan->btps_arrayKeyCount++;
777  }
778  SpinLockRelease(&btscan->btps_mutex);
779 }
780 
781 /*
782  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
783  * btbulkdelete() wasn't called.
784  */
785 static bool
786 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
787 {
788  Buffer metabuf;
789  Page metapg;
790  BTMetaPageData *metad;
791  bool result = false;
792 
793  metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
794  metapg = BufferGetPage(metabuf);
795  metad = BTPageGetMeta(metapg);
796 
797  if (metad->btm_version < BTREE_VERSION)
798  {
799  /*
800  * Do cleanup if metapage needs upgrade, because we don't have
801  * cleanup-related meta-information yet.
802  */
803  result = true;
804  }
805  else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
806  TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
807  RecentGlobalXmin))
808  {
809  /*
810  * If oldest btpo.xact in the deleted pages is older than
811  * RecentGlobalXmin, then at least one deleted page can be recycled.
812  */
813  result = true;
814  }
815  else
816  {
817  StdRdOptions *relopts;
818  float8 cleanup_scale_factor;
819 
820  /*
821  * If the table has received a large enough volume of insertions with no
822  * intervening cleanup, the index statistics may become stale. To avoid
823  * that, we perform cleanup once the table has received
824  * vacuum_cleanup_index_scale_factor fractions of insertions.
825  */
826  relopts = (StdRdOptions *) info->index->rd_options;
827  cleanup_scale_factor = (relopts &&
828  relopts->vacuum_cleanup_index_scale_factor >= 0)
829  ? relopts->vacuum_cleanup_index_scale_factor
830  : vacuum_cleanup_index_scale_factor;
831 
832  if (cleanup_scale_factor < 0 ||
833  metad->btm_last_cleanup_num_heap_tuples < 0 ||
834  info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
835  metad->btm_last_cleanup_num_heap_tuples)
836  result = true;
837  }
838 
839  _bt_relbuf(info->index, metabuf);
840  return result;
841 }
842 
843 /*
844  * Bulk deletion of all index entries pointing to a set of heap tuples.
845  * The set of target tuples is specified via a callback routine that tells
846  * whether any given heap tuple (identified by ItemPointer) is being deleted.
847  *
848  * Result: a palloc'd struct containing statistical info for VACUUM displays.
849  */
850 IndexBulkDeleteResult *
851 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
852  IndexBulkDeleteCallback callback, void *callback_state)
853 {
854  Relation rel = info->index;
855  BTCycleId cycleid;
856 
857  /* allocate stats if first time through, else re-use existing struct */
858  if (stats == NULL)
859  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
860 
861  /* Establish the vacuum cycle ID to use for this scan */
862  /* The ENSURE stuff ensures we clean up shared memory on failure */
863  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
864  {
865  TransactionId oldestBtpoXact;
866 
867  cycleid = _bt_start_vacuum(rel);
868 
869  btvacuumscan(info, stats, callback, callback_state, cycleid,
870  &oldestBtpoXact);
871 
872  /*
873  * Update cleanup-related information in the metapage. This information
874  * is used only for cleanup, but keeping it up to date can avoid
875  * unnecessary cleanup even after bulkdelete.
876  */
877  _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
878  info->num_heap_tuples);
879  }
880  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
881  _bt_end_vacuum(rel);
882 
883  return stats;
884 }
885 
886 /*
887  * Post-VACUUM cleanup.
888  *
889  * Result: a palloc'd struct containing statistical info for VACUUM displays.
890  */
891 IndexBulkDeleteResult *
892 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
893 {
894  /* No-op in ANALYZE ONLY mode */
895  if (info->analyze_only)
896  return stats;
897 
898  /*
899  * If btbulkdelete was called, we need not do anything, just return the
900  * stats from the latest btbulkdelete call. If it wasn't called, we might
901  * still need to do a pass over the index, to recycle any newly-recyclable
902  * pages and to obtain index statistics. _bt_vacuum_needs_cleanup checks
903  * whether there are newly-recyclable pages or stale index statistics.
904  *
905  * Since we aren't going to actually delete any leaf items, there's no
906  * need to go through all the vacuum-cycle-ID pushups.
907  */
908  if (stats == NULL)
909  {
910  TransactionId oldestBtpoXact;
911 
912  /* Check if we need a cleanup */
913  if (!_bt_vacuum_needs_cleanup(info))
914  return NULL;
915 
916  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
917  btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
918 
919  /* Update cleanup-related information in the metapage */
920  _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
921  info->num_heap_tuples);
922  }
923 
924  /*
925  * It's quite possible for us to be fooled by concurrent page splits into
926  * double-counting some index tuples, so disbelieve any total that exceeds
927  * the underlying heap's count ... if we know that accurately. Otherwise
928  * this might just make matters worse.
929  */
930  if (!info->estimated_count)
931  {
932  if (stats->num_index_tuples > info->num_heap_tuples)
933  stats->num_index_tuples = info->num_heap_tuples;
934  }
935 
936  return stats;
937 }
938 
939 /*
940  * btvacuumscan --- scan the index for VACUUMing purposes
941  *
942  * This combines the functions of looking for leaf tuples that are deletable
943  * according to the vacuum callback, looking for empty pages that can be
944  * deleted, and looking for old deleted pages that can be recycled. Both
945  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
946  * btbulkdelete call occurred).
947  *
948  * The caller is responsible for initially allocating/zeroing a stats struct
949  * and for obtaining a vacuum cycle ID if necessary.
950  */
951 static void
952 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
953  IndexBulkDeleteCallback callback, void *callback_state,
954  BTCycleId cycleid, TransactionId *oldestBtpoXact)
955 {
956  Relation rel = info->index;
957  BTVacState vstate;
958  BlockNumber num_pages;
959  BlockNumber blkno;
960  bool needLock;
961 
962  /*
963  * Reset counts that will be incremented during the scan; needed in case
964  * of multiple scans during a single VACUUM command
965  */
966  stats->estimated_count = false;
967  stats->num_index_tuples = 0;
968  stats->pages_deleted = 0;
969 
970  /* Set up info to pass down to btvacuumpage */
971  vstate.info = info;
972  vstate.stats = stats;
973  vstate.callback = callback;
974  vstate.callback_state = callback_state;
975  vstate.cycleid = cycleid;
976  vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
977  vstate.lastBlockLocked = BTREE_METAPAGE;
978  vstate.totFreePages = 0;
979  vstate.oldestBtpoXact = InvalidTransactionId;
980 
981  /* Create a temporary memory context to run _bt_pagedel in */
982  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
983  "_bt_pagedel",
984  ALLOCSET_DEFAULT_SIZES);
985 
986  /*
987  * The outer loop iterates over all index pages except the metapage, in
988  * physical order (we hope the kernel will cooperate in providing
989  * read-ahead for speed). It is critical that we visit all leaf pages,
990  * including ones added after we start the scan, else we might fail to
991  * delete some deletable tuples. Hence, we must repeatedly check the
992  * relation length. We must acquire the relation-extension lock while
993  * doing so to avoid a race condition: if someone else is extending the
994  * relation, there is a window where bufmgr/smgr have created a new
995  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
996  * we manage to scan such a page here, we'll improperly assume it can be
997  * recycled. Taking the lock synchronizes things enough to prevent a
998  * problem: either num_pages won't include the new page, or _bt_getbuf
999  * already has write lock on the buffer and it will be fully initialized
1000  * before we can examine it. (See also vacuumlazy.c, which has the same
1001  * issue.) Also, we need not worry if a page is added immediately after
1002  * we look; the page splitting code already has write-lock on the left
1003  * page before it adds a right page, so we must already have processed any
1004  * tuples due to be moved into such a page.
1005  *
1006  * We can skip locking for new or temp relations, however, since no one
1007  * else could be accessing them.
1008  */
1009  needLock = !RELATION_IS_LOCAL(rel);
1010 
1011  blkno = BTREE_METAPAGE + 1;
1012  for (;;)
1013  {
1014  /* Get the current relation length */
1015  if (needLock)
1016  LockRelationForExtension(rel, ExclusiveLock);
1017  num_pages = RelationGetNumberOfBlocks(rel);
1018  if (needLock)
1019  UnlockRelationForExtension(rel, ExclusiveLock);
1020 
1021  /* Quit if we've scanned the whole relation */
1022  if (blkno >= num_pages)
1023  break;
1024  /* Iterate over pages, then loop back to recheck length */
1025  for (; blkno < num_pages; blkno++)
1026  {
1027  btvacuumpage(&vstate, blkno, blkno);
1028  }
1029  }
1030 
1031  /*
1032  * Check to see if we need to issue one final WAL record for this index,
1033  * which may be needed for correctness on a hot standby node when non-MVCC
1034  * index scans could take place.
1035  *
1036  * If the WAL is replayed in hot standby, the replay process needs to get
1037  * cleanup locks on all index leaf pages, just as we've been doing here.
1038  * However, we won't issue any WAL records about pages that have no items
1039  * to be deleted. For pages between pages we've vacuumed, the replay code
1040  * will take locks under the direction of the lastBlockVacuumed fields in
1041  * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1042  * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1043  * against the last leaf page in the index, if that one wasn't vacuumed.
1044  */
1045  if (XLogStandbyInfoActive() &&
1046  vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1047  {
1048  Buffer buf;
1049 
1050  /*
1051  * The page should be valid, but we can't use _bt_getbuf() because we
1052  * want to use a nondefault buffer access strategy. Since we aren't
1053  * going to delete any items, getting cleanup lock again is probably
1054  * overkill, but for consistency do that anyway.
1055  */
1056  buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1057  RBM_NORMAL, info->strategy);
1058  LockBufferForCleanup(buf);
1059  _bt_checkpage(rel, buf);
1060  _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1061  _bt_relbuf(rel, buf);
1062  }
1063 
1064  MemoryContextDelete(vstate.pagedelcontext);
1065 
1066  /*
1067  * If we found any recyclable pages (and recorded them in the FSM), then
1068  * forcibly update the upper-level FSM pages to ensure that searchers can
1069  * find them. It's possible that the pages were also found during
1070  * previous scans and so this is a waste of time, but it's cheap enough
1071  * relative to scanning the index that it shouldn't matter much, and
1072  * making sure that free pages are available sooner not later seems
1073  * worthwhile.
1074  *
1075  * Note that if no recyclable pages exist, we don't bother vacuuming the
1076  * FSM at all.
1077  */
1078  if (vstate.totFreePages > 0)
1079  IndexFreeSpaceMapVacuum(rel);
1080 
1081  /* update statistics */
1082  stats->num_pages = num_pages;
1083  stats->pages_free = vstate.totFreePages;
1084 
1085  if (oldestBtpoXact)
1086  *oldestBtpoXact = vstate.oldestBtpoXact;
1087 }
1088 
1089 /*
1090  * btvacuumpage --- VACUUM one page
1091  *
1092  * This processes a single page for btvacuumscan(). In some cases we
1093  * must go back and re-examine previously-scanned pages; this routine
1094  * recurses when necessary to handle that case.
1095  *
1096  * blkno is the page to process. orig_blkno is the highest block number
1097  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1098  * are recursing to re-examine a previous page).
1099  */
1100 static void
1101 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1102 {
1103  IndexVacuumInfo *info = vstate->info;
1104  IndexBulkDeleteResult *stats = vstate->stats;
1105  IndexBulkDeleteCallback callback = vstate->callback;
1106  void *callback_state = vstate->callback_state;
1107  Relation rel = info->index;
1108  bool delete_now;
1109  BlockNumber recurse_to;
1110  Buffer buf;
1111  Page page;
1112  BTPageOpaque opaque = NULL;
1113 
1114 restart:
1115  delete_now = false;
1116  recurse_to = P_NONE;
1117 
1118  /* call vacuum_delay_point while not holding any buffer lock */
1119  vacuum_delay_point();
1120 
1121  /*
1122  * We can't use _bt_getbuf() here because it always applies
1123  * _bt_checkpage(), which will barf on an all-zero page. We want to
1124  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1125  * buffer access strategy.
1126  */
1127  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1128  info->strategy);
1129  LockBuffer(buf, BT_READ);
1130  page = BufferGetPage(buf);
1131  if (!PageIsNew(page))
1132  {
1133  _bt_checkpage(rel, buf);
1134  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1135  }
1136 
1137  /*
1138  * If we are recursing, the only case we want to do anything with is a
1139  * live leaf page having the current vacuum cycle ID. Any other state
1140  * implies we already saw the page (eg, deleted it as being empty).
1141  */
1142  if (blkno != orig_blkno)
1143  {
1144  if (_bt_page_recyclable(page) ||
1145  P_IGNORE(opaque) ||
1146  !P_ISLEAF(opaque) ||
1147  opaque->btpo_cycleid != vstate->cycleid)
1148  {
1149  _bt_relbuf(rel, buf);
1150  return;
1151  }
1152  }
1153 
1154  /* Page is valid, see what to do with it */
1155  if (_bt_page_recyclable(page))
1156  {
1157  /* Okay to recycle this page */
1158  RecordFreeIndexPage(rel, blkno);
1159  vstate->totFreePages++;
1160  stats->pages_deleted++;
1161  }
1162  else if (P_ISDELETED(opaque))
1163  {
1164  /* Already deleted, but can't recycle yet */
1165  stats->pages_deleted++;
1166 
1167  /* Update the oldest btpo.xact */
1168  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1169  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1170  vstate->oldestBtpoXact = opaque->btpo.xact;
1171  }
1172  else if (P_ISHALFDEAD(opaque))
1173  {
1174  /* Half-dead, try to delete */
1175  delete_now = true;
1176  }
1177  else if (P_ISLEAF(opaque))
1178  {
1179  OffsetNumber deletable[MaxOffsetNumber];
1180  int ndeletable;
1181  OffsetNumber offnum,
1182  minoff,
1183  maxoff;
1184 
1185  /*
1186  * Trade in the initial read lock for a super-exclusive write lock on
1187  * this page. We must get such a lock on every leaf page over the
1188  * course of the vacuum scan, whether or not it actually contains any
1189  * deletable tuples --- see nbtree/README.
1190  */
1191  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1192  LockBufferForCleanup(buf);
1193 
1194  /*
1195  * Remember highest leaf page number we've taken cleanup lock on; see
1196  * notes in btvacuumscan
1197  */
1198  if (blkno > vstate->lastBlockLocked)
1199  vstate->lastBlockLocked = blkno;
1200 
1201  /*
1202  * Check whether we need to recurse back to earlier pages. What we
1203  * are concerned about is a page split that happened since we started
1204  * the vacuum scan. If the split moved some tuples to a lower page
1205  * then we might have missed 'em. If so, set up for tail recursion.
1206  * (Must do this before possibly clearing btpo_cycleid below!)
1207  */
1208  if (vstate->cycleid != 0 &&
1209  opaque->btpo_cycleid == vstate->cycleid &&
1210  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1211  !P_RIGHTMOST(opaque) &&
1212  opaque->btpo_next < orig_blkno)
1213  recurse_to = opaque->btpo_next;
1214 
1215  /*
1216  * Scan over all items to see which ones need to be deleted according to the
1217  * callback function.
1218  */
1219  ndeletable = 0;
1220  minoff = P_FIRSTDATAKEY(opaque);
1221  maxoff = PageGetMaxOffsetNumber(page);
1222  if (callback)
1223  {
1224  for (offnum = minoff;
1225  offnum <= maxoff;
1226  offnum = OffsetNumberNext(offnum))
1227  {
1228  IndexTuple itup;
1229  ItemPointer htup;
1230 
1231  itup = (IndexTuple) PageGetItem(page,
1232  PageGetItemId(page, offnum));
1233  htup = &(itup->t_tid);
1234 
1235  /*
1236  * During Hot Standby we currently assume that
1237  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1238  * only true as long as the callback function depends only
1239  * upon whether the index tuple refers to heap tuples removed
1240  * in the initial heap scan. When vacuum starts it derives a
1241  * value of OldestXmin. Backends taking later snapshots could
1242  * have a RecentGlobalXmin with a later xid than the vacuum's
1243  * OldestXmin, so it is possible that row versions deleted
1244  * after OldestXmin could be marked as killed by other
1245  * backends. The callback function *could* look at the index
1246  * tuple state in isolation and decide to delete the index
1247  * tuple, though currently it does not. If it ever did, we
1248  * would need to reconsider whether XLOG_BTREE_VACUUM records
1249  * should cause conflicts. If they did cause conflicts they
1250  * would be fairly harsh conflicts, since we haven't yet
1251  * worked out a way to pass a useful value for
1252  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1253  * applies to *any* type of index that marks index tuples as
1254  * killed.
1255  */
1256  if (callback(htup, callback_state))
1257  deletable[ndeletable++] = offnum;
1258  }
1259  }
1260 
1261  /*
1262  * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1263  * call per page, so as to minimize WAL traffic.
1264  */
1265  if (ndeletable > 0)
1266  {
1267  /*
1268  * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1269  * all information to the replay code to allow it to get a cleanup
1270  * lock on all pages between the previous lastBlockVacuumed and
1271  * this page. This ensures that WAL replay locks all leaf pages at
1272  * some point, which is important should non-MVCC scans be
1273  * requested. This is currently unused on standby, but we record
1274  * it anyway, so that the WAL contains the required information.
1275  *
1276  * Since we can visit leaf pages out-of-order when recursing,
1277  * replay might end up locking such pages an extra time, but it
1278  * doesn't seem worth the amount of bookkeeping it'd take to avoid
1279  * that.
1280  */
1281  _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1282  vstate->lastBlockVacuumed);
1283 
1284  /*
1285  * Remember highest leaf page number we've issued a
1286  * XLOG_BTREE_VACUUM WAL record for.
1287  */
1288  if (blkno > vstate->lastBlockVacuumed)
1289  vstate->lastBlockVacuumed = blkno;
1290 
1291  stats->tuples_removed += ndeletable;
1292  /* must recompute maxoff */
1293  maxoff = PageGetMaxOffsetNumber(page);
1294  }
1295  else
1296  {
1297  /*
1298  * If the page has been split during this vacuum cycle, it seems
1299  * worth expending a write to clear btpo_cycleid even if we don't
1300  * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1301  * care of this.) This ensures we won't process the page again.
1302  *
1303  * We treat this like a hint-bit update because there's no need to
1304  * WAL-log it.
1305  */
1306  if (vstate->cycleid != 0 &&
1307  opaque->btpo_cycleid == vstate->cycleid)
1308  {
1309  opaque->btpo_cycleid = 0;
1310  MarkBufferDirtyHint(buf, true);
1311  }
1312  }
1313 
1314  /*
1315  * If it's now empty, try to delete; else count the live tuples. We
1316  * don't delete when recursing, though, to avoid putting entries into
1317  * freePages out-of-order (doesn't seem worth any extra code to handle
1318  * the case).
1319  */
1320  if (minoff > maxoff)
1321  delete_now = (blkno == orig_blkno);
1322  else
1323  stats->num_index_tuples += maxoff - minoff + 1;
1324  }
1325 
1326  if (delete_now)
1327  {
1328  MemoryContext oldcontext;
1329  int ndel;
1330 
1331  /* Run pagedel in a temp context to avoid memory leakage */
1332  MemoryContextReset(vstate->pagedelcontext);
1333  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1334 
1335  ndel = _bt_pagedel(rel, buf);
1336 
1337  /* count only this page, else may double-count parent */
1338  if (ndel)
1339  {
1340  stats->pages_deleted++;
1341  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1342  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1343  vstate->oldestBtpoXact = opaque->btpo.xact;
1344  }
1345 
1346  MemoryContextSwitchTo(oldcontext);
1347  /* pagedel released buffer, so we shouldn't */
1348  }
1349  else
1350  _bt_relbuf(rel, buf);
1351 
1352  /*
1353  * This is really tail recursion, but if the compiler is too stupid to
1354  * optimize it as such, we'd eat an uncomfortably large amount of stack
1355  * space per recursion level (due to the deletable[] array). A failure is
1356  * improbable since the number of levels isn't likely to be large ... but
1357  * just in case, let's hand-optimize into a loop.
1358  */
1359  if (recurse_to != P_NONE)
1360  {
1361  blkno = recurse_to;
1362  goto restart;
1363  }
1364 }
1365 
1366 /*
1367  * btcanreturn() -- Check whether btree indexes support index-only scans.
1368  *
1369  * btrees always do, so this is trivial.
1370  */
1371 bool
1372 btcanreturn(Relation index, int attno)
1373 {
1374  return true;
1375 }