nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39 
40 
41 /*
42  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
43  *
44  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
45  * a new page; others must wait.
46  *
47  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
48  * to a new page; some process can start doing that.
49  *
50  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
51  * We reach this state once for every distinct combination of array keys.
52  */
53 typedef enum
54 {
55  BTPARALLEL_NOT_INITIALIZED,
56  BTPARALLEL_ADVANCING,
57  BTPARALLEL_IDLE,
58  BTPARALLEL_DONE
59 } BTPS_State;
60 
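A reading aid (added commentary, derived from the parallel-scan routines later in this file): the shared state moves between these values roughly as follows.

/*
 * btinitparallelscan(), btparallelrescan()  -> BTPARALLEL_NOT_INITIALIZED
 * _bt_parallel_seize()                      -> BTPARALLEL_ADVANCING
 * _bt_parallel_release()                    -> BTPARALLEL_IDLE
 * _bt_parallel_done()                       -> BTPARALLEL_DONE
 * _bt_parallel_advance_array_keys()         -> BTPARALLEL_NOT_INITIALIZED
 *                                              (only after DONE, when more
 *                                              array keys remain to be tried)
 */
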
61 /*
62  * BTParallelScanDescData contains btree specific shared information required
63  * for parallel scan.
64  */
65 typedef struct BTParallelScanDescData
66 {
67  BlockNumber btps_scanPage; /* latest or next page to be scanned */
68  BTPS_State btps_pageStatus; /* indicates whether next page is
69  * available for scan. see above for
70  * possible states of parallel scan. */
71  int btps_arrayKeyCount; /* count indicating number of array scan
72  * keys processed by parallel scan */
73  slock_t btps_mutex; /* protects above variables */
74  ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 } BTParallelScanDescData;
76 
77 typedef struct BTParallelScanDescData *BTParallelScanDesc;
78 
79 
80 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
81  IndexBulkDeleteCallback callback, void *callback_state,
82  BTCycleId cycleid);
83 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
84 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
85  IndexTuple posting,
86  OffsetNumber updatedoffset,
87  int *nremaining);
88 
89 
90 /*
91  * Btree handler function: return IndexAmRoutine with access method parameters
92  * and callbacks.
93  */
94 Datum
95 bthandler(PG_FUNCTION_ARGS)
96 {
97  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
98 
99  amroutine->amstrategies = BTMaxStrategyNumber;
100  amroutine->amsupport = BTNProcs;
101  amroutine->amoptsprocnum = BTOPTIONS_PROC;
102  amroutine->amcanorder = true;
103  amroutine->amcanorderbyop = false;
104  amroutine->amcanbackward = true;
105  amroutine->amcanunique = true;
106  amroutine->amcanmulticol = true;
107  amroutine->amoptionalkey = true;
108  amroutine->amsearcharray = true;
109  amroutine->amsearchnulls = true;
110  amroutine->amstorage = false;
111  amroutine->amclusterable = true;
112  amroutine->ampredlocks = true;
113  amroutine->amcanparallel = true;
114  amroutine->amcaninclude = true;
115  amroutine->amusemaintenanceworkmem = false;
116  amroutine->amparallelvacuumoptions =
117  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
118  amroutine->amkeytype = InvalidOid;
119 
120  amroutine->ambuild = btbuild;
121  amroutine->ambuildempty = btbuildempty;
122  amroutine->aminsert = btinsert;
123  amroutine->ambulkdelete = btbulkdelete;
124  amroutine->amvacuumcleanup = btvacuumcleanup;
125  amroutine->amcanreturn = btcanreturn;
126  amroutine->amcostestimate = btcostestimate;
127  amroutine->amoptions = btoptions;
128  amroutine->amproperty = btproperty;
129  amroutine->ambuildphasename = btbuildphasename;
130  amroutine->amvalidate = btvalidate;
131  amroutine->amadjustmembers = btadjustmembers;
132  amroutine->ambeginscan = btbeginscan;
133  amroutine->amrescan = btrescan;
134  amroutine->amgettuple = btgettuple;
135  amroutine->amgetbitmap = btgetbitmap;
136  amroutine->amendscan = btendscan;
137  amroutine->ammarkpos = btmarkpos;
138  amroutine->amrestrpos = btrestrpos;
139  amroutine->amestimateparallelscan = btestimateparallelscan;
140  amroutine->aminitparallelscan = btinitparallelscan;
141  amroutine->amparallelrescan = btparallelrescan;
142 
143  PG_RETURN_POINTER(amroutine);
144 }
145 
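For orientation, a minimal sketch (not part of nbtree.c; the helper name and the already-resolved handler OID parameter are assumptions) of how core code reaches these callbacks: it never calls the bt* functions directly, but fetches the IndexAmRoutine built by bthandler() and dispatches through its function pointers.

#include "postgres.h"
#include "access/amapi.h"

/* Hypothetical dispatch helper; error handling omitted. */
static IndexScanDesc
example_begin_btree_scan(Oid bthandler_oid, Relation indexrel, int nkeys)
{
	IndexAmRoutine *amroutine = GetIndexAmRoutine(bthandler_oid);

	/* for a btree index this ends up in btbeginscan() below */
	return amroutine->ambeginscan(indexrel, nkeys, 0);
}
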
146 /*
147  * btbuildempty() -- build an empty btree index in the initialization fork
148  */
149 void
150 btbuildempty(Relation index)
151 {
152  Page metapage;
153 
154  /* Construct metapage. */
155  metapage = (Page) palloc(BLCKSZ);
156  _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
157 
158  /*
159  * Write the page and log it. It might seem that an immediate sync would
160  * be sufficient to guarantee that the file exists on disk, but recovery
161  * itself might remove it while replaying, for example, an
162  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
163  * this even when wal_level=minimal.
164  */
165  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
166  smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
167  (char *) metapage, true);
168  log_newpage(&RelationGetSmgr(index)->smgr_rnode.node, INIT_FORKNUM,
169  BTREE_METAPAGE, metapage, true);
170 
171  /*
172  * An immediate sync is required even if we xlog'd the page, because the
173  * write did not go through shared_buffers and therefore a concurrent
174  * checkpoint may have moved the redo pointer past our xlog record.
175  */
176  smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
177 }
178 
179 /*
180  * btinsert() -- insert an index tuple into a btree.
181  *
182  * Descend the tree recursively, find the appropriate location for our
183  * new tuple, and put it there.
184  */
185 bool
186 btinsert(Relation rel, Datum *values, bool *isnull,
187  ItemPointer ht_ctid, Relation heapRel,
188  IndexUniqueCheck checkUnique,
189  bool indexUnchanged,
190  IndexInfo *indexInfo)
191 {
192  bool result;
193  IndexTuple itup;
194 
195  /* generate an index tuple */
196  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
197  itup->t_tid = *ht_ctid;
198 
199  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
200 
201  pfree(itup);
202 
203  return result;
204 }
205 
206 /*
207  * btgettuple() -- Get the next tuple in the scan.
208  */
209 bool
210 btgettuple(IndexScanDesc scan, ScanDirection dir)
211 {
212  BTScanOpaque so = (BTScanOpaque) scan->opaque;
213  bool res;
214 
215  /* btree indexes are never lossy */
216  scan->xs_recheck = false;
217 
218  /*
219  * If we have any array keys, initialize them during first call for a
220  * scan. We can't do this in btrescan because we don't know the scan
221  * direction at that time.
222  */
223  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
224  {
225  /* punt if we have any unsatisfiable array keys */
226  if (so->numArrayKeys < 0)
227  return false;
228 
229  _bt_start_array_keys(scan, dir);
230  }
231 
232  /* This loop handles advancing to the next array elements, if any */
233  do
234  {
235  /*
236  * If we've already initialized this scan, we can just advance it in
237  * the appropriate direction. If we haven't done so yet, we call
238  * _bt_first() to get the first item in the scan.
239  */
240  if (!BTScanPosIsValid(so->currPos))
241  res = _bt_first(scan, dir);
242  else
243  {
244  /*
245  * Check to see if we should kill the previously-fetched tuple.
246  */
247  if (scan->kill_prior_tuple)
248  {
249  /*
250  * Yes, remember it for later. (We'll deal with all such
251  * tuples at once right before leaving the index page.) The
252  * test for numKilled overrun is not just paranoia: if the
253  * caller reverses direction in the indexscan then the same
254  * item might get entered multiple times. It's not worth
255  * trying to optimize that, so we don't detect it, but instead
256  * just forget any excess entries.
257  */
258  if (so->killedItems == NULL)
259  so->killedItems = (int *)
260  palloc(MaxTIDsPerBTreePage * sizeof(int));
261  if (so->numKilled < MaxTIDsPerBTreePage)
262  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
263  }
264 
265  /*
266  * Now continue the scan.
267  */
268  res = _bt_next(scan, dir);
269  }
270 
271  /* If we have a tuple, return it ... */
272  if (res)
273  break;
274  /* ... otherwise see if we have more array keys to deal with */
275  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
276 
277  return res;
278 }
279 
280 /*
281  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
282  */
283 int64
284 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
285 {
286  BTScanOpaque so = (BTScanOpaque) scan->opaque;
287  int64 ntids = 0;
288  ItemPointer heapTid;
289 
290  /*
291  * If we have any array keys, initialize them.
292  */
293  if (so->numArrayKeys)
294  {
295  /* punt if we have any unsatisfiable array keys */
296  if (so->numArrayKeys < 0)
297  return ntids;
298 
299  _bt_start_array_keys(scan, ForwardScanDirection);
300  }
301 
302  /* This loop handles advancing to the next array elements, if any */
303  do
304  {
305  /* Fetch the first page & tuple */
306  if (_bt_first(scan, ForwardScanDirection))
307  {
308  /* Save tuple ID, and continue scanning */
309  heapTid = &scan->xs_heaptid;
310  tbm_add_tuples(tbm, heapTid, 1, false);
311  ntids++;
312 
313  for (;;)
314  {
315  /*
316  * Advance to next tuple within page. This is the same as the
317  * easy case in _bt_next().
318  */
319  if (++so->currPos.itemIndex > so->currPos.lastItem)
320  {
321  /* let _bt_next do the heavy lifting */
322  if (!_bt_next(scan, ForwardScanDirection))
323  break;
324  }
325 
326  /* Save tuple ID, and continue scanning */
327  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
328  tbm_add_tuples(tbm, heapTid, 1, false);
329  ntids++;
330  }
331  }
332  /* Now see if we have more array keys to deal with */
333  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
334 
335  return ntids;
336 }
337 
338 /*
339  * btbeginscan() -- start a scan on a btree index
340  */
341 IndexScanDesc
342 btbeginscan(Relation rel, int nkeys, int norderbys)
343 {
344  IndexScanDesc scan;
345  BTScanOpaque so;
346 
347  /* no order by operators allowed */
348  Assert(norderbys == 0);
349 
350  /* get the scan */
351  scan = RelationGetIndexScan(rel, nkeys, norderbys);
352 
353  /* allocate private workspace */
354  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
355  BTScanPosInvalidate(so->currPos);
356  BTScanPosInvalidate(so->markPos);
357  if (scan->numberOfKeys > 0)
358  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
359  else
360  so->keyData = NULL;
361 
362  so->arrayKeyData = NULL; /* assume no array keys for now */
363  so->numArrayKeys = 0;
364  so->arrayKeys = NULL;
365  so->arrayContext = NULL;
366 
367  so->killedItems = NULL; /* until needed */
368  so->numKilled = 0;
369 
370  /*
371  * We don't know yet whether the scan will be index-only, so we do not
372  * allocate the tuple workspace arrays until btrescan. However, we set up
373  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374  */
375  so->currTuples = so->markTuples = NULL;
376 
377  scan->xs_itupdesc = RelationGetDescr(rel);
378 
379  scan->opaque = so;
380 
381  return scan;
382 }
383 
384 /*
385  * btrescan() -- rescan an index relation
386  */
387 void
388 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
389  ScanKey orderbys, int norderbys)
390 {
391  BTScanOpaque so = (BTScanOpaque) scan->opaque;
392 
393  /* we aren't holding any read locks, but gotta drop the pins */
394  if (BTScanPosIsValid(so->currPos))
395  {
396  /* Before leaving current page, deal with any killed items */
397  if (so->numKilled > 0)
398  _bt_killitems(scan);
399  BTScanPosUnpinIfPinned(so->currPos);
400  BTScanPosInvalidate(so->currPos);
401  }
402 
403  so->markItemIndex = -1;
404  so->arrayKeyCount = 0;
405  BTScanPosUnpinIfPinned(so->markPos);
406  BTScanPosInvalidate(so->markPos);
407 
408  /*
409  * Allocate tuple workspace arrays, if needed for an index-only scan and
410  * not already done in a previous rescan call. To save on palloc
411  * overhead, both workspaces are allocated as one palloc block; only this
412  * function and btendscan know that.
413  *
414  * NOTE: this data structure also makes it safe to return data from a
415  * "name" column, even though btree name_ops uses an underlying storage
416  * datatype of cstring. The risk there is that "name" is supposed to be
417  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
418  * However, since we only return data out of tuples sitting in the
419  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
420  * data out of the markTuples array --- running off the end of memory for
421  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
422  * adding special-case treatment for name_ops elsewhere.
423  */
424  if (scan->xs_want_itup && so->currTuples == NULL)
425  {
426  so->currTuples = (char *) palloc(BLCKSZ * 2);
427  so->markTuples = so->currTuples + BLCKSZ;
428  }
429 
430  /*
431  * Reset the scan keys
432  */
433  if (scankey && scan->numberOfKeys > 0)
434  memmove(scan->keyData,
435  scankey,
436  scan->numberOfKeys * sizeof(ScanKeyData));
437  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
438 
439  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
440  _bt_preprocess_array_keys(scan);
441 }
442 
443 /*
444  * btendscan() -- close down a scan
445  */
446 void
447 btendscan(IndexScanDesc scan)
448 {
449  BTScanOpaque so = (BTScanOpaque) scan->opaque;
450 
451  /* we aren't holding any read locks, but gotta drop the pins */
452  if (BTScanPosIsValid(so->currPos))
453  {
454  /* Before leaving current page, deal with any killed items */
455  if (so->numKilled > 0)
456  _bt_killitems(scan);
457  BTScanPosUnpinIfPinned(so->currPos);
458  }
459 
460  so->markItemIndex = -1;
461  BTScanPosUnpinIfPinned(so->markPos);
462 
463  /* No need to invalidate positions, the RAM is about to be freed. */
464 
465  /* Release storage */
466  if (so->keyData != NULL)
467  pfree(so->keyData);
468  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
469  if (so->arrayContext != NULL)
470  MemoryContextDelete(so->arrayContext);
471  if (so->killedItems != NULL)
472  pfree(so->killedItems);
473  if (so->currTuples != NULL)
474  pfree(so->currTuples);
475  /* so->markTuples should not be pfree'd, see btrescan */
476  pfree(so);
477 }
478 
479 /*
480  * btmarkpos() -- save current scan position
481  */
482 void
483 btmarkpos(IndexScanDesc scan)
484 {
485  BTScanOpaque so = (BTScanOpaque) scan->opaque;
486 
487  /* There may be an old mark with a pin (but no lock). */
488  BTScanPosUnpinIfPinned(so->markPos);
489 
490  /*
491  * Just record the current itemIndex. If we later step to next page
492  * before releasing the marked position, _bt_steppage makes a full copy of
493  * the currPos struct in markPos. If (as often happens) the mark is moved
494  * before we leave the page, we don't have to do that work.
495  */
496  if (BTScanPosIsValid(so->currPos))
497  so->markItemIndex = so->currPos.itemIndex;
498  else
499  {
500  BTScanPosInvalidate(so->markPos);
501  so->markItemIndex = -1;
502  }
503 
504  /* Also record the current positions of any array keys */
505  if (so->numArrayKeys)
506  _bt_mark_array_keys(scan);
507 }
508 
509 /*
510  * btrestrpos() -- restore scan to last saved position
511  */
512 void
513 btrestrpos(IndexScanDesc scan)
514 {
515  BTScanOpaque so = (BTScanOpaque) scan->opaque;
516 
517  /* Restore the marked positions of any array keys */
518  if (so->numArrayKeys)
519  _bt_restore_array_keys(scan);
520 
521  if (so->markItemIndex >= 0)
522  {
523  /*
524  * The scan has never moved to a new page since the last mark. Just
525  * restore the itemIndex.
526  *
527  * NB: In this case we can't count on anything in so->markPos to be
528  * accurate.
529  */
530  so->currPos.itemIndex = so->markItemIndex;
531  }
532  else
533  {
534  /*
535  * The scan moved to a new page after last mark or restore, and we are
536  * now restoring to the marked page. We aren't holding any read
537  * locks, but if we're still holding the pin for the current position,
538  * we must drop it.
539  */
540  if (BTScanPosIsValid(so->currPos))
541  {
542  /* Before leaving current page, deal with any killed items */
543  if (so->numKilled > 0)
544  _bt_killitems(scan);
545  BTScanPosUnpinIfPinned(so->currPos);
546  }
547 
548  if (BTScanPosIsValid(so->markPos))
549  {
550  /* bump pin on mark buffer for assignment to current buffer */
551  if (BTScanPosIsPinned(so->markPos))
552  IncrBufferRefCount(so->markPos.buf);
553  memcpy(&so->currPos, &so->markPos,
554  offsetof(BTScanPosData, items[1]) +
555  so->markPos.lastItem * sizeof(BTScanPosItem));
556  if (so->currTuples)
557  memcpy(so->currTuples, so->markTuples,
558  so->markPos.nextTupleOffset);
559  }
560  else
561  BTScanPosInvalidate(so->currPos);
562  }
563 }
564 
565 /*
566  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
567  */
568 Size
569 btestimateparallelscan(void)
570 {
571  return sizeof(BTParallelScanDescData);
572 }
573 
574 /*
575  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
576  */
577 void
578 btinitparallelscan(void *target)
579 {
580  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
581 
582  SpinLockInit(&bt_target->btps_mutex);
583  bt_target->btps_scanPage = InvalidBlockNumber;
584  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
585  bt_target->btps_arrayKeyCount = 0;
586  ConditionVariableInit(&bt_target->btps_cv);
587 }
588 
589 /*
590  * btparallelrescan() -- reset parallel scan
591  */
592 void
593 btparallelrescan(IndexScanDesc scan)
594 {
595  BTParallelScanDesc btscan;
596  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
597 
598  Assert(parallel_scan);
599 
600  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
601  parallel_scan->ps_offset);
602 
603  /*
604  * In theory, we don't need to acquire the spinlock here, because there
605  * shouldn't be any other workers running at this point, but we do so for
606  * consistency.
607  */
608  SpinLockAcquire(&btscan->btps_mutex);
609  btscan->btps_scanPage = InvalidBlockNumber;
610  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
611  btscan->btps_arrayKeyCount = 0;
612  SpinLockRelease(&btscan->btps_mutex);
613 }
614 
615 /*
616  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
617  * page. Other scans must wait until we call _bt_parallel_release()
618  * or _bt_parallel_done().
619  *
620  * The return value is true if we successfully seized the scan and false
621  * if we did not. The latter case occurs if no pages remain for the current
622  * set of scankeys.
623  *
624  * If the return value is true, *pageno returns the next or current page
625  * of the scan (depending on the scan direction). An invalid block number
626  * means the scan hasn't yet started, and P_NONE means we've reached the end.
627  * The first time a participating process reaches the last page, it will return
628  * true and set *pageno to P_NONE; after that, further attempts to seize the
629  * scan will return false.
630  *
631  * Callers should ignore the value of pageno if the return value is false.
632  */
633 bool
634 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
635 {
636  BTScanOpaque so = (BTScanOpaque) scan->opaque;
637  BTPS_State pageStatus;
638  bool exit_loop = false;
639  bool status = true;
640  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
641  BTParallelScanDesc btscan;
642 
643  *pageno = P_NONE;
644 
645  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
646  parallel_scan->ps_offset);
647 
648  while (1)
649  {
650  SpinLockAcquire(&btscan->btps_mutex);
651  pageStatus = btscan->btps_pageStatus;
652 
653  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
654  {
655  /* Parallel scan has already advanced to a new set of scankeys. */
656  status = false;
657  }
658  else if (pageStatus == BTPARALLEL_DONE)
659  {
660  /*
661  * We're done with this set of scankeys. This may be the end, or
662  * there could be more sets to try.
663  */
664  status = false;
665  }
666  else if (pageStatus != BTPARALLEL_ADVANCING)
667  {
668  /*
669  * We have successfully seized control of the scan for the purpose
670  * of advancing it to a new page!
671  */
672  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
673  *pageno = btscan->btps_scanPage;
674  exit_loop = true;
675  }
676  SpinLockRelease(&btscan->btps_mutex);
677  if (exit_loop || !status)
678  break;
679  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
680  }
681  ConditionVariableCancelSleep();
682 
683  return status;
684 }
685 
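To make the seize/advance protocol concrete, here is a condensed, hypothetical sketch of the caller side; the real logic lives in _bt_first() and the page-stepping code in nbtsearch.c and additionally handles backward scans and error paths. Only the calls into this file's functions are real.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical wrapper illustrating one worker's use of the protocol. */
static bool
example_seize_next_page(IndexScanDesc scan, BlockNumber *blkno)
{
	if (!_bt_parallel_seize(scan, blkno))
		return false;			/* nothing left for this set of array keys */

	if (*blkno == P_NONE)
	{
		/* scan already reached the rightmost page; wake up any waiters */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * *blkno is InvalidBlockNumber if the scan has not started yet (the
	 * caller descends from the root); otherwise it is the page to read.
	 * Either way, the caller must eventually call _bt_parallel_release()
	 * with the next page to scan, or _bt_parallel_done() if there is none.
	 */
	return true;
}
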
686 /*
687  * _bt_parallel_release() -- Complete the process of advancing the scan to a
688  * new page. We now have the new value btps_scanPage; some other backend
689  * can now begin advancing the scan.
690  */
691 void
692 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
693 {
694  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
695  BTParallelScanDesc btscan;
696 
697  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
698  parallel_scan->ps_offset);
699 
700  SpinLockAcquire(&btscan->btps_mutex);
701  btscan->btps_scanPage = scan_page;
702  btscan->btps_pageStatus = BTPARALLEL_IDLE;
703  SpinLockRelease(&btscan->btps_mutex);
704  ConditionVariableSignal(&btscan->btps_cv);
705 }
706 
707 /*
708  * _bt_parallel_done() -- Mark the parallel scan as complete.
709  *
710  * When there are no pages left to scan, this function should be called to
711  * notify other workers. Otherwise, they might wait forever for the scan to
712  * advance to the next page.
713  */
714 void
715 _bt_parallel_done(IndexScanDesc scan)
716 {
717  BTScanOpaque so = (BTScanOpaque) scan->opaque;
718  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
719  BTParallelScanDesc btscan;
720  bool status_changed = false;
721 
722  /* Do nothing, for non-parallel scans */
723  if (parallel_scan == NULL)
724  return;
725 
726  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
727  parallel_scan->ps_offset);
728 
729  /*
730  * Mark the parallel scan as done for this combination of scan keys,
731  * unless some other process already did so. See also
732  * _bt_advance_array_keys.
733  */
734  SpinLockAcquire(&btscan->btps_mutex);
735  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
736  btscan->btps_pageStatus != BTPARALLEL_DONE)
737  {
738  btscan->btps_pageStatus = BTPARALLEL_DONE;
739  status_changed = true;
740  }
741  SpinLockRelease(&btscan->btps_mutex);
742 
743  /* wake up all the workers associated with this parallel scan */
744  if (status_changed)
745  ConditionVariableBroadcast(&btscan->btps_cv);
746 }
747 
748 /*
749  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
750  * keys.
751  *
752  * Updates the count of array keys processed for both local and parallel
753  * scans.
754  */
755 void
756 _bt_parallel_advance_array_keys(IndexScanDesc scan)
757 {
758  BTScanOpaque so = (BTScanOpaque) scan->opaque;
759  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
760  BTParallelScanDesc btscan;
761 
762  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
763  parallel_scan->ps_offset);
764 
765  so->arrayKeyCount++;
766  SpinLockAcquire(&btscan->btps_mutex);
767  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
768  {
769  btscan->btps_scanPage = InvalidBlockNumber;
770  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
771  btscan->btps_arrayKeyCount++;
772  }
773  SpinLockRelease(&btscan->btps_mutex);
774 }
775 
776 /*
777  * Bulk deletion of all index entries pointing to a set of heap tuples.
778  * The set of target tuples is specified via a callback routine that tells
779  * whether any given heap tuple (identified by ItemPointer) is being deleted.
780  *
781  * Result: a palloc'd struct containing statistical info for VACUUM displays.
782  */
783 IndexBulkDeleteResult *
784 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
785  IndexBulkDeleteCallback callback, void *callback_state)
786 {
787  Relation rel = info->index;
788  BTCycleId cycleid;
789 
790  /* allocate stats if first time through, else re-use existing struct */
791  if (stats == NULL)
792  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
793 
794  /* Establish the vacuum cycle ID to use for this scan */
795  /* The ENSURE stuff ensures we clean up shared memory on failure */
796  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
797  {
798  cycleid = _bt_start_vacuum(rel);
799 
800  btvacuumscan(info, stats, callback, callback_state, cycleid);
801  }
802  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
803  _bt_end_vacuum(rel);
804 
805  return stats;
806 }
807 
808 /*
809  * Post-VACUUM cleanup.
810  *
811  * Result: a palloc'd struct containing statistical info for VACUUM displays.
812  */
813 IndexBulkDeleteResult *
814 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
815 {
816  BlockNumber num_delpages;
817 
818  /* No-op in ANALYZE ONLY mode */
819  if (info->analyze_only)
820  return stats;
821 
822  /*
823  * If btbulkdelete was called, we need not do anything (we just maintain
824  * the information used within _bt_vacuum_needs_cleanup() by calling
825  * _bt_set_cleanup_info() below).
826  *
827  * If btbulkdelete was _not_ called, then we have a choice to make: we
828  * must decide whether or not a btvacuumscan() call is needed now (i.e.
829  * whether the ongoing VACUUM operation can entirely avoid a physical scan
830  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
831  * now.
832  */
833  if (stats == NULL)
834  {
835  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
836  if (!_bt_vacuum_needs_cleanup(info->index))
837  return NULL;
838 
839  /*
840  * Since we aren't going to actually delete any leaf items, there's no
841  * need to go through all the vacuum-cycle-ID pushups here.
842  *
843  * Posting list tuples are a source of inaccuracy for cleanup-only
844  * scans. btvacuumscan() will assume that the number of index tuples
845  * from each page can be used as num_index_tuples, even though
846  * num_index_tuples is supposed to represent the number of TIDs in the
847  * index. This naive approach can underestimate the number of tuples
848  * in the index significantly.
849  *
850  * We handle the problem by making num_index_tuples an estimate in
851  * cleanup-only case.
852  */
853  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
854  btvacuumscan(info, stats, NULL, NULL, 0);
855  stats->estimated_count = true;
856  }
857 
858  /*
859  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
860  *
861  * num_delpages is the number of deleted pages now in the index that were
862  * not safe to place in the FSM to be recycled just yet. num_delpages is
863  * greater than 0 only when _bt_pagedel() actually deleted pages during
864  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
865  * have failed to place any newly deleted pages in the FSM just moments
866  * ago. (Actually, there are edge cases where recycling of the current
867  * VACUUM's newly deleted pages does not even become safe by the time the
868  * next VACUUM comes around. See nbtree/README.)
869  */
870  Assert(stats->pages_deleted >= stats->pages_free);
871  num_delpages = stats->pages_deleted - stats->pages_free;
872  _bt_set_cleanup_info(info->index, num_delpages);
873 
874  /*
875  * It's quite possible for us to be fooled by concurrent page splits into
876  * double-counting some index tuples, so disbelieve any total that exceeds
877  * the underlying heap's count ... if we know that accurately. Otherwise
878  * this might just make matters worse.
879  */
880  if (!info->estimated_count)
881  {
882  if (stats->num_index_tuples > info->num_heap_tuples)
883  stats->num_index_tuples = info->num_heap_tuples;
884  }
885 
886  return stats;
887 }
888 
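A small worked example of the num_delpages bookkeeping above (the numbers are invented):

/*
 * stats->pages_deleted = 12, stats->pages_free = 9
 *   => num_delpages = 12 - 9 = 3
 *
 * Three deleted-but-not-yet-recyclable pages are recorded in the metapage
 * via _bt_set_cleanup_info(); a later _bt_vacuum_needs_cleanup() call reads
 * that value when deciding whether the next VACUUM must scan the index even
 * though btbulkdelete() was never called.
 */
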
889 /*
890  * btvacuumscan --- scan the index for VACUUMing purposes
891  *
892  * This combines the functions of looking for leaf tuples that are deletable
893  * according to the vacuum callback, looking for empty pages that can be
894  * deleted, and looking for old deleted pages that can be recycled. Both
895  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
896  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
897  *
898  * The caller is responsible for initially allocating/zeroing a stats struct
899  * and for obtaining a vacuum cycle ID if necessary.
900  */
901 static void
902 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
903  IndexBulkDeleteCallback callback, void *callback_state,
904  BTCycleId cycleid)
905 {
906  Relation rel = info->index;
907  BTVacState vstate;
908  BlockNumber num_pages;
909  BlockNumber scanblkno;
910  bool needLock;
911 
912  /*
913  * Reset fields that track information about the entire index now. This
914  * avoids double-counting in the case where a single VACUUM command
915  * requires multiple scans of the index.
916  *
917  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
918  * since they track information about the VACUUM command, and so must last
919  * across each call to btvacuumscan().
920  *
921  * (Note that pages_free is treated as state about the whole index, not
922  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
923  * calls are idempotent, and get repeated for the same deleted pages in
924  * some scenarios. The point for us is to track the number of recyclable
925  * pages in the index at the end of the VACUUM command.)
926  */
927  stats->num_pages = 0;
928  stats->num_index_tuples = 0;
929  stats->pages_deleted = 0;
930  stats->pages_free = 0;
931 
932  /* Set up info to pass down to btvacuumpage */
933  vstate.info = info;
934  vstate.stats = stats;
935  vstate.callback = callback;
936  vstate.callback_state = callback_state;
937  vstate.cycleid = cycleid;
938 
939  /* Create a temporary memory context to run _bt_pagedel in */
940  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
941  "_bt_pagedel",
942  ALLOCSET_DEFAULT_SIZES);
943 
944  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
945  vstate.bufsize = 0;
946  vstate.maxbufsize = 0;
947  vstate.pendingpages = NULL;
948  vstate.npendingpages = 0;
949  /* Consider applying _bt_pendingfsm_finalize optimization */
950  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
951 
952  /*
953  * The outer loop iterates over all index pages except the metapage, in
954  * physical order (we hope the kernel will cooperate in providing
955  * read-ahead for speed). It is critical that we visit all leaf pages,
956  * including ones added after we start the scan, else we might fail to
957  * delete some deletable tuples. Hence, we must repeatedly check the
958  * relation length. We must acquire the relation-extension lock while
959  * doing so to avoid a race condition: if someone else is extending the
960  * relation, there is a window where bufmgr/smgr have created a new
961  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
962  * we manage to scan such a page here, we'll improperly assume it can be
963  * recycled. Taking the lock synchronizes things enough to prevent a
964  * problem: either num_pages won't include the new page, or _bt_getbuf
965  * already has write lock on the buffer and it will be fully initialized
966  * before we can examine it. (See also vacuumlazy.c, which has the same
967  * issue.) Also, we need not worry if a page is added immediately after
968  * we look; the page splitting code already has write-lock on the left
969  * page before it adds a right page, so we must already have processed any
970  * tuples due to be moved into such a page.
971  *
972  * We can skip locking for new or temp relations, however, since no one
973  * else could be accessing them.
974  */
975  needLock = !RELATION_IS_LOCAL(rel);
976 
977  scanblkno = BTREE_METAPAGE + 1;
978  for (;;)
979  {
980  /* Get the current relation length */
981  if (needLock)
982  LockRelationForExtension(rel, ExclusiveLock);
983  num_pages = RelationGetNumberOfBlocks(rel);
984  if (needLock)
985  UnlockRelationForExtension(rel, ExclusiveLock);
986 
987  if (info->report_progress)
988  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
989  num_pages);
990 
991  /* Quit if we've scanned the whole relation */
992  if (scanblkno >= num_pages)
993  break;
994  /* Iterate over pages, then loop back to recheck length */
995  for (; scanblkno < num_pages; scanblkno++)
996  {
997  btvacuumpage(&vstate, scanblkno);
998  if (info->report_progress)
999  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1000  scanblkno);
1001  }
1002  }
1003 
1004  /* Set statistics num_pages field to final size of index */
1005  stats->num_pages = num_pages;
1006 
1007  MemoryContextDelete(vstate.pagedelcontext);
1008 
1009  /*
1010  * If there were any calls to _bt_pagedel() during scan of the index then
1011  * see if any of the resulting pages can be placed in the FSM now. When
1012  * it's not safe we'll have to leave it up to a future VACUUM operation.
1013  *
1014  * Finally, if we placed any pages in the FSM (either just now or during
1015  * the scan), forcibly update the upper-level FSM pages to ensure that
1016  * searchers can find them.
1017  */
1018  _bt_pendingfsm_finalize(rel, &vstate);
1019  if (stats->pages_free > 0)
1020  IndexFreeSpaceMapVacuum(rel);
1021 }
1022 
1023 /*
1024  * btvacuumpage --- VACUUM one page
1025  *
1026  * This processes a single page for btvacuumscan(). In some cases we must
1027  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1028  * a previous call here. This is how we handle page splits (that happened
1029  * after our cycleid was acquired) whose right half page happened to reuse
1030  * a block that we might have processed at some point before it was
1031  * recycled (i.e. before the page split).
1032  */
1033 static void
1034 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1035 {
1036  IndexVacuumInfo *info = vstate->info;
1037  IndexBulkDeleteResult *stats = vstate->stats;
1038  IndexBulkDeleteCallback callback = vstate->callback;
1039  void *callback_state = vstate->callback_state;
1040  Relation rel = info->index;
1041  bool attempt_pagedel;
1042  BlockNumber blkno,
1043  backtrack_to;
1044  Buffer buf;
1045  Page page;
1046  BTPageOpaque opaque;
1047 
1048  blkno = scanblkno;
1049 
1050 backtrack:
1051 
1052  attempt_pagedel = false;
1053  backtrack_to = P_NONE;
1054 
1055  /* call vacuum_delay_point while not holding any buffer lock */
1056  vacuum_delay_point();
1057 
1058  /*
1059  * We can't use _bt_getbuf() here because it always applies
1060  * _bt_checkpage(), which will barf on an all-zero page. We want to
1061  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1062  * buffer access strategy.
1063  */
1064  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1065  info->strategy);
1066  _bt_lockbuf(rel, buf, BT_READ);
1067  page = BufferGetPage(buf);
1068  opaque = NULL;
1069  if (!PageIsNew(page))
1070  {
1071  _bt_checkpage(rel, buf);
1072  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1073  }
1074 
1075  Assert(blkno <= scanblkno);
1076  if (blkno != scanblkno)
1077  {
1078  /*
1079  * We're backtracking.
1080  *
1081  * We followed a right link to a sibling leaf page (a page that
1082  * happens to be from a block located before scanblkno). The only
1083  * case we want to do anything with is a live leaf page having the
1084  * current vacuum cycle ID.
1085  *
1086  * The page had better be in a state that's consistent with what we
1087  * expect. Check for conditions that imply corruption in passing. It
1088  * can't be half-dead because only an interrupted VACUUM process can
1089  * leave pages in that state, so we'd definitely have dealt with it
1090  * back when the page was the scanblkno page (half-dead pages are
1091  * always marked fully deleted by _bt_pagedel()). This assumes that
1092  * there can be only one vacuum process running at a time.
1093  */
1094  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1095  {
1096  Assert(false);
1097  ereport(LOG,
1098  (errcode(ERRCODE_INDEX_CORRUPTED),
1099  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1100  blkno, scanblkno, RelationGetRelationName(rel))));
1101  _bt_relbuf(rel, buf);
1102  return;
1103  }
1104 
1105  /*
1106  * We may have already processed the page in an earlier call, when the
1107  * page was scanblkno. This happens when the leaf page split occurred
1108  * after the scan began, but before the right sibling page became the
1109  * scanblkno.
1110  *
1111  * Page may also have been deleted by current btvacuumpage() call,
1112  * since _bt_pagedel() sometimes deletes the right sibling page of
1113  * scanblkno in passing (it does so after we decided where to
1114  * backtrack to). We don't need to process this page as a deleted
1115  * page a second time now (in fact, it would be wrong to count it as a
1116  * deleted page in the bulk delete statistics a second time).
1117  */
1118  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1119  {
1120  /* Done with current scanblkno (and all lower split pages) */
1121  _bt_relbuf(rel, buf);
1122  return;
1123  }
1124  }
1125 
1126  if (!opaque || BTPageIsRecyclable(page))
1127  {
1128  /* Okay to recycle this page (which could be leaf or internal) */
1129  RecordFreeIndexPage(rel, blkno);
1130  stats->pages_deleted++;
1131  stats->pages_free++;
1132  }
1133  else if (P_ISDELETED(opaque))
1134  {
1135  /*
1136  * Already deleted page (which could be leaf or internal). Can't
1137  * recycle yet.
1138  */
1139  stats->pages_deleted++;
1140  }
1141  else if (P_ISHALFDEAD(opaque))
1142  {
1143  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1144  attempt_pagedel = true;
1145 
1146  /*
1147  * _bt_pagedel() will increment both pages_newly_deleted and
1148  * pages_deleted stats in all cases (barring corruption)
1149  */
1150  }
1151  else if (P_ISLEAF(opaque))
1152  else if (P_ISLEAF(opaque)) {
1153  OffsetNumber deletable[MaxIndexTuplesPerPage];
1154  int ndeletable;
1155  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1156  int nupdatable;
1157  OffsetNumber offnum,
1158  minoff,
1159  maxoff;
1160  int nhtidsdead,
1161  nhtidslive;
1162 
1163  /*
1164  * Trade in the initial read lock for a super-exclusive write lock on
1165  * this page. We must get such a lock on every leaf page over the
1166  * course of the vacuum scan, whether or not it actually contains any
1167  * deletable tuples --- see nbtree/README.
1168  */
1169  _bt_upgradelockbufcleanup(rel, buf);
1170 
1171  /*
1172  * Check whether we need to backtrack to earlier pages. What we are
1173  * concerned about is a page split that happened since we started the
1174  * vacuum scan. If the split moved tuples on the right half of the
1175  * split (i.e. the tuples that sort high) to a block that we already
1176  * passed over, then we might have missed the tuples. We need to
1177  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1178  * or deleting scanblkno page below!)
1179  */
1180  if (vstate->cycleid != 0 &&
1181  opaque->btpo_cycleid == vstate->cycleid &&
1182  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1183  !P_RIGHTMOST(opaque) &&
1184  opaque->btpo_next < scanblkno)
1185  backtrack_to = opaque->btpo_next;
1186 
1187  /*
1188  * When each VACUUM begins, it determines an OldestXmin cutoff value.
1189  * Tuples before the cutoff are removed by VACUUM. Scan over all
1190  * items to see which ones need to be deleted according to cutoff
1191  * point using callback.
1192  */
1193  ndeletable = 0;
1194  nupdatable = 0;
1195  minoff = P_FIRSTDATAKEY(opaque);
1196  maxoff = PageGetMaxOffsetNumber(page);
1197  nhtidsdead = 0;
1198  nhtidslive = 0;
1199  if (callback)
1200  {
1201  for (offnum = minoff;
1202  offnum <= maxoff;
1203  offnum = OffsetNumberNext(offnum))
1204  {
1205  IndexTuple itup;
1206 
1207  itup = (IndexTuple) PageGetItem(page,
1208  PageGetItemId(page, offnum));
1209 
1210  /*
1211  * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
1212  * records do not produce their own conflicts. This is safe
1213  * as long as the callback function only considers whether the
1214  * index tuple refers to pre-cutoff heap tuples that were
1215  * certainly already pruned away during VACUUM's initial heap
1216  * scan by the time we get here. (heapam's XLOG_HEAP2_PRUNE
1217  * records produce conflicts using a latestRemovedXid value
1218  * for the pointed-to heap tuples, so there is no need to
1219  * produce our own conflict now.)
1220  *
1221  * Backends with snapshots acquired after a VACUUM starts but
1222  * before it finishes could have visibility cutoff with a
1223  * later xid than VACUUM's OldestXmin cutoff. These backends
1224  * might happen to opportunistically mark some index tuples
1225  * LP_DEAD before we reach them, even though they may be after
1226  * our cutoff. We don't try to kill these "extra" index
1227  * tuples in _bt_delitems_vacuum(). This keeps things simple,
1228  * and allows us to always avoid generating our own conflicts.
1229  */
1230  Assert(!BTreeTupleIsPivot(itup));
1231  if (!BTreeTupleIsPosting(itup))
1232  {
1233  /* Regular tuple, standard table TID representation */
1234  if (callback(&itup->t_tid, callback_state))
1235  {
1236  deletable[ndeletable++] = offnum;
1237  nhtidsdead++;
1238  }
1239  else
1240  nhtidslive++;
1241  }
1242  else
1243  {
1244  BTVacuumPosting vacposting;
1245  int nremaining;
1246 
1247  /* Posting list tuple */
1248  vacposting = btreevacuumposting(vstate, itup, offnum,
1249  &nremaining);
1250  if (vacposting == NULL)
1251  {
1252  /*
1253  * All table TIDs from the posting tuple remain, so no
1254  * delete or update required
1255  */
1256  Assert(nremaining == BTreeTupleGetNPosting(itup));
1257  }
1258  else if (nremaining > 0)
1259  {
1260 
1261  /*
1262  * Store metadata about posting list tuple in
1263  * updatable array for entire page. Existing tuple
1264  * will be updated during the later call to
1265  * _bt_delitems_vacuum().
1266  */
1267  Assert(nremaining < BTreeTupleGetNPosting(itup));
1268  updatable[nupdatable++] = vacposting;
1269  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1270  }
1271  else
1272  {
1273  /*
1274  * All table TIDs from the posting list must be
1275  * deleted. We'll delete the index tuple completely
1276  * (no update required).
1277  */
1278  Assert(nremaining == 0);
1279  deletable[ndeletable++] = offnum;
1280  nhtidsdead += BTreeTupleGetNPosting(itup);
1281  pfree(vacposting);
1282  }
1283 
1284  nhtidslive += nremaining;
1285  }
1286  }
1287  }
1288 
1289  /*
1290  * Apply any needed deletes or updates. We issue just one
1291  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1292  */
1293  if (ndeletable > 0 || nupdatable > 0)
1294  {
1295  Assert(nhtidsdead >= ndeletable + nupdatable);
1296  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1297  nupdatable);
1298 
1299  stats->tuples_removed += nhtidsdead;
1300  /* must recompute maxoff */
1301  maxoff = PageGetMaxOffsetNumber(page);
1302 
1303  /* can't leak memory here */
1304  for (int i = 0; i < nupdatable; i++)
1305  pfree(updatable[i]);
1306  }
1307  else
1308  {
1309  /*
1310  * If the leaf page has been split during this vacuum cycle, it
1311  * seems worth expending a write to clear btpo_cycleid even if we
1312  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1313  * takes care of this.) This ensures we won't process the page
1314  * again.
1315  *
1316  * We treat this like a hint-bit update because there's no need to
1317  * WAL-log it.
1318  */
1319  Assert(nhtidsdead == 0);
1320  if (vstate->cycleid != 0 &&
1321  opaque->btpo_cycleid == vstate->cycleid)
1322  {
1323  opaque->btpo_cycleid = 0;
1324  MarkBufferDirtyHint(buf, true);
1325  }
1326  }
1327 
1328  /*
1329  * If the leaf page is now empty, try to delete it; else count the
1330  * live tuples (live table TIDs in posting lists are counted as
1331  * separate live tuples). We don't delete when backtracking, though,
1332  * since that would require teaching _bt_pagedel() about backtracking
1333  * (doesn't seem worth adding more complexity to deal with that).
1334  *
1335  * We don't count the number of live TIDs during cleanup-only calls to
1336  * btvacuumscan (i.e. when callback is not set). We count the number
1337  * of index tuples directly instead. This avoids the expense of
1338  * directly examining all of the tuples on each page. VACUUM will
1339  * treat num_index_tuples as an estimate in cleanup-only case, so it
1340  * doesn't matter that this underestimates num_index_tuples
1341  * significantly in some cases.
1342  */
1343  if (minoff > maxoff)
1344  attempt_pagedel = (blkno == scanblkno);
1345  else if (callback)
1346  stats->num_index_tuples += nhtidslive;
1347  else
1348  stats->num_index_tuples += maxoff - minoff + 1;
1349 
1350  Assert(!attempt_pagedel || nhtidslive == 0);
1351  }
1352 
1353  if (attempt_pagedel)
1354  {
1355  MemoryContext oldcontext;
1356 
1357  /* Run pagedel in a temp context to avoid memory leakage */
1358  MemoryContextReset(vstate->pagedelcontext);
1359  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1360 
1361  /*
1362  * _bt_pagedel maintains the bulk delete stats on our behalf;
1363  * pages_newly_deleted and pages_deleted are likely to be incremented
1364  * during call
1365  */
1366  Assert(blkno == scanblkno);
1367  _bt_pagedel(rel, buf, vstate);
1368 
1369  MemoryContextSwitchTo(oldcontext);
1370  /* pagedel released buffer, so we shouldn't */
1371  }
1372  else
1373  _bt_relbuf(rel, buf);
1374 
1375  if (backtrack_to != P_NONE)
1376  {
1377  blkno = backtrack_to;
1378  goto backtrack;
1379  }
1380 }
1381 
1382 /*
1383  * btreevacuumposting --- determine TIDs still needed in posting list
1384  *
1385  * Returns metadata describing how to build replacement tuple without the TIDs
1386  * that VACUUM needs to delete. Returned value is NULL in the common case
1387  * where no changes are needed to caller's posting list tuple (we avoid
1388  * allocating memory here as an optimization).
1389  *
1390  * The number of TIDs that should remain in the posting list tuple is set for
1391  * caller in *nremaining.
1392  */
1393 static BTVacuumPosting
1394 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1395  OffsetNumber updatedoffset, int *nremaining)
1396 {
1397  int live = 0;
1398  int nitem = BTreeTupleGetNPosting(posting);
1399  ItemPointer items = BTreeTupleGetPosting(posting);
1400  BTVacuumPosting vacposting = NULL;
1401 
1402  for (int i = 0; i < nitem; i++)
1403  {
1404  if (!vstate->callback(items + i, vstate->callback_state))
1405  {
1406  /* Live table TID */
1407  live++;
1408  }
1409  else if (vacposting == NULL)
1410  {
1411  /*
1412  * First dead table TID encountered.
1413  *
1414  * It's now clear that we need to delete one or more dead table
1415  * TIDs, so start maintaining metadata describing how to update
1416  * existing posting list tuple.
1417  */
1418  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1419  nitem * sizeof(uint16));
1420 
1421  vacposting->itup = posting;
1422  vacposting->updatedoffset = updatedoffset;
1423  vacposting->ndeletedtids = 0;
1424  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1425  }
1426  else
1427  {
1428  /* Second or subsequent dead table TID */
1429  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1430  }
1431  }
1432 
1433  *nremaining = live;
1434  return vacposting;
1435 }
1436 
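A worked example of the return convention (illustrative, not from the source): suppose the posting list holds five TIDs and the callback reports the TIDs at 0-based positions 1 and 3 as dead. The function then hands back the equivalent of the following, and btvacuumpage() treats the tuple as needing an update rather than deletion:

/*
 * vacposting->itup          = posting       (original posting list tuple)
 * vacposting->updatedoffset = updatedoffset (its offset number on the page)
 * vacposting->ndeletedtids  = 2
 * vacposting->deletetids[]  = {1, 3}
 * *nremaining               = 3             (five TIDs minus two dead ones)
 *
 * With no dead TIDs the function returns NULL; with all five dead it still
 * returns a filled-in struct, but *nremaining == 0 makes the caller delete
 * the whole index tuple and pfree() the struct instead.
 */
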
1437 /*
1438  * btcanreturn() -- Check whether btree indexes support index-only scans.
1439  *
1440  * btrees always do, so this is trivial.
1441  */
1442 bool
1443 btcanreturn(Relation index, int attno)
1444 {
1445  return true;
1446 }
Definition: nbtree.h:342