nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39 
40 
41 /* Working state needed by btvacuumpage */
42 typedef struct
43 {
44  IndexVacuumInfo *info;
45  IndexBulkDeleteResult *stats;
46  IndexBulkDeleteCallback callback;
47  void *callback_state;
48  BTCycleId cycleid;
49  BlockNumber totFreePages; /* true total # of free pages */
50  TransactionId oldestBtpoXact;
51  MemoryContext pagedelcontext;
52 } BTVacState;
53 
54 /*
55  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
56  *
57  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
58  * a new page; others must wait.
59  *
60  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
61  * to a new page; some process can start doing that.
62  *
63  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
64  * We reach this state once for every distinct combination of array keys.
65  */
66 typedef enum
67 {
68  BTPARALLEL_NOT_INITIALIZED,
69  BTPARALLEL_ADVANCING,
70  BTPARALLEL_IDLE,
71  BTPARALLEL_DONE
72 } BTPS_State;
73 
74 /*
75  * BTParallelScanDescData contains btree specific shared information required
76  * for parallel scan.
77  */
78 typedef struct BTParallelScanDescData
79 {
80  BlockNumber btps_scanPage; /* latest or next page to be scanned */
81  BTPS_State btps_pageStatus; /* indicates whether next page is
82  * available for scan. see above for
83  * possible states of parallel scan. */
84  int btps_arrayKeyCount; /* count indicating number of array scan
85  * keys processed by parallel scan */
86  slock_t btps_mutex; /* protects above variables */
87  ConditionVariable btps_cv; /* used to synchronize parallel scan */
88 } BTParallelScanDescData;
89 
90 typedef struct BTParallelScanDescData *BTParallelScanDesc;
91 
92 
93 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
94  IndexBulkDeleteCallback callback, void *callback_state,
95  BTCycleId cycleid);
96 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
97 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
98  IndexTuple posting,
99  OffsetNumber updatedoffset,
100  int *nremaining);
101 
102 
103 /*
104  * Btree handler function: return IndexAmRoutine with access method parameters
105  * and callbacks.
106  */
107 Datum
108 bthandler(PG_FUNCTION_ARGS)
109 {
110  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
111 
112  amroutine->amstrategies = BTMaxStrategyNumber;
113  amroutine->amsupport = BTNProcs;
114  amroutine->amoptsprocnum = BTOPTIONS_PROC;
115  amroutine->amcanorder = true;
116  amroutine->amcanorderbyop = false;
117  amroutine->amcanbackward = true;
118  amroutine->amcanunique = true;
119  amroutine->amcanmulticol = true;
120  amroutine->amoptionalkey = true;
121  amroutine->amsearcharray = true;
122  amroutine->amsearchnulls = true;
123  amroutine->amstorage = false;
124  amroutine->amclusterable = true;
125  amroutine->ampredlocks = true;
126  amroutine->amcanparallel = true;
127  amroutine->amcaninclude = true;
128  amroutine->amusemaintenanceworkmem = false;
129  amroutine->amparallelvacuumoptions =
130  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
131  amroutine->amkeytype = InvalidOid;
132 
133  amroutine->ambuild = btbuild;
134  amroutine->ambuildempty = btbuildempty;
135  amroutine->aminsert = btinsert;
136  amroutine->ambulkdelete = btbulkdelete;
137  amroutine->amvacuumcleanup = btvacuumcleanup;
138  amroutine->amcanreturn = btcanreturn;
139  amroutine->amcostestimate = btcostestimate;
140  amroutine->amoptions = btoptions;
141  amroutine->amproperty = btproperty;
142  amroutine->ambuildphasename = btbuildphasename;
143  amroutine->amvalidate = btvalidate;
144  amroutine->ambeginscan = btbeginscan;
145  amroutine->amrescan = btrescan;
146  amroutine->amgettuple = btgettuple;
147  amroutine->amgetbitmap = btgetbitmap;
148  amroutine->amendscan = btendscan;
149  amroutine->ammarkpos = btmarkpos;
150  amroutine->amrestrpos = btrestrpos;
151  amroutine->amestimateparallelscan = btestimateparallelscan;
152  amroutine->aminitparallelscan = btinitparallelscan;
153  amroutine->amparallelrescan = btparallelrescan;
154 
155  PG_RETURN_POINTER(amroutine);
156 }
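/*
 * For illustration: an index AM handler such as bthandler is what the
 * HANDLER clause of a CREATE ACCESS METHOD command points at, e.g.
 *
 *     CREATE ACCESS METHOD mybtree TYPE INDEX HANDLER bthandler;
 *
 * ("mybtree" is a hypothetical name used only for this sketch; the built-in
 * btree AM is registered directly in the pg_am catalog rather than created
 * this way.)
 */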
157 
158 /*
159  * btbuildempty() -- build an empty btree index in the initialization fork
160  */
161 void
162 btbuildempty(Relation index)
163 {
164  Page metapage;
165 
166  /* Construct metapage. */
167  metapage = (Page) palloc(BLCKSZ);
168  _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
169 
170  /*
171  * Write the page and log it. It might seem that an immediate sync would
172  * be sufficient to guarantee that the file exists on disk, but recovery
173  * itself might remove it while replaying, for example, an
174  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
175  * this even when wal_level=minimal.
176  */
177  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
178  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
179  (char *) metapage, true);
180  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
181  BTREE_METAPAGE, metapage, true);
182 
183  /*
184  * An immediate sync is required even if we xlog'd the page, because the
185  * write did not go through shared_buffers and therefore a concurrent
186  * checkpoint may have moved the redo pointer past our xlog record.
187  */
188  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
189 }
190 
191 /*
192  * btinsert() -- insert an index tuple into a btree.
193  *
194  * Descend the tree recursively, find the appropriate location for our
195  * new tuple, and put it there.
196  */
197 bool
198 btinsert(Relation rel, Datum *values, bool *isnull,
199  ItemPointer ht_ctid, Relation heapRel,
200  IndexUniqueCheck checkUnique,
201  IndexInfo *indexInfo)
202 {
203  bool result;
204  IndexTuple itup;
205 
206  /* generate an index tuple */
207  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
208  itup->t_tid = *ht_ctid;
209 
210  result = _bt_doinsert(rel, itup, checkUnique, heapRel);
211 
212  pfree(itup);
213 
214  return result;
215 }
216 
217 /*
218  * btgettuple() -- Get the next tuple in the scan.
219  */
220 bool
221 btgettuple(IndexScanDesc scan, ScanDirection dir)
222 {
223  BTScanOpaque so = (BTScanOpaque) scan->opaque;
224  bool res;
225 
226  /* btree indexes are never lossy */
227  scan->xs_recheck = false;
228 
229  /*
230  * If we have any array keys, initialize them during first call for a
231  * scan. We can't do this in btrescan because we don't know the scan
232  * direction at that time.
233  */
234  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
235  {
236  /* punt if we have any unsatisfiable array keys */
237  if (so->numArrayKeys < 0)
238  return false;
239 
240  _bt_start_array_keys(scan, dir);
241  }
242 
243  /* This loop handles advancing to the next array elements, if any */
244  do
245  {
246  /*
247  * If we've already initialized this scan, we can just advance it in
248  * the appropriate direction. If we haven't done so yet, we call
249  * _bt_first() to get the first item in the scan.
250  */
251  if (!BTScanPosIsValid(so->currPos))
252  res = _bt_first(scan, dir);
253  else
254  {
255  /*
256  * Check to see if we should kill the previously-fetched tuple.
257  */
258  if (scan->kill_prior_tuple)
259  {
260  /*
261  * Yes, remember it for later. (We'll deal with all such
262  * tuples at once right before leaving the index page.) The
263  * test for numKilled overrun is not just paranoia: if the
264  * caller reverses direction in the indexscan then the same
265  * item might get entered multiple times. It's not worth
266  * trying to optimize that, so we don't detect it, but instead
267  * just forget any excess entries.
268  */
269  if (so->killedItems == NULL)
270  so->killedItems = (int *)
271  palloc(MaxTIDsPerBTreePage * sizeof(int));
272  if (so->numKilled < MaxTIDsPerBTreePage)
273  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
274  }
275 
276  /*
277  * Now continue the scan.
278  */
279  res = _bt_next(scan, dir);
280  }
281 
282  /* If we have a tuple, return it ... */
283  if (res)
284  break;
285  /* ... otherwise see if we have more array keys to deal with */
286  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
287 
288  return res;
289 }
290 
291 /*
292  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
293  */
294 int64
295 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
296 {
297  BTScanOpaque so = (BTScanOpaque) scan->opaque;
298  int64 ntids = 0;
299  ItemPointer heapTid;
300 
301  /*
302  * If we have any array keys, initialize them.
303  */
304  if (so->numArrayKeys)
305  {
306  /* punt if we have any unsatisfiable array keys */
307  if (so->numArrayKeys < 0)
308  return ntids;
309 
310  _bt_start_array_keys(scan, ForwardScanDirection);
311  }
312 
313  /* This loop handles advancing to the next array elements, if any */
314  do
315  {
316  /* Fetch the first page & tuple */
317  if (_bt_first(scan, ForwardScanDirection))
318  {
319  /* Save tuple ID, and continue scanning */
320  heapTid = &scan->xs_heaptid;
321  tbm_add_tuples(tbm, heapTid, 1, false);
322  ntids++;
323 
324  for (;;)
325  {
326  /*
327  * Advance to next tuple within page. This is the same as the
328  * easy case in _bt_next().
329  */
330  if (++so->currPos.itemIndex > so->currPos.lastItem)
331  {
332  /* let _bt_next do the heavy lifting */
333  if (!_bt_next(scan, ForwardScanDirection))
334  break;
335  }
336 
337  /* Save tuple ID, and continue scanning */
338  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
339  tbm_add_tuples(tbm, heapTid, 1, false);
340  ntids++;
341  }
342  }
343  /* Now see if we have more array keys to deal with */
344  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
345 
346  return ntids;
347 }
348 
349 /*
350  * btbeginscan() -- start a scan on a btree index
351  */
353 btbeginscan(Relation rel, int nkeys, int norderbys)
354 {
355  IndexScanDesc scan;
356  BTScanOpaque so;
357 
358  /* no order by operators allowed */
359  Assert(norderbys == 0);
360 
361  /* get the scan */
362  scan = RelationGetIndexScan(rel, nkeys, norderbys);
363 
364  /* allocate private workspace */
365  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
366  BTScanPosInvalidate(so->currPos);
367  BTScanPosInvalidate(so->markPos);
368  if (scan->numberOfKeys > 0)
369  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
370  else
371  so->keyData = NULL;
372 
373  so->arrayKeyData = NULL; /* assume no array keys for now */
374  so->numArrayKeys = 0;
375  so->arrayKeys = NULL;
376  so->arrayContext = NULL;
377 
378  so->killedItems = NULL; /* until needed */
379  so->numKilled = 0;
380 
381  /*
382  * We don't know yet whether the scan will be index-only, so we do not
383  * allocate the tuple workspace arrays until btrescan. However, we set up
384  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
385  */
386  so->currTuples = so->markTuples = NULL;
387 
388  scan->xs_itupdesc = RelationGetDescr(rel);
389 
390  scan->opaque = so;
391 
392  return scan;
393 }
394 
395 /*
396  * btrescan() -- rescan an index relation
397  */
398 void
399 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
400  ScanKey orderbys, int norderbys)
401 {
402  BTScanOpaque so = (BTScanOpaque) scan->opaque;
403 
404  /* we aren't holding any read locks, but gotta drop the pins */
405  if (BTScanPosIsValid(so->currPos))
406  {
407  /* Before leaving current page, deal with any killed items */
408  if (so->numKilled > 0)
409  _bt_killitems(scan);
410  BTScanPosUnpinIfPinned(so->currPos);
411  BTScanPosInvalidate(so->currPos);
412  }
413 
414  so->markItemIndex = -1;
415  so->arrayKeyCount = 0;
416  BTScanPosUnpinIfPinned(so->markPos);
417  BTScanPosInvalidate(so->markPos);
418 
419  /*
420  * Allocate tuple workspace arrays, if needed for an index-only scan and
421  * not already done in a previous rescan call. To save on palloc
422  * overhead, both workspaces are allocated as one palloc block; only this
423  * function and btendscan know that.
424  *
425  * NOTE: this data structure also makes it safe to return data from a
426  * "name" column, even though btree name_ops uses an underlying storage
427  * datatype of cstring. The risk there is that "name" is supposed to be
428  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
429  * However, since we only return data out of tuples sitting in the
430  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
431  * data out of the markTuples array --- running off the end of memory for
432  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
433  * adding special-case treatment for name_ops elsewhere.
434  */
435  if (scan->xs_want_itup && so->currTuples == NULL)
436  {
437  so->currTuples = (char *) palloc(BLCKSZ * 2);
438  so->markTuples = so->currTuples + BLCKSZ;
439  }
440 
441  /*
442  * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
443  * - vadim 05/05/97
444  */
445  if (scankey && scan->numberOfKeys > 0)
446  memmove(scan->keyData,
447  scankey,
448  scan->numberOfKeys * sizeof(ScanKeyData));
449  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
450 
451  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
452  _bt_preprocess_array_keys(scan);
453 }
454 
455 /*
456  * btendscan() -- close down a scan
457  */
458 void
459 btendscan(IndexScanDesc scan)
460 {
461  BTScanOpaque so = (BTScanOpaque) scan->opaque;
462 
463  /* we aren't holding any read locks, but gotta drop the pins */
464  if (BTScanPosIsValid(so->currPos))
465  {
466  /* Before leaving current page, deal with any killed items */
467  if (so->numKilled > 0)
468  _bt_killitems(scan);
469  BTScanPosUnpinIfPinned(so->currPos);
470  }
471 
472  so->markItemIndex = -1;
473  BTScanPosUnpinIfPinned(so->markPos);
474 
475  /* No need to invalidate positions, the RAM is about to be freed. */
476 
477  /* Release storage */
478  if (so->keyData != NULL)
479  pfree(so->keyData);
480  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
481  if (so->arrayContext != NULL)
482  MemoryContextDelete(so->arrayContext);
483  if (so->killedItems != NULL)
484  pfree(so->killedItems);
485  if (so->currTuples != NULL)
486  pfree(so->currTuples);
487  /* so->markTuples should not be pfree'd, see btrescan */
488  pfree(so);
489 }
490 
491 /*
492  * btmarkpos() -- save current scan position
493  */
494 void
495 btmarkpos(IndexScanDesc scan)
496 {
497  BTScanOpaque so = (BTScanOpaque) scan->opaque;
498 
499  /* There may be an old mark with a pin (but no lock). */
500  BTScanPosUnpinIfPinned(so->markPos);
501 
502  /*
503  * Just record the current itemIndex. If we later step to next page
504  * before releasing the marked position, _bt_steppage makes a full copy of
505  * the currPos struct in markPos. If (as often happens) the mark is moved
506  * before we leave the page, we don't have to do that work.
507  */
508  if (BTScanPosIsValid(so->currPos))
509  so->markItemIndex = so->currPos.itemIndex;
510  else
511  {
512  BTScanPosInvalidate(so->markPos);
513  so->markItemIndex = -1;
514  }
515 
516  /* Also record the current positions of any array keys */
517  if (so->numArrayKeys)
518  _bt_mark_array_keys(scan);
519 }
520 
521 /*
522  * btrestrpos() -- restore scan to last saved position
523  */
524 void
525 btrestrpos(IndexScanDesc scan)
526 {
527  BTScanOpaque so = (BTScanOpaque) scan->opaque;
528 
529  /* Restore the marked positions of any array keys */
530  if (so->numArrayKeys)
531  _bt_restore_array_keys(scan);
532 
533  if (so->markItemIndex >= 0)
534  {
535  /*
536  * The scan has never moved to a new page since the last mark. Just
537  * restore the itemIndex.
538  *
539  * NB: In this case we can't count on anything in so->markPos to be
540  * accurate.
541  */
542  so->currPos.itemIndex = so->markItemIndex;
543  }
544  else
545  {
546  /*
547  * The scan moved to a new page after last mark or restore, and we are
548  * now restoring to the marked page. We aren't holding any read
549  * locks, but if we're still holding the pin for the current position,
550  * we must drop it.
551  */
552  if (BTScanPosIsValid(so->currPos))
553  {
554  /* Before leaving current page, deal with any killed items */
555  if (so->numKilled > 0)
556  _bt_killitems(scan);
557  BTScanPosUnpinIfPinned(so->currPos);
558  }
559 
560  if (BTScanPosIsValid(so->markPos))
561  {
562  /* bump pin on mark buffer for assignment to current buffer */
563  if (BTScanPosIsPinned(so->markPos))
564  IncrBufferRefCount(so->markPos.buf);
565  memcpy(&so->currPos, &so->markPos,
566  offsetof(BTScanPosData, items[1]) +
567  so->markPos.lastItem * sizeof(BTScanPosItem));
568  if (so->currTuples)
569  memcpy(so->currTuples, so->markTuples,
570  so->markPos.nextTupleOffset);
571  }
572  else
573  BTScanPosInvalidate(so->currPos);
574  }
575 }
576 
577 /*
578  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
579  */
580 Size
581 btestimateparallelscan(void)
582 {
583  return sizeof(BTParallelScanDescData);
584 }
585 
586 /*
587  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
588  */
589 void
590 btinitparallelscan(void *target)
591 {
592  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
593 
594  SpinLockInit(&bt_target->btps_mutex);
595  bt_target->btps_scanPage = InvalidBlockNumber;
596  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
597  bt_target->btps_arrayKeyCount = 0;
598  ConditionVariableInit(&bt_target->btps_cv);
599 }
600 
601 /*
602  * btparallelrescan() -- reset parallel scan
603  */
604 void
605 btparallelrescan(IndexScanDesc scan)
606 {
607  BTParallelScanDesc btscan;
608  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
609 
610  Assert(parallel_scan);
611 
612  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
613  parallel_scan->ps_offset);
614 
615  /*
616  * In theory, we don't need to acquire the spinlock here, because there
617  * shouldn't be any other workers running at this point, but we do so for
618  * consistency.
619  */
620  SpinLockAcquire(&btscan->btps_mutex);
621  btscan->btps_scanPage = InvalidBlockNumber;
622  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
623  btscan->btps_arrayKeyCount = 0;
624  SpinLockRelease(&btscan->btps_mutex);
625 }
626 
627 /*
628  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
629  * page. Other scans must wait until we call _bt_parallel_release()
630  * or _bt_parallel_done().
631  *
632  * The return value is true if we successfully seized the scan and false
633  * if we did not. The latter case occurs if no pages remain for the current
634  * set of scankeys.
635  *
636  * If the return value is true, *pageno returns the next or current page
637  * of the scan (depending on the scan direction). An invalid block number
638  * means the scan hasn't yet started, and P_NONE means we've reached the end.
639  * The first time a participating process reaches the last page, it will return
640  * true and set *pageno to P_NONE; after that, further attempts to seize the
641  * scan will return false.
642  *
643  * Callers should ignore the value of pageno if the return value is false.
644  */
645 bool
646 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
647 {
648  BTScanOpaque so = (BTScanOpaque) scan->opaque;
649  BTPS_State pageStatus;
650  bool exit_loop = false;
651  bool status = true;
652  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
653  BTParallelScanDesc btscan;
654 
655  *pageno = P_NONE;
656 
657  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
658  parallel_scan->ps_offset);
659 
660  while (1)
661  {
662  SpinLockAcquire(&btscan->btps_mutex);
663  pageStatus = btscan->btps_pageStatus;
664 
665  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
666  {
667  /* Parallel scan has already advanced to a new set of scankeys. */
668  status = false;
669  }
670  else if (pageStatus == BTPARALLEL_DONE)
671  {
672  /*
673  * We're done with this set of scankeys. This may be the end, or
674  * there could be more sets to try.
675  */
676  status = false;
677  }
678  else if (pageStatus != BTPARALLEL_ADVANCING)
679  {
680  /*
681  * We have successfully seized control of the scan for the purpose
682  * of advancing it to a new page!
683  */
684  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
685  *pageno = btscan->btps_scanPage;
686  exit_loop = true;
687  }
688  SpinLockRelease(&btscan->btps_mutex);
689  if (exit_loop || !status)
690  break;
691  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
692  }
693  ConditionVariableCancelSleep();
694 
695  return status;
696 }
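/*
 * Illustrative calling pattern (a simplified sketch; the real callers live
 * in nbtsearch.c and also deal with array keys and scan direction):
 *
 *     BlockNumber next_scan_page;
 *
 *     if (!_bt_parallel_seize(scan, &next_scan_page))
 *         return false;            // no pages left for these scankeys
 *     if (next_scan_page == P_NONE)
 *     {
 *         _bt_parallel_done(scan); // tell other workers the scan is finished
 *         return false;
 *     }
 *     // ... read next_scan_page, then advertise its successor ...
 *     _bt_parallel_release(scan, opaque->btpo_next);
 */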
697 
698 /*
699  * _bt_parallel_release() -- Complete the process of advancing the scan to a
700  * new page. We now have the new value btps_scanPage; some other backend
701  * can now begin advancing the scan.
702  */
703 void
704 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
705 {
706  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
707  BTParallelScanDesc btscan;
708 
709  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
710  parallel_scan->ps_offset);
711 
712  SpinLockAcquire(&btscan->btps_mutex);
713  btscan->btps_scanPage = scan_page;
714  btscan->btps_pageStatus = BTPARALLEL_IDLE;
715  SpinLockRelease(&btscan->btps_mutex);
716  ConditionVariableSignal(&btscan->btps_cv);
717 }
718 
719 /*
720  * _bt_parallel_done() -- Mark the parallel scan as complete.
721  *
722  * When there are no pages left to scan, this function should be called to
723  * notify other workers. Otherwise, they might wait forever for the scan to
724  * advance to the next page.
725  */
726 void
727 _bt_parallel_done(IndexScanDesc scan)
728 {
729  BTScanOpaque so = (BTScanOpaque) scan->opaque;
730  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
731  BTParallelScanDesc btscan;
732  bool status_changed = false;
733 
734  /* Do nothing, for non-parallel scans */
735  if (parallel_scan == NULL)
736  return;
737 
738  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
739  parallel_scan->ps_offset);
740 
741  /*
742  * Mark the parallel scan as done for this combination of scan keys,
743  * unless some other process already did so. See also
744  * _bt_advance_array_keys.
745  */
746  SpinLockAcquire(&btscan->btps_mutex);
747  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
748  btscan->btps_pageStatus != BTPARALLEL_DONE)
749  {
750  btscan->btps_pageStatus = BTPARALLEL_DONE;
751  status_changed = true;
752  }
753  SpinLockRelease(&btscan->btps_mutex);
754 
755  /* wake up all the workers associated with this parallel scan */
756  if (status_changed)
757  ConditionVariableBroadcast(&btscan->btps_cv);
758 }
759 
760 /*
761  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
762  * keys.
763  *
764  * Updates the count of array keys processed for both local and parallel
765  * scans.
766  */
767 void
768 _bt_parallel_advance_array_keys(IndexScanDesc scan)
769 {
770  BTScanOpaque so = (BTScanOpaque) scan->opaque;
771  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
772  BTParallelScanDesc btscan;
773 
774  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
775  parallel_scan->ps_offset);
776 
777  so->arrayKeyCount++;
778  SpinLockAcquire(&btscan->btps_mutex);
779  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
780  {
781  btscan->btps_scanPage = InvalidBlockNumber;
782  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
783  btscan->btps_arrayKeyCount++;
784  }
785  SpinLockRelease(&btscan->btps_mutex);
786 }
787 
788 /*
789  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
790  *
791  * Called by btvacuumcleanup when btbulkdelete was never called because no
792  * tuples need to be deleted.
793  *
794  * When we return false, VACUUM can even skip the cleanup-only call to
795  * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
796  * all). Otherwise, a cleanup-only btvacuumscan call is required.
797  */
798 static bool
799 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
800 {
801  Buffer metabuf;
802  Page metapg;
803  BTMetaPageData *metad;
804  bool result = false;
805 
806  metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
807  metapg = BufferGetPage(metabuf);
808  metad = BTPageGetMeta(metapg);
809 
810  if (metad->btm_version < BTREE_NOVAC_VERSION)
811  {
812  /*
813  * Do cleanup if metapage needs upgrade, because we don't have
814  * cleanup-related meta-information yet.
815  */
816  result = true;
817  }
818  else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
819  TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
820  RecentGlobalXmin))
821  {
822  /*
823  * If any oldest btpo.xact from a previously deleted page in the index
824  * is older than RecentGlobalXmin, then at least one deleted page can
825  * be recycled -- don't skip cleanup.
826  */
827  result = true;
828  }
829  else
830  {
831  BTOptions *relopts;
832  float8 cleanup_scale_factor;
833  float8 prev_num_heap_tuples;
834 
835  /*
836  * If the table receives enough insertions and no cleanup has been
837  * performed, the index would appear to have stale statistics. If a scale
838  * factor is set, we avoid that by performing cleanup when the number of
839  * inserted tuples exceeds the vacuum_cleanup_index_scale_factor fraction
840  * of the original tuple count.
841  */
842  relopts = (BTOptions *) info->index->rd_options;
843  cleanup_scale_factor = (relopts &&
844  relopts->vacuum_cleanup_index_scale_factor >= 0)
845  ? relopts->vacuum_cleanup_index_scale_factor
846  : vacuum_cleanup_index_scale_factor;
847  prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
848 
849  if (cleanup_scale_factor <= 0 ||
850  prev_num_heap_tuples <= 0 ||
851  (info->num_heap_tuples - prev_num_heap_tuples) /
852  prev_num_heap_tuples >= cleanup_scale_factor)
853  result = true;
854  }
855 
856  _bt_relbuf(info->index, metabuf);
857  return result;
858 }
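/*
 * Worked example of the scale-factor test above (illustrative numbers only):
 * with vacuum_cleanup_index_scale_factor = 0.1 and a metapage recording
 * prev_num_heap_tuples = 1,000,000, a cleanup-only scan is triggered once
 * info->num_heap_tuples reaches 1,100,000, since
 * (1,100,000 - 1,000,000) / 1,000,000 = 0.1 >= 0.1.
 */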
859 
860 /*
861  * Bulk deletion of all index entries pointing to a set of heap tuples.
862  * The set of target tuples is specified via a callback routine that tells
863  * whether any given heap tuple (identified by ItemPointer) is being deleted.
864  *
865  * Result: a palloc'd struct containing statistical info for VACUUM displays.
866  */
867 IndexBulkDeleteResult *
868 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
869  IndexBulkDeleteCallback callback, void *callback_state)
870 {
871  Relation rel = info->index;
872  BTCycleId cycleid;
873 
874  /* allocate stats if first time through, else re-use existing struct */
875  if (stats == NULL)
876  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
877 
878  /* Establish the vacuum cycle ID to use for this scan */
879  /* The ENSURE stuff ensures we clean up shared memory on failure */
880  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
881  {
882  cycleid = _bt_start_vacuum(rel);
883 
884  btvacuumscan(info, stats, callback, callback_state, cycleid);
885  }
886  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
887  _bt_end_vacuum(rel);
888 
889  return stats;
890 }
891 
892 /*
893  * Post-VACUUM cleanup.
894  *
895  * Result: a palloc'd struct containing statistical info for VACUUM displays.
896  */
897 IndexBulkDeleteResult *
898 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
899 {
900  /* No-op in ANALYZE ONLY mode */
901  if (info->analyze_only)
902  return stats;
903 
904  /*
905  * If btbulkdelete was called, we need not do anything, just return the
906  * stats from the latest btbulkdelete call. If it wasn't called, we might
907  * still need to do a pass over the index, to recycle any newly-recyclable
908  * pages or to obtain index statistics. _bt_vacuum_needs_cleanup
909  * determines if either are needed.
910  *
911  * Since we aren't going to actually delete any leaf items, there's no
912  * need to go through all the vacuum-cycle-ID pushups.
913  */
914  if (stats == NULL)
915  {
916  /* Check if we need a cleanup */
917  if (!_bt_vacuum_needs_cleanup(info))
918  return NULL;
919 
920  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
921  btvacuumscan(info, stats, NULL, NULL, 0);
922  }
923 
924  /*
925  * It's quite possible for us to be fooled by concurrent page splits into
926  * double-counting some index tuples, so disbelieve any total that exceeds
927  * the underlying heap's count ... if we know that accurately. Otherwise
928  * this might just make matters worse.
929  */
930  if (!info->estimated_count)
931  {
932  if (stats->num_index_tuples > info->num_heap_tuples)
933  stats->num_index_tuples = info->num_heap_tuples;
934  }
935 
936  return stats;
937 }
938 
939 /*
940  * btvacuumscan --- scan the index for VACUUMing purposes
941  *
942  * This combines the functions of looking for leaf tuples that are deletable
943  * according to the vacuum callback, looking for empty pages that can be
944  * deleted, and looking for old deleted pages that can be recycled. Both
945  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
946  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
947  * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
948  * is maintained.
949  *
950  * The caller is responsible for initially allocating/zeroing a stats struct
951  * and for obtaining a vacuum cycle ID if necessary.
952  */
953 static void
954 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
955  IndexBulkDeleteCallback callback, void *callback_state,
956  BTCycleId cycleid)
957 {
958  Relation rel = info->index;
959  BTVacState vstate;
960  BlockNumber num_pages;
961  BlockNumber scanblkno;
962  bool needLock;
963 
964  /*
965  * Reset counts that will be incremented during the scan; needed in case
966  * of multiple scans during a single VACUUM command
967  */
968  stats->estimated_count = false;
969  stats->num_index_tuples = 0;
970  stats->pages_deleted = 0;
971 
972  /* Set up info to pass down to btvacuumpage */
973  vstate.info = info;
974  vstate.stats = stats;
975  vstate.callback = callback;
976  vstate.callback_state = callback_state;
977  vstate.cycleid = cycleid;
978  vstate.totFreePages = 0;
979  vstate.oldestBtpoXact = InvalidTransactionId;
980 
981  /* Create a temporary memory context to run _bt_pagedel in */
982  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
983  "_bt_pagedel",
984  ALLOCSET_DEFAULT_SIZES);
985 
986  /*
987  * The outer loop iterates over all index pages except the metapage, in
988  * physical order (we hope the kernel will cooperate in providing
989  * read-ahead for speed). It is critical that we visit all leaf pages,
990  * including ones added after we start the scan, else we might fail to
991  * delete some deletable tuples. Hence, we must repeatedly check the
992  * relation length. We must acquire the relation-extension lock while
993  * doing so to avoid a race condition: if someone else is extending the
994  * relation, there is a window where bufmgr/smgr have created a new
995  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
996  * we manage to scan such a page here, we'll improperly assume it can be
997  * recycled. Taking the lock synchronizes things enough to prevent a
998  * problem: either num_pages won't include the new page, or _bt_getbuf
999  * already has write lock on the buffer and it will be fully initialized
1000  * before we can examine it. (See also vacuumlazy.c, which has the same
1001  * issue.) Also, we need not worry if a page is added immediately after
1002  * we look; the page splitting code already has write-lock on the left
1003  * page before it adds a right page, so we must already have processed any
1004  * tuples due to be moved into such a page.
1005  *
1006  * We can skip locking for new or temp relations, however, since no one
1007  * else could be accessing them.
1008  */
1009  needLock = !RELATION_IS_LOCAL(rel);
1010 
1011  scanblkno = BTREE_METAPAGE + 1;
1012  for (;;)
1013  {
1014  /* Get the current relation length */
1015  if (needLock)
1016  LockRelationForExtension(rel, ExclusiveLock);
1017  num_pages = RelationGetNumberOfBlocks(rel);
1018  if (needLock)
1019  UnlockRelationForExtension(rel, ExclusiveLock);
1020 
1021  if (info->report_progress)
1022  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1023  num_pages);
1024 
1025  /* Quit if we've scanned the whole relation */
1026  if (scanblkno >= num_pages)
1027  break;
1028  /* Iterate over pages, then loop back to recheck length */
1029  for (; scanblkno < num_pages; scanblkno++)
1030  {
1031  btvacuumpage(&vstate, scanblkno);
1032  if (info->report_progress)
1033  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1034  scanblkno);
1035  }
1036  }
1037 
1038  MemoryContextDelete(vstate.pagedelcontext);
1039 
1040  /*
1041  * If we found any recyclable pages (and recorded them in the FSM), then
1042  * forcibly update the upper-level FSM pages to ensure that searchers can
1043  * find them. It's possible that the pages were also found during
1044  * previous scans and so this is a waste of time, but it's cheap enough
1045  * relative to scanning the index that it shouldn't matter much, and
1046  * making sure that free pages are available sooner not later seems
1047  * worthwhile.
1048  *
1049  * Note that if no recyclable pages exist, we don't bother vacuuming the
1050  * FSM at all.
1051  */
1052  if (vstate.totFreePages > 0)
1053  IndexFreeSpaceMapVacuum(rel);
1054 
1055  /*
1056  * Maintain the oldest btpo.xact and a count of the current number of heap
1057  * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
1058  *
1059  * The page with the oldest btpo.xact is typically a page deleted by this
1060  * VACUUM operation, since pages deleted by a previous VACUUM operation
1061  * tend to be placed in the FSM (by the current VACUUM operation) -- such
1062  * pages are not candidates to be the oldest btpo.xact. (Note that pages
1063  * placed in the FSM are reported as deleted pages in the bulk delete
1064  * statistics, despite not counting as deleted pages for the purposes of
1065  * determining the oldest btpo.xact.)
1066  */
1067  _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
1068  info->num_heap_tuples);
1069 
1070  /* update statistics */
1071  stats->num_pages = num_pages;
1072  stats->pages_free = vstate.totFreePages;
1073 }
1074 
1075 /*
1076  * btvacuumpage --- VACUUM one page
1077  *
1078  * This processes a single page for btvacuumscan(). In some cases we must
1079  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1080  * a previous call here. This is how we handle page splits (that happened
1081  * after our cycleid was acquired) whose right half page happened to reuse
1082  * a block that we might have processed at some point before it was
1083  * recycled (i.e. before the page split).
1084  */
1085 static void
1086 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1087 {
1088  IndexVacuumInfo *info = vstate->info;
1089  IndexBulkDeleteResult *stats = vstate->stats;
1090  IndexBulkDeleteCallback callback = vstate->callback;
1091  void *callback_state = vstate->callback_state;
1092  Relation rel = info->index;
1093  bool attempt_pagedel;
1094  BlockNumber blkno,
1095  backtrack_to;
1096  Buffer buf;
1097  Page page;
1098  BTPageOpaque opaque;
1099 
1100  blkno = scanblkno;
1101 
1102 backtrack:
1103 
1104  attempt_pagedel = false;
1105  backtrack_to = P_NONE;
1106 
1107  /* call vacuum_delay_point while not holding any buffer lock */
1108  vacuum_delay_point();
1109 
1110  /*
1111  * We can't use _bt_getbuf() here because it always applies
1112  * _bt_checkpage(), which will barf on an all-zero page. We want to
1113  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1114  * buffer access strategy.
1115  */
1116  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1117  info->strategy);
1118  LockBuffer(buf, BT_READ);
1119  page = BufferGetPage(buf);
1120  opaque = NULL;
1121  if (!PageIsNew(page))
1122  {
1123  _bt_checkpage(rel, buf);
1124  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1125  }
1126 
1127  Assert(blkno <= scanblkno);
1128  if (blkno != scanblkno)
1129  {
1130  /*
1131  * We're backtracking.
1132  *
1133  * We followed a right link to a sibling leaf page (a page that
1134  * happens to be from a block located before scanblkno). The only
1135  * case we want to do anything with is a live leaf page having the
1136  * current vacuum cycle ID.
1137  *
1138  * The page had better be in a state that's consistent with what we
1139  * expect. Check for conditions that imply corruption in passing. It
1140  * can't be half-dead because only an interrupted VACUUM process can
1141  * leave pages in that state, so we'd definitely have dealt with it
1142  * back when the page was the scanblkno page (half-dead pages are
1143  * always marked fully deleted by _bt_pagedel()). This assumes that
1144  * there can be only one vacuum process running at a time.
1145  */
1146  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1147  {
1148  Assert(false);
1149  ereport(LOG,
1150  (errcode(ERRCODE_INDEX_CORRUPTED),
1151  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1152  blkno, scanblkno, RelationGetRelationName(rel))));
1153  _bt_relbuf(rel, buf);
1154  return;
1155  }
1156 
1157  /*
1158  * We may have already processed the page in an earlier call, when the
1159  * page was scanblkno. This happens when the leaf page split occurred
1160  * after the scan began, but before the right sibling page became the
1161  * scanblkno.
1162  *
1163  * Page may also have been deleted by current btvacuumpage() call,
1164  * since _bt_pagedel() sometimes deletes the right sibling page of
1165  * scanblkno in passing (it does so after we decided where to
1166  * backtrack to). We don't need to process this page as a deleted
1167  * page a second time now (in fact, it would be wrong to count it as a
1168  * deleted page in the bulk delete statistics a second time).
1169  */
1170  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1171  {
1172  /* Done with current scanblkno (and all lower split pages) */
1173  _bt_relbuf(rel, buf);
1174  return;
1175  }
1176  }
1177 
1178  /* Page is valid, see what to do with it */
1179  if (_bt_page_recyclable(page))
1180  {
1181  /* Okay to recycle this page (which could be leaf or internal) */
1182  RecordFreeIndexPage(rel, blkno);
1183  vstate->totFreePages++;
1184  stats->pages_deleted++;
1185  }
1186  else if (P_ISDELETED(opaque))
1187  {
1188  /*
1189  * Already deleted page (which could be leaf or internal). Can't
1190  * recycle yet.
1191  */
1192  stats->pages_deleted++;
1193 
1194  /* Maintain the oldest btpo.xact */
1195  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1196  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1197  vstate->oldestBtpoXact = opaque->btpo.xact;
1198  }
1199  else if (P_ISHALFDEAD(opaque))
1200  {
1201  /*
1202  * Half-dead leaf page. Try to delete now. Might update
1203  * oldestBtpoXact and pages_deleted below.
1204  */
1205  attempt_pagedel = true;
1206  }
1207  else if (P_ISLEAF(opaque))
1208  {
1209  OffsetNumber deletable[MaxIndexTuplesPerPage];
1210  int ndeletable;
1211  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1212  int nupdatable;
1213  OffsetNumber offnum,
1214  minoff,
1215  maxoff;
1216  int nhtidsdead,
1217  nhtidslive;
1218 
1219  /*
1220  * Trade in the initial read lock for a super-exclusive write lock on
1221  * this page. We must get such a lock on every leaf page over the
1222  * course of the vacuum scan, whether or not it actually contains any
1223  * deletable tuples --- see nbtree/README.
1224  */
1225  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1226  LockBufferForCleanup(buf);
1227 
1228  /*
1229  * Check whether we need to backtrack to earlier pages. What we are
1230  * concerned about is a page split that happened since we started the
1231  * vacuum scan. If the split moved tuples on the right half of the
1232  * split (i.e. the tuples that sort high) to a block that we already
1233  * passed over, then we might have missed the tuples. We need to
1234  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1235  * or deleting scanblkno page below!)
1236  */
1237  if (vstate->cycleid != 0 &&
1238  opaque->btpo_cycleid == vstate->cycleid &&
1239  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1240  !P_RIGHTMOST(opaque) &&
1241  opaque->btpo_next < scanblkno)
1242  backtrack_to = opaque->btpo_next;
1243 
1244  /*
1245  * When each VACUUM begins, it determines an OldestXmin cutoff value.
1246  * Tuples before the cutoff are removed by VACUUM. Scan over all
1247  * items to see which ones need to be deleted according to cutoff
1248  * point using callback.
1249  */
1250  ndeletable = 0;
1251  nupdatable = 0;
1252  minoff = P_FIRSTDATAKEY(opaque);
1253  maxoff = PageGetMaxOffsetNumber(page);
1254  nhtidsdead = 0;
1255  nhtidslive = 0;
1256  if (callback)
1257  {
1258  for (offnum = minoff;
1259  offnum <= maxoff;
1260  offnum = OffsetNumberNext(offnum))
1261  {
1262  IndexTuple itup;
1263 
1264  itup = (IndexTuple) PageGetItem(page,
1265  PageGetItemId(page, offnum));
1266 
1267  /*
1268  * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
1269  * records do not produce their own conflicts. This is safe
1270  * as long as the callback function only considers whether the
1271  * index tuple refers to pre-cutoff heap tuples that were
1272  * certainly already pruned away during VACUUM's initial heap
1273  * scan by the time we get here. (XLOG_HEAP2_CLEANUP_INFO
1274  * records produce conflicts using a latestRemovedXid value
1275  * for the entire VACUUM, so there is no need to produce our
1276  * own conflict now.)
1277  *
1278  * Backends with snapshots acquired after a VACUUM starts but
1279  * before it finishes could have a RecentGlobalXmin with a
1280  * later xid than the VACUUM's OldestXmin cutoff. These
1281  * backends might happen to opportunistically mark some index
1282  * tuples LP_DEAD before we reach them, even though they may
1283  * be after our cutoff. We don't try to kill these "extra"
1284  * index tuples in _bt_delitems_vacuum(). This keeps things
1285  * simple, and allows us to always avoid generating our own
1286  * conflicts.
1287  */
1288  Assert(!BTreeTupleIsPivot(itup));
1289  if (!BTreeTupleIsPosting(itup))
1290  {
1291  /* Regular tuple, standard table TID representation */
1292  if (callback(&itup->t_tid, callback_state))
1293  {
1294  deletable[ndeletable++] = offnum;
1295  nhtidsdead++;
1296  }
1297  else
1298  nhtidslive++;
1299  }
1300  else
1301  {
1302  BTVacuumPosting vacposting;
1303  int nremaining;
1304 
1305  /* Posting list tuple */
1306  vacposting = btreevacuumposting(vstate, itup, offnum,
1307  &nremaining);
1308  if (vacposting == NULL)
1309  {
1310  /*
1311  * All table TIDs from the posting tuple remain, so no
1312  * delete or update required
1313  */
1314  Assert(nremaining == BTreeTupleGetNPosting(itup));
1315  }
1316  else if (nremaining > 0)
1317  {
1318 
1319  /*
1320  * Store metadata about posting list tuple in
1321  * updatable array for entire page. Existing tuple
1322  * will be updated during the later call to
1323  * _bt_delitems_vacuum().
1324  */
1325  Assert(nremaining < BTreeTupleGetNPosting(itup));
1326  updatable[nupdatable++] = vacposting;
1327  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1328  }
1329  else
1330  {
1331  /*
1332  * All table TIDs from the posting list must be
1333  * deleted. We'll delete the index tuple completely
1334  * (no update required).
1335  */
1336  Assert(nremaining == 0);
1337  deletable[ndeletable++] = offnum;
1338  nhtidsdead += BTreeTupleGetNPosting(itup);
1339  pfree(vacposting);
1340  }
1341 
1342  nhtidslive += nremaining;
1343  }
1344  }
1345  }
1346 
1347  /*
1348  * Apply any needed deletes or updates. We issue just one
1349  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1350  */
1351  if (ndeletable > 0 || nupdatable > 0)
1352  {
1353  Assert(nhtidsdead >= ndeletable + nupdatable);
1354  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1355  nupdatable);
1356 
1357  stats->tuples_removed += nhtidsdead;
1358  /* must recompute maxoff */
1359  maxoff = PageGetMaxOffsetNumber(page);
1360 
1361  /* can't leak memory here */
1362  for (int i = 0; i < nupdatable; i++)
1363  pfree(updatable[i]);
1364  }
1365  else
1366  {
1367  /*
1368  * If the leaf page has been split during this vacuum cycle, it
1369  * seems worth expending a write to clear btpo_cycleid even if we
1370  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1371  * takes care of this.) This ensures we won't process the page
1372  * again.
1373  *
1374  * We treat this like a hint-bit update because there's no need to
1375  * WAL-log it.
1376  */
1377  Assert(nhtidsdead == 0);
1378  if (vstate->cycleid != 0 &&
1379  opaque->btpo_cycleid == vstate->cycleid)
1380  {
1381  opaque->btpo_cycleid = 0;
1382  MarkBufferDirtyHint(buf, true);
1383  }
1384  }
1385 
1386  /*
1387  * If the leaf page is now empty, try to delete it; else count the
1388  * live tuples (live table TIDs in posting lists are counted as
1389  * separate live tuples). We don't delete when backtracking, though,
1390  * since that would require teaching _bt_pagedel() about backtracking
1391  * (doesn't seem worth adding more complexity to deal with that).
1392  */
1393  if (minoff > maxoff)
1394  attempt_pagedel = (blkno == scanblkno);
1395  else
1396  stats->num_index_tuples += nhtidslive;
1397 
1398  Assert(!attempt_pagedel || nhtidslive == 0);
1399  }
1400 
1401  if (attempt_pagedel)
1402  {
1403  MemoryContext oldcontext;
1404 
1405  /* Run pagedel in a temp context to avoid memory leakage */
1406  MemoryContextReset(vstate->pagedelcontext);
1407  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1408 
1409  /*
1410  * We trust the _bt_pagedel return value because it does not include
1411  * any page that a future call here from btvacuumscan is expected to
1412  * count. There will be no double-counting.
1413  */
1414  Assert(blkno == scanblkno);
1415  stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
1416 
1417  MemoryContextSwitchTo(oldcontext);
1418  /* pagedel released buffer, so we shouldn't */
1419  }
1420  else
1421  _bt_relbuf(rel, buf);
1422 
1423  if (backtrack_to != P_NONE)
1424  {
1425  blkno = backtrack_to;
1426  goto backtrack;
1427  }
1428 }
1429 
1430 /*
1431  * btreevacuumposting --- determine TIDs still needed in posting list
1432  *
1433  * Returns metadata describing how to build replacement tuple without the TIDs
1434  * that VACUUM needs to delete. Returned value is NULL in the common case
1435  * where no changes are needed to caller's posting list tuple (we avoid
1436  * allocating memory here as an optimization).
1437  *
1438  * The number of TIDs that should remain in the posting list tuple is set for
1439  * caller in *nremaining.
1440  */
1441 static BTVacuumPosting
1442 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1443  OffsetNumber updatedoffset, int *nremaining)
1444 {
1445  int live = 0;
1446  int nitem = BTreeTupleGetNPosting(posting);
1447  ItemPointer items = BTreeTupleGetPosting(posting);
1448  BTVacuumPosting vacposting = NULL;
1449 
1450  for (int i = 0; i < nitem; i++)
1451  {
1452  if (!vstate->callback(items + i, vstate->callback_state))
1453  {
1454  /* Live table TID */
1455  live++;
1456  }
1457  else if (vacposting == NULL)
1458  {
1459  /*
1460  * First dead table TID encountered.
1461  *
1462  * It's now clear that we need to delete one or more dead table
1463  * TIDs, so start maintaining metadata describing how to update
1464  * existing posting list tuple.
1465  */
1466  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1467  nitem * sizeof(uint16));
1468 
1469  vacposting->itup = posting;
1470  vacposting->updatedoffset = updatedoffset;
1471  vacposting->ndeletedtids = 0;
1472  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1473  }
1474  else
1475  {
1476  /* Second or subsequent dead table TID */
1477  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1478  }
1479  }
1480 
1481  *nremaining = live;
1482  return vacposting;
1483 }
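/*
 * Worked example (illustrative): for a posting list tuple with 5 TIDs in
 * which the callback reports the TIDs at positions 1 and 3 as dead, the
 * function returns a BTVacuumPosting with ndeletedtids = 2 and
 * deletetids[] = {1, 3}, and sets *nremaining = 3.  If none of the 5 TIDs
 * were dead, it would instead return NULL with *nremaining = 5.
 */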
1484 
1485 /*
1486  * btcanreturn() -- Check whether btree indexes support index-only scans.
1487  *
1488  * btrees always do, so this is trivial.
1489  */
1490 bool
1491 btcanreturn(Relation index, int attno)
1492 {
1493  return true;
1494 }