nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "pgstat.h"
31 #include "postmaster/autovacuum.h"
32 #include "storage/condition_variable.h"
33 #include "storage/indexfsm.h"
34 #include "storage/ipc.h"
35 #include "storage/lmgr.h"
36 #include "storage/smgr.h"
37 #include "utils/builtins.h"
38 #include "utils/index_selfuncs.h"
39 #include "utils/memutils.h"
40 
41 
42 /*
43  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44  *
45  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
46  * a new page; others must wait.
47  *
48  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
49  * to a new page; some process can start doing that.
50  *
51  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
52  * We reach this state once for every distinct combination of array keys.
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_ADVANCING,
58  BTPARALLEL_IDLE,
59  BTPARALLEL_DONE
60 } BTPS_State;
61 
62 /*
63  * BTParallelScanDescData contains btree specific shared information required
64  * for parallel scan.
65  */
66 typedef struct BTParallelScanDescData
67 {
68  BlockNumber btps_scanPage; /* latest or next page to be scanned */
69  BTPS_State btps_pageStatus; /* indicates whether next page is
70  * available for scan. see above for
71  * possible states of parallel scan. */
72  int btps_arrayKeyCount; /* count indicating number of array scan
73  * keys processed by parallel scan */
74  slock_t btps_mutex; /* protects above variables */
75  ConditionVariable btps_cv; /* used to synchronize parallel scan */
76 } BTParallelScanDescData;
77 
78 typedef struct BTParallelScanDescData *BTParallelScanDesc;
79 
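/*
 * In outline, the parallel-scan support routines further down in this file
 * drive the state machine described above: a worker that wants the next page
 * calls _bt_parallel_seize(), which moves the shared state to
 * BTPARALLEL_ADVANCING while the other workers sleep on btps_cv; once the
 * seizing worker has read the page it publishes the next block number with
 * _bt_parallel_release() (state becomes BTPARALLEL_IDLE and one sleeper is
 * woken), or calls _bt_parallel_done() when no pages remain (state becomes
 * BTPARALLEL_DONE and all sleepers are woken).
 */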
80 
81 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
82  IndexBulkDeleteCallback callback, void *callback_state,
83  BTCycleId cycleid);
84 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
85 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
86  IndexTuple posting,
87  OffsetNumber updatedoffset,
88  int *nremaining);
89 
90 
91 /*
92  * Btree handler function: return IndexAmRoutine with access method parameters
93  * and callbacks.
94  */
95 Datum
96 bthandler(PG_FUNCTION_ARGS)
97 {
98  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
99 
100  amroutine->amstrategies = BTMaxStrategyNumber;
101  amroutine->amsupport = BTNProcs;
102  amroutine->amoptsprocnum = BTOPTIONS_PROC;
103  amroutine->amcanorder = true;
104  amroutine->amcanorderbyop = false;
105  amroutine->amcanbackward = true;
106  amroutine->amcanunique = true;
107  amroutine->amcanmulticol = true;
108  amroutine->amoptionalkey = true;
109  amroutine->amsearcharray = true;
110  amroutine->amsearchnulls = true;
111  amroutine->amstorage = false;
112  amroutine->amclusterable = true;
113  amroutine->ampredlocks = true;
114  amroutine->amcanparallel = true;
115  amroutine->amcaninclude = true;
116  amroutine->amusemaintenanceworkmem = false;
117  amroutine->amparallelvacuumoptions =
118  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
119  amroutine->amkeytype = InvalidOid;
120 
121  amroutine->ambuild = btbuild;
122  amroutine->ambuildempty = btbuildempty;
123  amroutine->aminsert = btinsert;
124  amroutine->ambulkdelete = btbulkdelete;
125  amroutine->amvacuumcleanup = btvacuumcleanup;
126  amroutine->amcanreturn = btcanreturn;
127  amroutine->amcostestimate = btcostestimate;
128  amroutine->amoptions = btoptions;
129  amroutine->amproperty = btproperty;
130  amroutine->ambuildphasename = btbuildphasename;
131  amroutine->amvalidate = btvalidate;
132  amroutine->amadjustmembers = btadjustmembers;
133  amroutine->ambeginscan = btbeginscan;
134  amroutine->amrescan = btrescan;
135  amroutine->amgettuple = btgettuple;
136  amroutine->amgetbitmap = btgetbitmap;
137  amroutine->amendscan = btendscan;
138  amroutine->ammarkpos = btmarkpos;
139  amroutine->amrestrpos = btrestrpos;
140  amroutine->amestimateparallelscan = btestimateparallelscan;
141  amroutine->aminitparallelscan = btinitparallelscan;
142  amroutine->amparallelrescan = btparallelrescan;
143 
144  PG_RETURN_POINTER(amroutine);
145 }
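/*
 * Illustrative sketch (not from nbtree.c): bthandler() is not called
 * directly.  It is recorded as pg_am.amhandler for the "btree" access
 * method, and callers obtain the IndexAmRoutine by invoking that handler
 * through the function manager, roughly as below; core code wraps this
 * pattern in GetIndexAmRoutine() (access/index/amapi.c).  Assumes fmgr.h
 * and access/amapi.h are available.
 */
static IndexAmRoutine *
get_btree_routine_sketch(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);

	/* the handler palloc'd the struct; its callbacks are ready to use */
	return (IndexAmRoutine *) DatumGetPointer(datum);
}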
146 
147 /*
148  * btbuildempty() -- build an empty btree index in the initialization fork
149  */
150 void
151 btbuildempty(Relation index)
152 {
153  Page metapage;
154 
155  /* Construct metapage. */
156  metapage = (Page) palloc(BLCKSZ);
157  _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
158 
159  /*
160  * Write the page and log it. It might seem that an immediate sync would
161  * be sufficient to guarantee that the file exists on disk, but recovery
162  * itself might remove it while replaying, for example, an
163  * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record. Therefore, we need
164  * this even when wal_level=minimal.
165  */
166  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
167  smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
168  (char *) metapage, true);
169  log_newpage(&RelationGetSmgr(index)->smgr_rlocator.locator, INIT_FORKNUM,
170  BTREE_METAPAGE, metapage, true);
171 
172  /*
173  * An immediate sync is required even if we xlog'd the page, because the
174  * write did not go through shared_buffers and therefore a concurrent
175  * checkpoint may have moved the redo pointer past our xlog record.
176  */
177  smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
178 }
179 
180 /*
181  * btinsert() -- insert an index tuple into a btree.
182  *
183  * Descend the tree recursively, find the appropriate location for our
184  * new tuple, and put it there.
185  */
186 bool
187 btinsert(Relation rel, Datum *values, bool *isnull,
188  ItemPointer ht_ctid, Relation heapRel,
189  IndexUniqueCheck checkUnique,
190  bool indexUnchanged,
191  IndexInfo *indexInfo)
192 {
193  bool result;
194  IndexTuple itup;
195 
196  /* generate an index tuple */
197  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
198  itup->t_tid = *ht_ctid;
199 
200  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
201 
202  pfree(itup);
203 
204  return result;
205 }
206 
207 /*
208  * btgettuple() -- Get the next tuple in the scan.
209  */
210 bool
211 btgettuple(IndexScanDesc scan, ScanDirection dir)
212 {
213  BTScanOpaque so = (BTScanOpaque) scan->opaque;
214  bool res;
215 
216  /* btree indexes are never lossy */
217  scan->xs_recheck = false;
218 
219  /*
220  * If we have any array keys, initialize them during first call for a
221  * scan. We can't do this in btrescan because we don't know the scan
222  * direction at that time.
223  */
224  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
225  {
226  /* punt if we have any unsatisfiable array keys */
227  if (so->numArrayKeys < 0)
228  return false;
229 
230  _bt_start_array_keys(scan, dir);
231  }
232 
233  /* This loop handles advancing to the next array elements, if any */
234  do
235  {
236  /*
237  * If we've already initialized this scan, we can just advance it in
238  * the appropriate direction. If we haven't done so yet, we call
239  * _bt_first() to get the first item in the scan.
240  */
241  if (!BTScanPosIsValid(so->currPos))
242  res = _bt_first(scan, dir);
243  else
244  {
245  /*
246  * Check to see if we should kill the previously-fetched tuple.
247  */
248  if (scan->kill_prior_tuple)
249  {
250  /*
251  * Yes, remember it for later. (We'll deal with all such
252  * tuples at once right before leaving the index page.) The
253  * test for numKilled overrun is not just paranoia: if the
254  * caller reverses direction in the indexscan then the same
255  * item might get entered multiple times. It's not worth
256  * trying to optimize that, so we don't detect it, but instead
257  * just forget any excess entries.
258  */
259  if (so->killedItems == NULL)
260  so->killedItems = (int *)
261  palloc(MaxTIDsPerBTreePage * sizeof(int));
262  if (so->numKilled < MaxTIDsPerBTreePage)
263  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
264  }
265 
266  /*
267  * Now continue the scan.
268  */
269  res = _bt_next(scan, dir);
270  }
271 
272  /* If we have a tuple, return it ... */
273  if (res)
274  break;
275  /* ... otherwise see if we have more array keys to deal with */
276  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
277 
278  return res;
279 }
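/*
 * Note on kill_prior_tuple, as used by btgettuple() above: the flag is set
 * by the generic index-AM layer (see index_fetch_heap() in indexam.c) once
 * the heap tuples for the previously returned TID turn out to be dead to all
 * transactions.  The item indexes remembered in so->killedItems are then
 * applied in bulk by _bt_killitems(), which LP_DEAD-marks those index
 * entries just before the scan leaves the page, so that later operations can
 * skip or remove them cheaply.
 */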
280 
281 /*
282  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
283  */
284 int64
285 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
286 {
287  BTScanOpaque so = (BTScanOpaque) scan->opaque;
288  int64 ntids = 0;
289  ItemPointer heapTid;
290 
291  /*
292  * If we have any array keys, initialize them.
293  */
294  if (so->numArrayKeys)
295  {
296  /* punt if we have any unsatisfiable array keys */
297  if (so->numArrayKeys < 0)
298  return ntids;
299 
301  }
302 
303  /* This loop handles advancing to the next array elements, if any */
304  do
305  {
306  /* Fetch the first page & tuple */
307  if (_bt_first(scan, ForwardScanDirection))
308  {
309  /* Save tuple ID, and continue scanning */
310  heapTid = &scan->xs_heaptid;
311  tbm_add_tuples(tbm, heapTid, 1, false);
312  ntids++;
313 
314  for (;;)
315  {
316  /*
317  * Advance to next tuple within page. This is the same as the
318  * easy case in _bt_next().
319  */
320  if (++so->currPos.itemIndex > so->currPos.lastItem)
321  {
322  /* let _bt_next do the heavy lifting */
323  if (!_bt_next(scan, ForwardScanDirection))
324  break;
325  }
326 
327  /* Save tuple ID, and continue scanning */
328  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
329  tbm_add_tuples(tbm, heapTid, 1, false);
330  ntids++;
331  }
332  }
333  /* Now see if we have more array keys to deal with */
334  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
335 
336  return ntids;
337 }
338 
339 /*
340  * btbeginscan() -- start a scan on a btree index
341  */
342 IndexScanDesc
343 btbeginscan(Relation rel, int nkeys, int norderbys)
344 {
345  IndexScanDesc scan;
346  BTScanOpaque so;
347 
348  /* no order by operators allowed */
349  Assert(norderbys == 0);
350 
351  /* get the scan */
352  scan = RelationGetIndexScan(rel, nkeys, norderbys);
353 
354  /* allocate private workspace */
355  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
356  BTScanPosInvalidate(so->currPos);
357  BTScanPosInvalidate(so->markPos);
358  if (scan->numberOfKeys > 0)
359  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
360  else
361  so->keyData = NULL;
362 
363  so->arrayKeyData = NULL; /* assume no array keys for now */
364  so->numArrayKeys = 0;
365  so->arrayKeys = NULL;
366  so->arrayContext = NULL;
367 
368  so->killedItems = NULL; /* until needed */
369  so->numKilled = 0;
370 
371  /*
372  * We don't know yet whether the scan will be index-only, so we do not
373  * allocate the tuple workspace arrays until btrescan. However, we set up
374  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
375  */
376  so->currTuples = so->markTuples = NULL;
377 
378  scan->xs_itupdesc = RelationGetDescr(rel);
379 
380  scan->opaque = so;
381 
382  return scan;
383 }
384 
385 /*
386  * btrescan() -- rescan an index relation
387  */
388 void
389 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
390  ScanKey orderbys, int norderbys)
391 {
392  BTScanOpaque so = (BTScanOpaque) scan->opaque;
393 
394  /* we aren't holding any read locks, but gotta drop the pins */
395  if (BTScanPosIsValid(so->currPos))
396  {
397  /* Before leaving current page, deal with any killed items */
398  if (so->numKilled > 0)
399  _bt_killitems(scan);
400  BTScanPosUnpinIfPinned(so->currPos);
401  BTScanPosInvalidate(so->currPos);
402  }
403 
404  so->markItemIndex = -1;
405  so->arrayKeyCount = 0;
406  BTScanPosUnpinIfPinned(so->markPos);
407  BTScanPosInvalidate(so->markPos);
408 
409  /*
410  * Allocate tuple workspace arrays, if needed for an index-only scan and
411  * not already done in a previous rescan call. To save on palloc
412  * overhead, both workspaces are allocated as one palloc block; only this
413  * function and btendscan know that.
414  *
415  * NOTE: this data structure also makes it safe to return data from a
416  * "name" column, even though btree name_ops uses an underlying storage
417  * datatype of cstring. The risk there is that "name" is supposed to be
418  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
419  * However, since we only return data out of tuples sitting in the
420  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
421  * data out of the markTuples array --- running off the end of memory for
422  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
423  * adding special-case treatment for name_ops elsewhere.
424  */
425  if (scan->xs_want_itup && so->currTuples == NULL)
426  {
427  so->currTuples = (char *) palloc(BLCKSZ * 2);
428  so->markTuples = so->currTuples + BLCKSZ;
429  }
430 
431  /*
432  * Reset the scan keys
433  */
434  if (scankey && scan->numberOfKeys > 0)
435  memmove(scan->keyData,
436  scankey,
437  scan->numberOfKeys * sizeof(ScanKeyData));
438  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
439 
440  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
441  _bt_preprocess_array_keys(scan);
442 }
443 
444 /*
445  * btendscan() -- close down a scan
446  */
447 void
448 btendscan(IndexScanDesc scan)
449 {
450  BTScanOpaque so = (BTScanOpaque) scan->opaque;
451 
452  /* we aren't holding any read locks, but gotta drop the pins */
453  if (BTScanPosIsValid(so->currPos))
454  {
455  /* Before leaving current page, deal with any killed items */
456  if (so->numKilled > 0)
457  _bt_killitems(scan);
458  BTScanPosUnpinIfPinned(so->currPos);
459  }
460 
461  so->markItemIndex = -1;
462  BTScanPosUnpinIfPinned(so->markPos);
463 
464  /* No need to invalidate positions, the RAM is about to be freed. */
465 
466  /* Release storage */
467  if (so->keyData != NULL)
468  pfree(so->keyData);
469  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
470  if (so->arrayContext != NULL)
471  MemoryContextDelete(so->arrayContext);
472  if (so->killedItems != NULL)
473  pfree(so->killedItems);
474  if (so->currTuples != NULL)
475  pfree(so->currTuples);
476  /* so->markTuples should not be pfree'd, see btrescan */
477  pfree(so);
478 }
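/*
 * Illustrative sketch (not from nbtree.c): how a caller normally reaches the
 * btbeginscan/btrescan/btgettuple/btendscan callbacks, via the generic index
 * AM wrappers in indexam.c.  Assumes a btree index whose first column is
 * int4; locking, error handling and heap fetching are omitted, and the extra
 * headers (access/genam.h, access/stratnum.h, utils/fmgroids.h,
 * utils/snapmgr.h) are taken as given.
 */
static void
btree_scan_sketch(Relation heapRel, Relation indexRel, int32 searchValue)
{
	ScanKeyData skey;
	IndexScanDesc scan;
	ItemPointer tid;

	/* qual: first index column = searchValue */
	ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(searchValue));

	/* index_beginscan() ends up in btbeginscan(); index_rescan() in btrescan() */
	scan = index_beginscan(heapRel, indexRel, GetTransactionSnapshot(), 1, 0);
	index_rescan(scan, &skey, 1, NULL, 0);

	/* each index_getnext_tid() call reaches btgettuple() via amgettuple */
	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
	{
		/* caller would fetch the heap tuple identified by *tid here */
	}

	index_endscan(scan);		/* ends up in btendscan() */
}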
479 
480 /*
481  * btmarkpos() -- save current scan position
482  */
483 void
484 btmarkpos(IndexScanDesc scan)
485 {
486  BTScanOpaque so = (BTScanOpaque) scan->opaque;
487 
488  /* There may be an old mark with a pin (but no lock). */
489  BTScanPosUnpinIfPinned(so->markPos);
490 
491  /*
492  * Just record the current itemIndex. If we later step to next page
493  * before releasing the marked position, _bt_steppage makes a full copy of
494  * the currPos struct in markPos. If (as often happens) the mark is moved
495  * before we leave the page, we don't have to do that work.
496  */
497  if (BTScanPosIsValid(so->currPos))
498  so->markItemIndex = so->currPos.itemIndex;
499  else
500  {
501  BTScanPosInvalidate(so->markPos);
502  so->markItemIndex = -1;
503  }
504 
505  /* Also record the current positions of any array keys */
506  if (so->numArrayKeys)
507  _bt_mark_array_keys(scan);
508 }
509 
510 /*
511  * btrestrpos() -- restore scan to last saved position
512  */
513 void
514 btrestrpos(IndexScanDesc scan)
515 {
516  BTScanOpaque so = (BTScanOpaque) scan->opaque;
517 
518  /* Restore the marked positions of any array keys */
519  if (so->numArrayKeys)
520  _bt_restore_array_keys(scan);
521 
522  if (so->markItemIndex >= 0)
523  {
524  /*
525  * The scan has never moved to a new page since the last mark. Just
526  * restore the itemIndex.
527  *
528  * NB: In this case we can't count on anything in so->markPos to be
529  * accurate.
530  */
531  so->currPos.itemIndex = so->markItemIndex;
532  }
533  else
534  {
535  /*
536  * The scan moved to a new page after last mark or restore, and we are
537  * now restoring to the marked page. We aren't holding any read
538  * locks, but if we're still holding the pin for the current position,
539  * we must drop it.
540  */
541  if (BTScanPosIsValid(so->currPos))
542  {
543  /* Before leaving current page, deal with any killed items */
544  if (so->numKilled > 0)
545  _bt_killitems(scan);
546  BTScanPosUnpinIfPinned(so->currPos);
547  }
548 
549  if (BTScanPosIsValid(so->markPos))
550  {
551  /* bump pin on mark buffer for assignment to current buffer */
552  if (BTScanPosIsPinned(so->markPos))
553  IncrBufferRefCount(so->markPos.buf);
554  memcpy(&so->currPos, &so->markPos,
555  offsetof(BTScanPosData, items[1]) +
556  so->markPos.lastItem * sizeof(BTScanPosItem));
557  if (so->currTuples)
558  memcpy(so->currTuples, so->markTuples,
559  so->markPos.nextTupleOffset);
560  }
561  else
562  BTScanPosInvalidate(so->currPos);
563  }
564 }
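/*
 * Illustrative sketch (not from nbtree.c): btmarkpos()/btrestrpos() are
 * reached through index_markpos()/index_restrpos() in indexam.c, typically
 * from merge join when it must rewind the inner index scan to the first
 * tuple of a group of equal keys.
 */
static void
btree_mark_restore_sketch(IndexScanDesc scan)
{
	index_markpos(scan);		/* remember the current position */

	/* ... read ahead some number of tuples with index_getnext_tid() ... */

	index_restrpos(scan);		/* return to the remembered position */
}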
565 
566 /*
567  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
568  */
569 Size
570 btestimateparallelscan(void)
571 {
572  return sizeof(BTParallelScanDescData);
573 }
574 
575 /*
576  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
577  */
578 void
579 btinitparallelscan(void *target)
580 {
581  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
582 
583  SpinLockInit(&bt_target->btps_mutex);
584  bt_target->btps_scanPage = InvalidBlockNumber;
585  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
586  bt_target->btps_arrayKeyCount = 0;
587  ConditionVariableInit(&bt_target->btps_cv);
588 }
589 
590 /*
591  * btparallelrescan() -- reset parallel scan
592  */
593 void
594 btparallelrescan(IndexScanDesc scan)
595 {
596  BTParallelScanDesc btscan;
597  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
598 
599  Assert(parallel_scan);
600 
601  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
602  parallel_scan->ps_offset);
603 
604  /*
605  * In theory, we don't need to acquire the spinlock here, because there
606  * shouldn't be any other workers running at this point, but we do so for
607  * consistency.
608  */
609  SpinLockAcquire(&btscan->btps_mutex);
610  btscan->btps_scanPage = InvalidBlockNumber;
611  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
612  btscan->btps_arrayKeyCount = 0;
613  SpinLockRelease(&btscan->btps_mutex);
614 }
615 
616 /*
617  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
618  * page. Other scans must wait until we call _bt_parallel_release()
619  * or _bt_parallel_done().
620  *
621  * The return value is true if we successfully seized the scan and false
622  * if we did not. The latter case occurs if no pages remain for the current
623  * set of scankeys.
624  *
625  * If the return value is true, *pageno returns the next or current page
626  * of the scan (depending on the scan direction). An invalid block number
627  * means the scan hasn't yet started, and P_NONE means we've reached the end.
628  * The first time a participating process reaches the last page, it will return
629  * true and set *pageno to P_NONE; after that, further attempts to seize the
630  * scan will return false.
631  *
632  * Callers should ignore the value of pageno if the return value is false.
633  */
634 bool
635 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
636 {
637  BTScanOpaque so = (BTScanOpaque) scan->opaque;
638  BTPS_State pageStatus;
639  bool exit_loop = false;
640  bool status = true;
641  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
642  BTParallelScanDesc btscan;
643 
644  *pageno = P_NONE;
645 
646  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
647  parallel_scan->ps_offset);
648 
649  while (1)
650  {
651  SpinLockAcquire(&btscan->btps_mutex);
652  pageStatus = btscan->btps_pageStatus;
653 
654  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
655  {
656  /* Parallel scan has already advanced to a new set of scankeys. */
657  status = false;
658  }
659  else if (pageStatus == BTPARALLEL_DONE)
660  {
661  /*
662  * We're done with this set of scankeys. This may be the end, or
663  * there could be more sets to try.
664  */
665  status = false;
666  }
667  else if (pageStatus != BTPARALLEL_ADVANCING)
668  {
669  /*
670  * We have successfully seized control of the scan for the purpose
671  * of advancing it to a new page!
672  */
673  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
674  *pageno = btscan->btps_scanPage;
675  exit_loop = true;
676  }
677  SpinLockRelease(&btscan->btps_mutex);
678  if (exit_loop || !status)
679  break;
680  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
681  }
682  ConditionVariableCancelSleep();
683 
684  return status;
685 }
686 
687 /*
688  * _bt_parallel_release() -- Complete the process of advancing the scan to a
689  * new page. We now have the new value btps_scanPage; some other backend
690  * can now begin advancing the scan.
691  */
692 void
693 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
694 {
695  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
696  BTParallelScanDesc btscan;
697 
698  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
699  parallel_scan->ps_offset);
700 
701  SpinLockAcquire(&btscan->btps_mutex);
702  btscan->btps_scanPage = scan_page;
703  btscan->btps_pageStatus = BTPARALLEL_IDLE;
704  SpinLockRelease(&btscan->btps_mutex);
705  ConditionVariableSignal(&btscan->btps_cv);
706 }
707 
708 /*
709  * _bt_parallel_done() -- Mark the parallel scan as complete.
710  *
711  * When there are no pages left to scan, this function should be called to
712  * notify other workers. Otherwise, they might wait forever for the scan to
713  * advance to the next page.
714  */
715 void
716 _bt_parallel_done(IndexScanDesc scan)
717 {
718  BTScanOpaque so = (BTScanOpaque) scan->opaque;
719  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
720  BTParallelScanDesc btscan;
721  bool status_changed = false;
722 
723  /* Do nothing, for non-parallel scans */
724  if (parallel_scan == NULL)
725  return;
726 
727  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
728  parallel_scan->ps_offset);
729 
730  /*
731  * Mark the parallel scan as done for this combination of scan keys,
732  * unless some other process already did so. See also
733  * _bt_advance_array_keys.
734  */
735  SpinLockAcquire(&btscan->btps_mutex);
736  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
737  btscan->btps_pageStatus != BTPARALLEL_DONE)
738  {
739  btscan->btps_pageStatus = BTPARALLEL_DONE;
740  status_changed = true;
741  }
742  SpinLockRelease(&btscan->btps_mutex);
743 
744  /* wake up all the workers associated with this parallel scan */
745  if (status_changed)
746  ConditionVariableBroadcast(&btscan->btps_cv);
747 }
748 
749 /*
750  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
751  * keys.
752  *
753  * Updates the count of array keys processed for both local and parallel
754  * scans.
755  */
756 void
757 _bt_parallel_advance_array_keys(IndexScanDesc scan)
758 {
759  BTScanOpaque so = (BTScanOpaque) scan->opaque;
760  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
761  BTParallelScanDesc btscan;
762 
763  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
764  parallel_scan->ps_offset);
765 
766  so->arrayKeyCount++;
767  SpinLockAcquire(&btscan->btps_mutex);
768  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
769  {
770  btscan->btps_scanPage = InvalidBlockNumber;
771  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
772  btscan->btps_arrayKeyCount++;
773  }
774  SpinLockRelease(&btscan->btps_mutex);
775 }
776 
777 /*
778  * Bulk deletion of all index entries pointing to a set of heap tuples.
779  * The set of target tuples is specified via a callback routine that tells
780  * whether any given heap tuple (identified by ItemPointer) is being deleted.
781  *
782  * Result: a palloc'd struct containing statistical info for VACUUM displays.
783  */
784 IndexBulkDeleteResult *
785 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
786  IndexBulkDeleteCallback callback, void *callback_state)
787 {
788  Relation rel = info->index;
789  BTCycleId cycleid;
790 
791  /* allocate stats if first time through, else re-use existing struct */
792  if (stats == NULL)
793  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
794 
795  /* Establish the vacuum cycle ID to use for this scan */
796  /* The ENSURE stuff ensures we clean up shared memory on failure */
797  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
798  {
799  cycleid = _bt_start_vacuum(rel);
800 
801  btvacuumscan(info, stats, callback, callback_state, cycleid);
802  }
803  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
804  _bt_end_vacuum(rel);
805 
806  return stats;
807 }
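/*
 * Illustrative sketch (not from nbtree.c): the shape of the
 * IndexBulkDeleteCallback consumed by btbulkdelete()/btvacuumscan().  VACUUM
 * supplies a callback that answers "is this heap TID being removed?"; the
 * linear lookup over a caller-provided array below is only a stand-in for
 * vacuum's real dead-TID storage.
 */
typedef struct DeadTidArraySketch
{
	ItemPointerData *tids;		/* dead heap TIDs collected by the caller */
	int			ntids;
} DeadTidArraySketch;

static bool
dead_tid_callback_sketch(ItemPointer itemptr, void *state)
{
	DeadTidArraySketch *dead = (DeadTidArraySketch *) state;

	for (int i = 0; i < dead->ntids; i++)
	{
		if (ItemPointerEquals(itemptr, &dead->tids[i]))
			return true;		/* index entry for this TID will be deleted */
	}
	return false;				/* index entry is kept */
}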
808 
809 /*
810  * Post-VACUUM cleanup.
811  *
812  * Result: a palloc'd struct containing statistical info for VACUUM displays.
813  */
814 IndexBulkDeleteResult *
815 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
816 {
817  BlockNumber num_delpages;
818 
819  /* No-op in ANALYZE ONLY mode */
820  if (info->analyze_only)
821  return stats;
822 
823  /*
824  * If btbulkdelete was called, we need not do anything (we just maintain
825  * the information used within _bt_vacuum_needs_cleanup() by calling
826  * _bt_set_cleanup_info() below).
827  *
828  * If btbulkdelete was _not_ called, then we have a choice to make: we
829  * must decide whether or not a btvacuumscan() call is needed now (i.e.
830  * whether the ongoing VACUUM operation can entirely avoid a physical scan
831  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
832  * now.
833  */
834  if (stats == NULL)
835  {
836  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
837  if (!_bt_vacuum_needs_cleanup(info->index))
838  return NULL;
839 
840  /*
841  * Since we aren't going to actually delete any leaf items, there's no
842  * need to go through all the vacuum-cycle-ID pushups here.
843  *
844  * Posting list tuples are a source of inaccuracy for cleanup-only
845  * scans. btvacuumscan() will assume that the number of index tuples
846  * from each page can be used as num_index_tuples, even though
847  * num_index_tuples is supposed to represent the number of TIDs in the
848  * index. This naive approach can underestimate the number of tuples
849  * in the index significantly.
850  *
851  * We handle the problem by making num_index_tuples an estimate in
852  * cleanup-only case.
853  */
854  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
855  btvacuumscan(info, stats, NULL, NULL, 0);
856  stats->estimated_count = true;
857  }
858 
859  /*
860  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
861  *
862  * num_delpages is the number of deleted pages now in the index that were
863  * not safe to place in the FSM to be recycled just yet. num_delpages is
864  * greater than 0 only when _bt_pagedel() actually deleted pages during
865  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
866  * have failed to place any newly deleted pages in the FSM just moments
867  * ago. (Actually, there are edge cases where recycling of the current
868  * VACUUM's newly deleted pages does not even become safe by the time the
869  * next VACUUM comes around. See nbtree/README.)
870  */
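/*
 * Worked example: if this VACUUM ends with stats->pages_deleted = 10 and
 * stats->pages_free = 7 (seven deleted pages were already safe to put in the
 * FSM), then num_delpages = 3, i.e. three deleted pages must wait for a
 * later VACUUM before they can be recycled.
 */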
871  Assert(stats->pages_deleted >= stats->pages_free);
872  num_delpages = stats->pages_deleted - stats->pages_free;
873  _bt_set_cleanup_info(info->index, num_delpages);
874 
875  /*
876  * It's quite possible for us to be fooled by concurrent page splits into
877  * double-counting some index tuples, so disbelieve any total that exceeds
878  * the underlying heap's count ... if we know that accurately. Otherwise
879  * this might just make matters worse.
880  */
881  if (!info->estimated_count)
882  {
883  if (stats->num_index_tuples > info->num_heap_tuples)
884  stats->num_index_tuples = info->num_heap_tuples;
885  }
886 
887  return stats;
888 }
889 
890 /*
891  * btvacuumscan --- scan the index for VACUUMing purposes
892  *
893  * This combines the functions of looking for leaf tuples that are deletable
894  * according to the vacuum callback, looking for empty pages that can be
895  * deleted, and looking for old deleted pages that can be recycled. Both
896  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
897  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
898  *
899  * The caller is responsible for initially allocating/zeroing a stats struct
900  * and for obtaining a vacuum cycle ID if necessary.
901  */
902 static void
903 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
904  IndexBulkDeleteCallback callback, void *callback_state,
905  BTCycleId cycleid)
906 {
907  Relation rel = info->index;
908  BTVacState vstate;
909  BlockNumber num_pages;
910  BlockNumber scanblkno;
911  bool needLock;
912 
913  /*
914  * Reset fields that track information about the entire index now. This
915  * avoids double-counting in the case where a single VACUUM command
916  * requires multiple scans of the index.
917  *
918  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
919  * since they track information about the VACUUM command, and so must last
920  * across each call to btvacuumscan().
921  *
922  * (Note that pages_free is treated as state about the whole index, not
923  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
924  * calls are idempotent, and get repeated for the same deleted pages in
925  * some scenarios. The point for us is to track the number of recyclable
926  * pages in the index at the end of the VACUUM command.)
927  */
928  stats->num_pages = 0;
929  stats->num_index_tuples = 0;
930  stats->pages_deleted = 0;
931  stats->pages_free = 0;
932 
933  /* Set up info to pass down to btvacuumpage */
934  vstate.info = info;
935  vstate.stats = stats;
936  vstate.callback = callback;
937  vstate.callback_state = callback_state;
938  vstate.cycleid = cycleid;
939 
940  /* Create a temporary memory context to run _bt_pagedel in */
941  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
942  "_bt_pagedel",
943  ALLOCSET_DEFAULT_SIZES);
944 
945  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
946  vstate.bufsize = 0;
947  vstate.maxbufsize = 0;
948  vstate.pendingpages = NULL;
949  vstate.npendingpages = 0;
950  /* Consider applying _bt_pendingfsm_finalize optimization */
951  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
952 
953  /*
954  * The outer loop iterates over all index pages except the metapage, in
955  * physical order (we hope the kernel will cooperate in providing
956  * read-ahead for speed). It is critical that we visit all leaf pages,
957  * including ones added after we start the scan, else we might fail to
958  * delete some deletable tuples. Hence, we must repeatedly check the
959  * relation length. We must acquire the relation-extension lock while
960  * doing so to avoid a race condition: if someone else is extending the
961  * relation, there is a window where bufmgr/smgr have created a new
962  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
963  * we manage to scan such a page here, we'll improperly assume it can be
964  * recycled. Taking the lock synchronizes things enough to prevent a
965  * problem: either num_pages won't include the new page, or _bt_getbuf
966  * already has write lock on the buffer and it will be fully initialized
967  * before we can examine it. Also, we need not worry if a page is added
968  * immediately after we look; the page splitting code already has
969  * write-lock on the left page before it adds a right page, so we must
970  * already have processed any tuples due to be moved into such a page.
971  *
972  * We can skip locking for new or temp relations, however, since no one
973  * else could be accessing them.
974  */
975  needLock = !RELATION_IS_LOCAL(rel);
976 
977  scanblkno = BTREE_METAPAGE + 1;
978  for (;;)
979  {
980  /* Get the current relation length */
981  if (needLock)
982  LockRelationForExtension(rel, ExclusiveLock);
983  num_pages = RelationGetNumberOfBlocks(rel);
984  if (needLock)
985  UnlockRelationForExtension(rel, ExclusiveLock);
986 
987  if (info->report_progress)
988  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
989  num_pages);
990 
991  /* Quit if we've scanned the whole relation */
992  if (scanblkno >= num_pages)
993  break;
994  /* Iterate over pages, then loop back to recheck length */
995  for (; scanblkno < num_pages; scanblkno++)
996  {
997  btvacuumpage(&vstate, scanblkno);
998  if (info->report_progress)
999  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1000  scanblkno);
1001  }
1002  }
1003 
1004  /* Set statistics num_pages field to final size of index */
1005  stats->num_pages = num_pages;
1006 
1007  MemoryContextDelete(vstate.pagedelcontext);
1008 
1009  /*
1010  * If there were any calls to _bt_pagedel() during scan of the index then
1011  * see if any of the resulting pages can be placed in the FSM now. When
1012  * it's not safe we'll have to leave it up to a future VACUUM operation.
1013  *
1014  * Finally, if we placed any pages in the FSM (either just now or during
1015  * the scan), forcibly update the upper-level FSM pages to ensure that
1016  * searchers can find them.
1017  */
1018  _bt_pendingfsm_finalize(rel, &vstate);
1019  if (stats->pages_free > 0)
1020  IndexFreeSpaceMapVacuum(rel);
1021 }
1022 
1023 /*
1024  * btvacuumpage --- VACUUM one page
1025  *
1026  * This processes a single page for btvacuumscan(). In some cases we must
1027  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1028  * a previous call here. This is how we handle page splits (that happened
1029  * after our cycleid was acquired) whose right half page happened to reuse
1030  * a block that we might have processed at some point before it was
1031  * recycled (i.e. before the page split).
1032  */
1033 static void
1034 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1035 {
1036  IndexVacuumInfo *info = vstate->info;
1037  IndexBulkDeleteResult *stats = vstate->stats;
1038  IndexBulkDeleteCallback callback = vstate->callback;
1039  void *callback_state = vstate->callback_state;
1040  Relation rel = info->index;
1041  bool attempt_pagedel;
1042  BlockNumber blkno,
1043  backtrack_to;
1044  Buffer buf;
1045  Page page;
1046  BTPageOpaque opaque;
1047 
1048  blkno = scanblkno;
1049 
1050 backtrack:
1051 
1052  attempt_pagedel = false;
1053  backtrack_to = P_NONE;
1054 
1055  /* call vacuum_delay_point while not holding any buffer lock */
1056  vacuum_delay_point();
1057 
1058  /*
1059  * We can't use _bt_getbuf() here because it always applies
1060  * _bt_checkpage(), which will barf on an all-zero page. We want to
1061  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1062  * buffer access strategy.
1063  */
1064  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1065  info->strategy);
1066  _bt_lockbuf(rel, buf, BT_READ);
1067  page = BufferGetPage(buf);
1068  opaque = NULL;
1069  if (!PageIsNew(page))
1070  {
1071  _bt_checkpage(rel, buf);
1072  opaque = BTPageGetOpaque(page);
1073  }
1074 
1075  Assert(blkno <= scanblkno);
1076  if (blkno != scanblkno)
1077  {
1078  /*
1079  * We're backtracking.
1080  *
1081  * We followed a right link to a sibling leaf page (a page that
1082  * happens to be from a block located before scanblkno). The only
1083  * case we want to do anything with is a live leaf page having the
1084  * current vacuum cycle ID.
1085  *
1086  * The page had better be in a state that's consistent with what we
1087  * expect. Check for conditions that imply corruption in passing. It
1088  * can't be half-dead because only an interrupted VACUUM process can
1089  * leave pages in that state, so we'd definitely have dealt with it
1090  * back when the page was the scanblkno page (half-dead pages are
1091  * always marked fully deleted by _bt_pagedel()). This assumes that
1092  * there can be only one vacuum process running at a time.
1093  */
1094  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1095  {
1096  Assert(false);
1097  ereport(LOG,
1098  (errcode(ERRCODE_INDEX_CORRUPTED),
1099  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1100  blkno, scanblkno, RelationGetRelationName(rel))));
1101  _bt_relbuf(rel, buf);
1102  return;
1103  }
1104 
1105  /*
1106  * We may have already processed the page in an earlier call, when the
1107  * page was scanblkno. This happens when the leaf page split occurred
1108  * after the scan began, but before the right sibling page became the
1109  * scanblkno.
1110  *
1111  * Page may also have been deleted by current btvacuumpage() call,
1112  * since _bt_pagedel() sometimes deletes the right sibling page of
1113  * scanblkno in passing (it does so after we decided where to
1114  * backtrack to). We don't need to process this page as a deleted
1115  * page a second time now (in fact, it would be wrong to count it as a
1116  * deleted page in the bulk delete statistics a second time).
1117  */
1118  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1119  {
1120  /* Done with current scanblkno (and all lower split pages) */
1121  _bt_relbuf(rel, buf);
1122  return;
1123  }
1124  }
1125 
1126  if (!opaque || BTPageIsRecyclable(page))
1127  {
1128  /* Okay to recycle this page (which could be leaf or internal) */
1129  RecordFreeIndexPage(rel, blkno);
1130  stats->pages_deleted++;
1131  stats->pages_free++;
1132  }
1133  else if (P_ISDELETED(opaque))
1134  {
1135  /*
1136  * Already deleted page (which could be leaf or internal). Can't
1137  * recycle yet.
1138  */
1139  stats->pages_deleted++;
1140  }
1141  else if (P_ISHALFDEAD(opaque))
1142  {
1143  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1144  attempt_pagedel = true;
1145 
1146  /*
1147  * _bt_pagedel() will increment both pages_newly_deleted and
1148  * pages_deleted stats in all cases (barring corruption)
1149  */
1150  }
1151  else if (P_ISLEAF(opaque))
1152  {
1153  OffsetNumber deletable[MaxIndexTuplesPerPage];
1154  int ndeletable;
1155  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1156  int nupdatable;
1157  OffsetNumber offnum,
1158  minoff,
1159  maxoff;
1160  int nhtidsdead,
1161  nhtidslive;
1162 
1163  /*
1164  * Trade in the initial read lock for a full cleanup lock on this
1165  * page. We must get such a lock on every leaf page over the course
1166  * of the vacuum scan, whether or not it actually contains any
1167  * deletable tuples --- see nbtree/README.
1168  */
1169  _bt_upgradelockbufcleanup(rel, buf);
1170 
1171  /*
1172  * Check whether we need to backtrack to earlier pages. What we are
1173  * concerned about is a page split that happened since we started the
1174  * vacuum scan. If the split moved tuples on the right half of the
1175  * split (i.e. the tuples that sort high) to a block that we already
1176  * passed over, then we might have missed the tuples. We need to
1177  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1178  * or deleting scanblkno page below!)
1179  */
1180  if (vstate->cycleid != 0 &&
1181  opaque->btpo_cycleid == vstate->cycleid &&
1182  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1183  !P_RIGHTMOST(opaque) &&
1184  opaque->btpo_next < scanblkno)
1185  backtrack_to = opaque->btpo_next;
1186 
1187  ndeletable = 0;
1188  nupdatable = 0;
1189  minoff = P_FIRSTDATAKEY(opaque);
1190  maxoff = PageGetMaxOffsetNumber(page);
1191  nhtidsdead = 0;
1192  nhtidslive = 0;
1193  if (callback)
1194  {
1195  /* btbulkdelete callback tells us what to delete (or update) */
1196  for (offnum = minoff;
1197  offnum <= maxoff;
1198  offnum = OffsetNumberNext(offnum))
1199  {
1200  IndexTuple itup;
1201 
1202  itup = (IndexTuple) PageGetItem(page,
1203  PageGetItemId(page, offnum));
1204 
1205  Assert(!BTreeTupleIsPivot(itup));
1206  if (!BTreeTupleIsPosting(itup))
1207  {
1208  /* Regular tuple, standard table TID representation */
1209  if (callback(&itup->t_tid, callback_state))
1210  {
1211  deletable[ndeletable++] = offnum;
1212  nhtidsdead++;
1213  }
1214  else
1215  nhtidslive++;
1216  }
1217  else
1218  {
1219  BTVacuumPosting vacposting;
1220  int nremaining;
1221 
1222  /* Posting list tuple */
1223  vacposting = btreevacuumposting(vstate, itup, offnum,
1224  &nremaining);
1225  if (vacposting == NULL)
1226  {
1227  /*
1228  * All table TIDs from the posting tuple remain, so no
1229  * delete or update required
1230  */
1231  Assert(nremaining == BTreeTupleGetNPosting(itup));
1232  }
1233  else if (nremaining > 0)
1234  {
1235 
1236  /*
1237  * Store metadata about posting list tuple in
1238  * updatable array for entire page. Existing tuple
1239  * will be updated during the later call to
1240  * _bt_delitems_vacuum().
1241  */
1242  Assert(nremaining < BTreeTupleGetNPosting(itup));
1243  updatable[nupdatable++] = vacposting;
1244  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1245  }
1246  else
1247  {
1248  /*
1249  * All table TIDs from the posting list must be
1250  * deleted. We'll delete the index tuple completely
1251  * (no update required).
1252  */
1253  Assert(nremaining == 0);
1254  deletable[ndeletable++] = offnum;
1255  nhtidsdead += BTreeTupleGetNPosting(itup);
1256  pfree(vacposting);
1257  }
1258 
1259  nhtidslive += nremaining;
1260  }
1261  }
1262  }
1263 
1264  /*
1265  * Apply any needed deletes or updates. We issue just one
1266  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1267  */
1268  if (ndeletable > 0 || nupdatable > 0)
1269  {
1270  Assert(nhtidsdead >= ndeletable + nupdatable);
1271  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1272  nupdatable);
1273 
1274  stats->tuples_removed += nhtidsdead;
1275  /* must recompute maxoff */
1276  maxoff = PageGetMaxOffsetNumber(page);
1277 
1278  /* can't leak memory here */
1279  for (int i = 0; i < nupdatable; i++)
1280  pfree(updatable[i]);
1281  }
1282  else
1283  {
1284  /*
1285  * If the leaf page has been split during this vacuum cycle, it
1286  * seems worth expending a write to clear btpo_cycleid even if we
1287  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1288  * takes care of this.) This ensures we won't process the page
1289  * again.
1290  *
1291  * We treat this like a hint-bit update because there's no need to
1292  * WAL-log it.
1293  */
1294  Assert(nhtidsdead == 0);
1295  if (vstate->cycleid != 0 &&
1296  opaque->btpo_cycleid == vstate->cycleid)
1297  {
1298  opaque->btpo_cycleid = 0;
1299  MarkBufferDirtyHint(buf, true);
1300  }
1301  }
1302 
1303  /*
1304  * If the leaf page is now empty, try to delete it; else count the
1305  * live tuples (live table TIDs in posting lists are counted as
1306  * separate live tuples). We don't delete when backtracking, though,
1307  * since that would require teaching _bt_pagedel() about backtracking
1308  * (doesn't seem worth adding more complexity to deal with that).
1309  *
1310  * We don't count the number of live TIDs during cleanup-only calls to
1311  * btvacuumscan (i.e. when callback is not set). We count the number
1312  * of index tuples directly instead. This avoids the expense of
1313  * directly examining all of the tuples on each page. VACUUM will
1314  * treat num_index_tuples as an estimate in cleanup-only case, so it
1315  * doesn't matter that this underestimates num_index_tuples
1316  * significantly in some cases.
1317  */
1318  if (minoff > maxoff)
1319  attempt_pagedel = (blkno == scanblkno);
1320  else if (callback)
1321  stats->num_index_tuples += nhtidslive;
1322  else
1323  stats->num_index_tuples += maxoff - minoff + 1;
1324 
1325  Assert(!attempt_pagedel || nhtidslive == 0);
1326  }
1327 
1328  if (attempt_pagedel)
1329  {
1330  MemoryContext oldcontext;
1331 
1332  /* Run pagedel in a temp context to avoid memory leakage */
1333  MemoryContextReset(vstate->pagedelcontext);
1334  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1335 
1336  /*
1337  * _bt_pagedel maintains the bulk delete stats on our behalf;
1338  * pages_newly_deleted and pages_deleted are likely to be incremented
1339  * during call
1340  */
1341  Assert(blkno == scanblkno);
1342  _bt_pagedel(rel, buf, vstate);
1343 
1344  MemoryContextSwitchTo(oldcontext);
1345  /* pagedel released buffer, so we shouldn't */
1346  }
1347  else
1348  _bt_relbuf(rel, buf);
1349 
1350  if (backtrack_to != P_NONE)
1351  {
1352  blkno = backtrack_to;
1353  goto backtrack;
1354  }
1355 }
1356 
1357 /*
1358  * btreevacuumposting --- determine TIDs still needed in posting list
1359  *
1360  * Returns metadata describing how to build replacement tuple without the TIDs
1361  * that VACUUM needs to delete. Returned value is NULL in the common case
1362  * where no changes are needed to caller's posting list tuple (we avoid
1363  * allocating memory here as an optimization).
1364  *
1365  * The number of TIDs that should remain in the posting list tuple is set for
1366  * caller in *nremaining.
1367  */
1368 static BTVacuumPosting
1369 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1370  OffsetNumber updatedoffset, int *nremaining)
1371 {
1372  int live = 0;
1373  int nitem = BTreeTupleGetNPosting(posting);
1374  ItemPointer items = BTreeTupleGetPosting(posting);
1375  BTVacuumPosting vacposting = NULL;
1376 
1377  for (int i = 0; i < nitem; i++)
1378  {
1379  if (!vstate->callback(items + i, vstate->callback_state))
1380  {
1381  /* Live table TID */
1382  live++;
1383  }
1384  else if (vacposting == NULL)
1385  {
1386  /*
1387  * First dead table TID encountered.
1388  *
1389  * It's now clear that we need to delete one or more dead table
1390  * TIDs, so start maintaining metadata describing how to update
1391  * existing posting list tuple.
1392  */
1393  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1394  nitem * sizeof(uint16));
1395 
1396  vacposting->itup = posting;
1397  vacposting->updatedoffset = updatedoffset;
1398  vacposting->ndeletedtids = 0;
1399  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1400  }
1401  else
1402  {
1403  /* Second or subsequent dead table TID */
1404  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1405  }
1406  }
1407 
1408  *nremaining = live;
1409  return vacposting;
1410 }
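/*
 * Example: given a posting list tuple holding five heap TIDs of which the
 * callback reports two as dead, btreevacuumposting() returns a
 * BTVacuumPosting with ndeletedtids = 2 (recording those TIDs' positions in
 * deletetids[]) and sets *nremaining = 3; _bt_delitems_vacuum() later
 * rewrites the tuple so that only the three surviving TIDs remain.  If all
 * five TIDs were dead, *nremaining would be 0 and btvacuumpage() would
 * delete the whole index tuple instead.
 */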
1411 
1412 /*
1413  * btcanreturn() -- Check whether btree indexes support index-only scans.
1414  *
1415  * btrees always do, so this is trivial.
1416  */
1417 bool
1418 btcanreturn(Relation index, int attno)
1419 {
1420  return true;
1421 }