PostgreSQL Source Code (git master)
nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "pgstat.h"
31 #include "postmaster/autovacuum.h"
32 #include "storage/condition_variable.h"
33 #include "storage/indexfsm.h"
34 #include "storage/ipc.h"
35 #include "storage/lmgr.h"
36 #include "storage/smgr.h"
37 #include "utils/builtins.h"
38 #include "utils/index_selfuncs.h"
39 #include "utils/memutils.h"
40 
41 
42 /*
43  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44  *
45  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
46  * a new page; others must wait.
47  *
48  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
49  * to a new page; some process can start doing that.
50  *
51  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
52  * We reach this state once for every distinct combination of array keys.
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_ADVANCING,
58  BTPARALLEL_IDLE,
59  BTPARALLEL_DONE
60 } BTPS_State;
61 
62 /*
63  * BTParallelScanDescData contains btree specific shared information required
64  * for parallel scan.
65  */
66 typedef struct BTParallelScanDescData
67 {
68  BlockNumber btps_scanPage; /* latest or next page to be scanned */
69  BTPS_State btps_pageStatus; /* indicates whether next page is
70  * available for scan. see above for
71  * possible states of parallel scan. */
72  int btps_arrayKeyCount; /* count indicating number of array scan
73  * keys processed by parallel scan */
74  slock_t btps_mutex; /* protects above variables */
75  ConditionVariable btps_cv; /* used to synchronize parallel scan */
76 } BTParallelScanDescData;
77 
78 typedef struct BTParallelScanDescData *BTParallelScanDesc;
79 
80 
81 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
82  IndexBulkDeleteCallback callback, void *callback_state,
83  BTCycleId cycleid);
84 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
85 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
86  IndexTuple posting,
87  OffsetNumber updatedoffset,
88  int *nremaining);
89 
90 
91 /*
92  * Btree handler function: return IndexAmRoutine with access method parameters
93  * and callbacks.
94  */
95 Datum
96 bthandler(PG_FUNCTION_ARGS)
97 {
98  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
99 
100  amroutine->amstrategies = BTMaxStrategyNumber;
101  amroutine->amsupport = BTNProcs;
102  amroutine->amoptsprocnum = BTOPTIONS_PROC;
103  amroutine->amcanorder = true;
104  amroutine->amcanorderbyop = false;
105  amroutine->amcanbackward = true;
106  amroutine->amcanunique = true;
107  amroutine->amcanmulticol = true;
108  amroutine->amoptionalkey = true;
109  amroutine->amsearcharray = true;
110  amroutine->amsearchnulls = true;
111  amroutine->amstorage = false;
112  amroutine->amclusterable = true;
113  amroutine->ampredlocks = true;
114  amroutine->amcanparallel = true;
115  amroutine->amcaninclude = true;
116  amroutine->amusemaintenanceworkmem = false;
117  amroutine->amsummarizing = false;
118  amroutine->amparallelvacuumoptions =
119  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
120  amroutine->amkeytype = InvalidOid;
121 
122  amroutine->ambuild = btbuild;
123  amroutine->ambuildempty = btbuildempty;
124  amroutine->aminsert = btinsert;
125  amroutine->ambulkdelete = btbulkdelete;
126  amroutine->amvacuumcleanup = btvacuumcleanup;
127  amroutine->amcanreturn = btcanreturn;
128  amroutine->amcostestimate = btcostestimate;
129  amroutine->amoptions = btoptions;
130  amroutine->amproperty = btproperty;
131  amroutine->ambuildphasename = btbuildphasename;
132  amroutine->amvalidate = btvalidate;
133  amroutine->amadjustmembers = btadjustmembers;
134  amroutine->ambeginscan = btbeginscan;
135  amroutine->amrescan = btrescan;
136  amroutine->amgettuple = btgettuple;
137  amroutine->amgetbitmap = btgetbitmap;
138  amroutine->amendscan = btendscan;
139  amroutine->ammarkpos = btmarkpos;
140  amroutine->amrestrpos = btrestrpos;
141  amroutine->amestimateparallelscan = btestimateparallelscan;
142  amroutine->aminitparallelscan = btinitparallelscan;
143  amroutine->amparallelrescan = btparallelrescan;
144 
145  PG_RETURN_POINTER(amroutine);
146 }
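/*
 * Editor's illustration (not part of nbtree.c): how callers typically
 * consume bthandler().  The access method machinery invokes the handler
 * function by OID and treats the returned IndexAmRoutine as a table of
 * callbacks; "btree_handler_oid" below is a hypothetical variable holding
 * pg_am.amhandler for the btree AM.  A minimal sketch, not core code.
 */
static void
sketch_inspect_btree_am(Oid btree_handler_oid)
{
	IndexAmRoutine *amroutine;

	/* Call the handler; it returns a palloc'd, filled-in IndexAmRoutine. */
	amroutine = (IndexAmRoutine *)
		DatumGetPointer(OidFunctionCall0(btree_handler_oid));

	/* The capability flags set above can now be tested by planner/executor. */
	Assert(amroutine->amcanorder && amroutine->amcanbackward);
	Assert(amroutine->aminsert == btinsert);
}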
147 
148 /*
149  * btbuildempty() -- build an empty btree index in the initialization fork
150  */
151 void
152 btbuildempty(Relation index)
153 {
154  Page metapage;
155 
156  /* Construct metapage. */
157  metapage = (Page) palloc(BLCKSZ);
158  _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
159 
160  /*
161  * Write the page and log it. It might seem that an immediate sync would
162  * be sufficient to guarantee that the file exists on disk, but recovery
163  * itself might remove it while replaying, for example, an
164  * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record. Therefore, we need
165  * this even when wal_level=minimal.
166  */
167  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
168  smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
169  metapage, true);
170  log_newpage(&RelationGetSmgr(index)->smgr_rlocator.locator, INIT_FORKNUM,
171  BTREE_METAPAGE, metapage, true);
172 
173  /*
174  * An immediate sync is required even if we xlog'd the page, because the
175  * write did not go through shared_buffers and therefore a concurrent
176  * checkpoint may have moved the redo pointer past our xlog record.
177  */
178  smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
179 }
180 
181 /*
182  * btinsert() -- insert an index tuple into a btree.
183  *
184  * Descend the tree recursively, find the appropriate location for our
185  * new tuple, and put it there.
186  */
187 bool
188 btinsert(Relation rel, Datum *values, bool *isnull,
189  ItemPointer ht_ctid, Relation heapRel,
190  IndexUniqueCheck checkUnique,
191  bool indexUnchanged,
192  IndexInfo *indexInfo)
193 {
194  bool result;
195  IndexTuple itup;
196 
197  /* generate an index tuple */
198  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
199  itup->t_tid = *ht_ctid;
200 
201  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
202 
203  pfree(itup);
204 
205  return result;
206 }
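/*
 * Editor's illustration (not part of nbtree.c): btinsert() is normally
 * reached through the generic index_insert() wrapper, e.g. from
 * ExecInsertIndexTuples().  The values/isnull arrays and IndexInfo are
 * assumed to have been prepared by the caller (for instance with
 * FormIndexDatum()); a minimal sketch only.
 */
static void
sketch_insert_one_entry(Relation heapRel, Relation indexRel,
						Datum *values, bool *isnull,
						ItemPointer heaptid, IndexInfo *indexInfo)
{
	/* For a unique btree index, enforce uniqueness at insert time. */
	(void) index_insert(indexRel, values, isnull, heaptid, heapRel,
						UNIQUE_CHECK_YES, false /* indexUnchanged */ ,
						indexInfo);
}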
207 
208 /*
209  * btgettuple() -- Get the next tuple in the scan.
210  */
211 bool
212 btgettuple(IndexScanDesc scan, ScanDirection dir)
213 {
214  BTScanOpaque so = (BTScanOpaque) scan->opaque;
215  bool res;
216 
217  /* btree indexes are never lossy */
218  scan->xs_recheck = false;
219 
220  /*
221  * If we have any array keys, initialize them during first call for a
222  * scan. We can't do this in btrescan because we don't know the scan
223  * direction at that time.
224  */
225  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
226  {
227  /* punt if we have any unsatisfiable array keys */
228  if (so->numArrayKeys < 0)
229  return false;
230 
231  _bt_start_array_keys(scan, dir);
232  }
233 
234  /* This loop handles advancing to the next array elements, if any */
235  do
236  {
237  /*
238  * If we've already initialized this scan, we can just advance it in
239  * the appropriate direction. If we haven't done so yet, we call
240  * _bt_first() to get the first item in the scan.
241  */
242  if (!BTScanPosIsValid(so->currPos))
243  res = _bt_first(scan, dir);
244  else
245  {
246  /*
247  * Check to see if we should kill the previously-fetched tuple.
248  */
249  if (scan->kill_prior_tuple)
250  {
251  /*
252  * Yes, remember it for later. (We'll deal with all such
253  * tuples at once right before leaving the index page.) The
254  * test for numKilled overrun is not just paranoia: if the
255  * caller reverses direction in the indexscan then the same
256  * item might get entered multiple times. It's not worth
257  * trying to optimize that, so we don't detect it, but instead
258  * just forget any excess entries.
259  */
260  if (so->killedItems == NULL)
261  so->killedItems = (int *)
262  palloc(MaxTIDsPerBTreePage * sizeof(int));
263  if (so->numKilled < MaxTIDsPerBTreePage)
264  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
265  }
266 
267  /*
268  * Now continue the scan.
269  */
270  res = _bt_next(scan, dir);
271  }
272 
273  /* If we have a tuple, return it ... */
274  if (res)
275  break;
276  /* ... otherwise see if we have more array keys to deal with */
277  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
278 
279  return res;
280 }
281 
282 /*
283  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
284  */
285 int64
286 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
287 {
288  BTScanOpaque so = (BTScanOpaque) scan->opaque;
289  int64 ntids = 0;
290  ItemPointer heapTid;
291 
292  /*
293  * If we have any array keys, initialize them.
294  */
295  if (so->numArrayKeys)
296  {
297  /* punt if we have any unsatisfiable array keys */
298  if (so->numArrayKeys < 0)
299  return ntids;
300 
301  _bt_start_array_keys(scan, ForwardScanDirection);
302  }
303 
304  /* This loop handles advancing to the next array elements, if any */
305  do
306  {
307  /* Fetch the first page & tuple */
308  if (_bt_first(scan, ForwardScanDirection))
309  {
310  /* Save tuple ID, and continue scanning */
311  heapTid = &scan->xs_heaptid;
312  tbm_add_tuples(tbm, heapTid, 1, false);
313  ntids++;
314 
315  for (;;)
316  {
317  /*
318  * Advance to next tuple within page. This is the same as the
319  * easy case in _bt_next().
320  */
321  if (++so->currPos.itemIndex > so->currPos.lastItem)
322  {
323  /* let _bt_next do the heavy lifting */
324  if (!_bt_next(scan, ForwardScanDirection))
325  break;
326  }
327 
328  /* Save tuple ID, and continue scanning */
329  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
330  tbm_add_tuples(tbm, heapTid, 1, false);
331  ntids++;
332  }
333  }
334  /* Now see if we have more array keys to deal with */
335  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
336 
337  return ntids;
338 }
339 
340 /*
341  * btbeginscan() -- start a scan on a btree index
342  */
343 IndexScanDesc
344 btbeginscan(Relation rel, int nkeys, int norderbys)
345 {
346  IndexScanDesc scan;
347  BTScanOpaque so;
348 
349  /* no order by operators allowed */
350  Assert(norderbys == 0);
351 
352  /* get the scan */
353  scan = RelationGetIndexScan(rel, nkeys, norderbys);
354 
355  /* allocate private workspace */
356  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
357  BTScanPosInvalidate(so->currPos);
358  BTScanPosInvalidate(so->markPos);
359  if (scan->numberOfKeys > 0)
360  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
361  else
362  so->keyData = NULL;
363 
364  so->arrayKeyData = NULL; /* assume no array keys for now */
365  so->numArrayKeys = 0;
366  so->arrayKeys = NULL;
367  so->arrayContext = NULL;
368 
369  so->killedItems = NULL; /* until needed */
370  so->numKilled = 0;
371 
372  /*
373  * We don't know yet whether the scan will be index-only, so we do not
374  * allocate the tuple workspace arrays until btrescan. However, we set up
375  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
376  */
377  so->currTuples = so->markTuples = NULL;
378 
379  scan->xs_itupdesc = RelationGetDescr(rel);
380 
381  scan->opaque = so;
382 
383  return scan;
384 }
385 
386 /*
387  * btrescan() -- rescan an index relation
388  */
389 void
390 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
391  ScanKey orderbys, int norderbys)
392 {
393  BTScanOpaque so = (BTScanOpaque) scan->opaque;
394 
395  /* we aren't holding any read locks, but gotta drop the pins */
396  if (BTScanPosIsValid(so->currPos))
397  {
398  /* Before leaving current page, deal with any killed items */
399  if (so->numKilled > 0)
400  _bt_killitems(scan);
401  BTScanPosUnpinIfPinned(so->currPos);
402  BTScanPosInvalidate(so->currPos);
403  }
404 
405  so->markItemIndex = -1;
406  so->arrayKeyCount = 0;
407  BTScanPosUnpinIfPinned(so->markPos);
408  BTScanPosInvalidate(so->markPos);
409 
410  /*
411  * Allocate tuple workspace arrays, if needed for an index-only scan and
412  * not already done in a previous rescan call. To save on palloc
413  * overhead, both workspaces are allocated as one palloc block; only this
414  * function and btendscan know that.
415  *
416  * NOTE: this data structure also makes it safe to return data from a
417  * "name" column, even though btree name_ops uses an underlying storage
418  * datatype of cstring. The risk there is that "name" is supposed to be
419  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
420  * However, since we only return data out of tuples sitting in the
421  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
422  * data out of the markTuples array --- running off the end of memory for
423  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
424  * adding special-case treatment for name_ops elsewhere.
425  */
426  if (scan->xs_want_itup && so->currTuples == NULL)
427  {
428  so->currTuples = (char *) palloc(BLCKSZ * 2);
429  so->markTuples = so->currTuples + BLCKSZ;
430  }
431 
432  /*
433  * Reset the scan keys
434  */
435  if (scankey && scan->numberOfKeys > 0)
436  memmove(scan->keyData,
437  scankey,
438  scan->numberOfKeys * sizeof(ScanKeyData));
439  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
440 
441  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
442  _bt_preprocess_array_keys(scan);
443 }
444 
445 /*
446  * btendscan() -- close down a scan
447  */
448 void
449 btendscan(IndexScanDesc scan)
450 {
451  BTScanOpaque so = (BTScanOpaque) scan->opaque;
452 
453  /* we aren't holding any read locks, but gotta drop the pins */
454  if (BTScanPosIsValid(so->currPos))
455  {
456  /* Before leaving current page, deal with any killed items */
457  if (so->numKilled > 0)
458  _bt_killitems(scan);
459  BTScanPosUnpinIfPinned(so->currPos);
460  }
461 
462  so->markItemIndex = -1;
463  BTScanPosUnpinIfPinned(so->markPos);
464 
465  /* No need to invalidate positions, the RAM is about to be freed. */
466 
467  /* Release storage */
468  if (so->keyData != NULL)
469  pfree(so->keyData);
470  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
471  if (so->arrayContext != NULL)
472  MemoryContextDelete(so->arrayContext);
473  if (so->killedItems != NULL)
474  pfree(so->killedItems);
475  if (so->currTuples != NULL)
476  pfree(so->currTuples);
477  /* so->markTuples should not be pfree'd, see btrescan */
478  pfree(so);
479 }
480 
481 /*
482  * btmarkpos() -- save current scan position
483  */
484 void
485 btmarkpos(IndexScanDesc scan)
486 {
487  BTScanOpaque so = (BTScanOpaque) scan->opaque;
488 
489  /* There may be an old mark with a pin (but no lock). */
490  BTScanPosUnpinIfPinned(so->markPos);
491 
492  /*
493  * Just record the current itemIndex. If we later step to next page
494  * before releasing the marked position, _bt_steppage makes a full copy of
495  * the currPos struct in markPos. If (as often happens) the mark is moved
496  * before we leave the page, we don't have to do that work.
497  */
498  if (BTScanPosIsValid(so->currPos))
499  so->markItemIndex = so->currPos.itemIndex;
500  else
501  {
502  BTScanPosInvalidate(so->markPos);
503  so->markItemIndex = -1;
504  }
505 
506  /* Also record the current positions of any array keys */
507  if (so->numArrayKeys)
508  _bt_mark_array_keys(scan);
509 }
510 
511 /*
512  * btrestrpos() -- restore scan to last saved position
513  */
514 void
515 btrestrpos(IndexScanDesc scan)
516 {
517  BTScanOpaque so = (BTScanOpaque) scan->opaque;
518 
519  /* Restore the marked positions of any array keys */
520  if (so->numArrayKeys)
521  _bt_restore_array_keys(scan);
522 
523  if (so->markItemIndex >= 0)
524  {
525  /*
526  * The scan has never moved to a new page since the last mark. Just
527  * restore the itemIndex.
528  *
529  * NB: In this case we can't count on anything in so->markPos to be
530  * accurate.
531  */
532  so->currPos.itemIndex = so->markItemIndex;
533  }
534  else
535  {
536  /*
537  * The scan moved to a new page after last mark or restore, and we are
538  * now restoring to the marked page. We aren't holding any read
539  * locks, but if we're still holding the pin for the current position,
540  * we must drop it.
541  */
542  if (BTScanPosIsValid(so->currPos))
543  {
544  /* Before leaving current page, deal with any killed items */
545  if (so->numKilled > 0)
546  _bt_killitems(scan);
547  BTScanPosUnpinIfPinned(so->currPos);
548  }
549 
550  if (BTScanPosIsValid(so->markPos))
551  {
552  /* bump pin on mark buffer for assignment to current buffer */
553  if (BTScanPosIsPinned(so->markPos))
554  IncrBufferRefCount(so->markPos.buf);
555  memcpy(&so->currPos, &so->markPos,
556  offsetof(BTScanPosData, items[1]) +
557  so->markPos.lastItem * sizeof(BTScanPosItem));
558  if (so->currTuples)
559  memcpy(so->currTuples, so->markTuples,
560  so->markPos.nextTupleOffset);
561  }
562  else
563  BTScanPosInvalidate(so->currPos);
564  }
565 }
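/*
 * Editor's illustration (not part of nbtree.c): btmarkpos()/btrestrpos()
 * are reached through the generic index_markpos()/index_restrpos()
 * wrappers, e.g. by merge join.  The relations, snapshot, and scan key are
 * assumed to be set up by the caller; a minimal sketch only.
 */
static void
sketch_mark_and_restore(Relation heapRel, Relation indexRel,
						Snapshot snapshot, ScanKey skey)
{
	IndexScanDesc scan;

	scan = index_beginscan(heapRel, indexRel, snapshot, 1, 0);
	index_rescan(scan, skey, 1, NULL, 0);

	/* Fetch one match and remember the position just after it... */
	(void) index_getnext_tid(scan, ForwardScanDirection);
	index_markpos(scan);		/* ends up in btmarkpos() */

	/* ...scan further, then jump back to the remembered position. */
	(void) index_getnext_tid(scan, ForwardScanDirection);
	index_restrpos(scan);		/* ends up in btrestrpos() */

	index_endscan(scan);
}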
566 
567 /*
568  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
569  */
570 Size
571 btestimateparallelscan(void)
572 {
573  return sizeof(BTParallelScanDescData);
574 }
575 
576 /*
577  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
578  */
579 void
580 btinitparallelscan(void *target)
581 {
582  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
583 
584  SpinLockInit(&bt_target->btps_mutex);
585  bt_target->btps_scanPage = InvalidBlockNumber;
586  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
587  bt_target->btps_arrayKeyCount = 0;
588  ConditionVariableInit(&bt_target->btps_cv);
589 }
590 
591 /*
592  * btparallelrescan() -- reset parallel scan
593  */
594 void
595 btparallelrescan(IndexScanDesc scan)
596 {
597  BTParallelScanDesc btscan;
598  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
599 
600  Assert(parallel_scan);
601 
602  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
603  parallel_scan->ps_offset);
604 
605  /*
606  * In theory, we don't need to acquire the spinlock here, because there
607  * shouldn't be any other workers running at this point, but we do so for
608  * consistency.
609  */
610  SpinLockAcquire(&btscan->btps_mutex);
611  btscan->btps_scanPage = InvalidBlockNumber;
612  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
613  btscan->btps_arrayKeyCount = 0;
614  SpinLockRelease(&btscan->btps_mutex);
615 }
616 
617 /*
618  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
619  * page. Other scans must wait until we call _bt_parallel_release()
620  * or _bt_parallel_done().
621  *
622  * The return value is true if we successfully seized the scan and false
623  * if we did not. The latter case occurs if no pages remain for the current
624  * set of scankeys.
625  *
626  * If the return value is true, *pageno returns the next or current page
627  * of the scan (depending on the scan direction). An invalid block number
628  * means the scan hasn't yet started, and P_NONE means we've reached the end.
629  * The first time a participating process reaches the last page, it will return
630  * true and set *pageno to P_NONE; after that, further attempts to seize the
631  * scan will return false.
632  *
633  * Callers should ignore the value of pageno if the return value is false.
634  */
635 bool
636 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
637 {
638  BTScanOpaque so = (BTScanOpaque) scan->opaque;
639  BTPS_State pageStatus;
640  bool exit_loop = false;
641  bool status = true;
642  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
643  BTParallelScanDesc btscan;
644 
645  *pageno = P_NONE;
646 
647  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
648  parallel_scan->ps_offset);
649 
650  while (1)
651  {
652  SpinLockAcquire(&btscan->btps_mutex);
653  pageStatus = btscan->btps_pageStatus;
654 
655  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
656  {
657  /* Parallel scan has already advanced to a new set of scankeys. */
658  status = false;
659  }
660  else if (pageStatus == BTPARALLEL_DONE)
661  {
662  /*
663  * We're done with this set of scankeys. This may be the end, or
664  * there could be more sets to try.
665  */
666  status = false;
667  }
668  else if (pageStatus != BTPARALLEL_ADVANCING)
669  {
670  /*
671  * We have successfully seized control of the scan for the purpose
672  * of advancing it to a new page!
673  */
674  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
675  *pageno = btscan->btps_scanPage;
676  exit_loop = true;
677  }
678  SpinLockRelease(&btscan->btps_mutex);
679  if (exit_loop || !status)
680  break;
681  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
682  }
683  ConditionVariableCancelSleep();
684 
685  return status;
686 }
687 
688 /*
689  * _bt_parallel_release() -- Complete the process of advancing the scan to a
690  * new page. We now have the new value btps_scanPage; some other backend
691  * can now begin advancing the scan.
692  */
693 void
694 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
695 {
696  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
697  BTParallelScanDesc btscan;
698 
699  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
700  parallel_scan->ps_offset);
701 
702  SpinLockAcquire(&btscan->btps_mutex);
703  btscan->btps_scanPage = scan_page;
704  btscan->btps_pageStatus = BTPARALLEL_IDLE;
705  SpinLockRelease(&btscan->btps_mutex);
706  ConditionVariableSignal(&btscan->btps_cv);
707 }
708 
709 /*
710  * _bt_parallel_done() -- Mark the parallel scan as complete.
711  *
712  * When there are no pages left to scan, this function should be called to
713  * notify other workers. Otherwise, they might wait forever for the scan to
714  * advance to the next page.
715  */
716 void
717 _bt_parallel_done(IndexScanDesc scan)
718 {
719  BTScanOpaque so = (BTScanOpaque) scan->opaque;
720  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
721  BTParallelScanDesc btscan;
722  bool status_changed = false;
723 
724  /* Do nothing, for non-parallel scans */
725  if (parallel_scan == NULL)
726  return;
727 
728  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
729  parallel_scan->ps_offset);
730 
731  /*
732  * Mark the parallel scan as done for this combination of scan keys,
733  * unless some other process already did so. See also
734  * _bt_advance_array_keys.
735  */
736  SpinLockAcquire(&btscan->btps_mutex);
737  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
738  btscan->btps_pageStatus != BTPARALLEL_DONE)
739  {
740  btscan->btps_pageStatus = BTPARALLEL_DONE;
741  status_changed = true;
742  }
743  SpinLockRelease(&btscan->btps_mutex);
744 
745  /* wake up all the workers associated with this parallel scan */
746  if (status_changed)
747  ConditionVariableBroadcast(&btscan->btps_cv);
748 }
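/*
 * Editor's illustration (not part of nbtree.c): the seize/release/done
 * protocol described above, as one scan participant might drive it.  The
 * real logic lives in _bt_first()/_bt_readnextpage(); read_one_page() is a
 * hypothetical helper that reads the given block and returns the right
 * sibling to visit next (or P_NONE at the end of the leaf level).
 */
static void
sketch_parallel_scan_participant(IndexScanDesc scan)
{
	BlockNumber blkno;

	while (_bt_parallel_seize(scan, &blkno))
	{
		BlockNumber next_blkno;

		if (blkno == P_NONE)
		{
			/* We were first to see the end; notify the other workers. */
			_bt_parallel_done(scan);
			break;
		}

		/*
		 * Read "blkno", note its right sibling, and hand the scan back so
		 * that other participants can advance past it while we process the
		 * matching tuples we found.
		 */
		next_blkno = read_one_page(scan, blkno);	/* hypothetical */
		_bt_parallel_release(scan, next_blkno);
	}
}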
749 
750 /*
751  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
752  * keys.
753  *
754  * Updates the count of array keys processed for both local and parallel
755  * scans.
756  */
757 void
758 _bt_parallel_advance_array_keys(IndexScanDesc scan)
759 {
760  BTScanOpaque so = (BTScanOpaque) scan->opaque;
761  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
762  BTParallelScanDesc btscan;
763 
764  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
765  parallel_scan->ps_offset);
766 
767  so->arrayKeyCount++;
768  SpinLockAcquire(&btscan->btps_mutex);
769  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
770  {
771  btscan->btps_scanPage = InvalidBlockNumber;
772  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
773  btscan->btps_arrayKeyCount++;
774  }
775  SpinLockRelease(&btscan->btps_mutex);
776 }
777 
778 /*
779  * Bulk deletion of all index entries pointing to a set of heap tuples.
780  * The set of target tuples is specified via a callback routine that tells
781  * whether any given heap tuple (identified by ItemPointer) is being deleted.
782  *
783  * Result: a palloc'd struct containing statistical info for VACUUM displays.
784  */
785 IndexBulkDeleteResult *
786 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
787  IndexBulkDeleteCallback callback, void *callback_state)
788 {
789  Relation rel = info->index;
790  BTCycleId cycleid;
791 
792  /* allocate stats if first time through, else re-use existing struct */
793  if (stats == NULL)
794  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
795 
796  /* Establish the vacuum cycle ID to use for this scan */
797  /* The ENSURE stuff ensures we clean up shared memory on failure */
798  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
799  {
800  cycleid = _bt_start_vacuum(rel);
801 
802  btvacuumscan(info, stats, callback, callback_state, cycleid);
803  }
804  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
805  _bt_end_vacuum(rel);
806 
807  return stats;
808 }
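/*
 * Editor's illustration (not part of nbtree.c): the shape of the
 * IndexBulkDeleteCallback that btbulkdelete() hands down to btvacuumscan().
 * In core this callback is supplied by VACUUM's heap-side code; the
 * SketchDeadTidState struct below is hypothetical and simply wraps an array
 * of dead heap TIDs.
 */
typedef struct SketchDeadTidState
{
	ItemPointerData *dead_tids;
	int			num_dead_tids;
} SketchDeadTidState;

static bool
sketch_tid_is_dead(ItemPointer itemptr, void *state)
{
	SketchDeadTidState *dead = (SketchDeadTidState *) state;

	/* Return true (delete the index entry) iff the heap TID is dead. */
	for (int i = 0; i < dead->num_dead_tids; i++)
	{
		if (ItemPointerEquals(itemptr, &dead->dead_tids[i]))
			return true;
	}
	return false;
}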
809 
810 /*
811  * Post-VACUUM cleanup.
812  *
813  * Result: a palloc'd struct containing statistical info for VACUUM displays.
814  */
815 IndexBulkDeleteResult *
816 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
817 {
818  BlockNumber num_delpages;
819 
820  /* No-op in ANALYZE ONLY mode */
821  if (info->analyze_only)
822  return stats;
823 
824  /*
825  * If btbulkdelete was called, we need not do anything (we just maintain
826  * the information used within _bt_vacuum_needs_cleanup() by calling
827  * _bt_set_cleanup_info() below).
828  *
829  * If btbulkdelete was _not_ called, then we have a choice to make: we
830  * must decide whether or not a btvacuumscan() call is needed now (i.e.
831  * whether the ongoing VACUUM operation can entirely avoid a physical scan
832  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
833  * now.
834  */
835  if (stats == NULL)
836  {
837  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
838  if (!_bt_vacuum_needs_cleanup(info->index))
839  return NULL;
840 
841  /*
842  * Since we aren't going to actually delete any leaf items, there's no
843  * need to go through all the vacuum-cycle-ID pushups here.
844  *
845  * Posting list tuples are a source of inaccuracy for cleanup-only
846  * scans. btvacuumscan() will assume that the number of index tuples
847  * from each page can be used as num_index_tuples, even though
848  * num_index_tuples is supposed to represent the number of TIDs in the
849  * index. This naive approach can underestimate the number of tuples
850  * in the index significantly.
851  *
852  * We handle the problem by making num_index_tuples an estimate in
853  * cleanup-only case.
854  */
855  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
856  btvacuumscan(info, stats, NULL, NULL, 0);
857  stats->estimated_count = true;
858  }
859 
860  /*
861  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
862  *
863  * num_delpages is the number of deleted pages now in the index that were
864  * not safe to place in the FSM to be recycled just yet. num_delpages is
865  * greater than 0 only when _bt_pagedel() actually deleted pages during
866  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
867  * have failed to place any newly deleted pages in the FSM just moments
868  * ago. (Actually, there are edge cases where recycling of the current
869  * VACUUM's newly deleted pages does not even become safe by the time the
870  * next VACUUM comes around. See nbtree/README.)
871  */
872  Assert(stats->pages_deleted >= stats->pages_free);
873  num_delpages = stats->pages_deleted - stats->pages_free;
874  _bt_set_cleanup_info(info->index, num_delpages);
875 
876  /*
877  * It's quite possible for us to be fooled by concurrent page splits into
878  * double-counting some index tuples, so disbelieve any total that exceeds
879  * the underlying heap's count ... if we know that accurately. Otherwise
880  * this might just make matters worse.
881  */
882  if (!info->estimated_count)
883  {
884  if (stats->num_index_tuples > info->num_heap_tuples)
885  stats->num_index_tuples = info->num_heap_tuples;
886  }
887 
888  return stats;
889 }
890 
891 /*
892  * btvacuumscan --- scan the index for VACUUMing purposes
893  *
894  * This combines the functions of looking for leaf tuples that are deletable
895  * according to the vacuum callback, looking for empty pages that can be
896  * deleted, and looking for old deleted pages that can be recycled. Both
897  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
898  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
899  *
900  * The caller is responsible for initially allocating/zeroing a stats struct
901  * and for obtaining a vacuum cycle ID if necessary.
902  */
903 static void
904 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
905  IndexBulkDeleteCallback callback, void *callback_state,
906  BTCycleId cycleid)
907 {
908  Relation rel = info->index;
909  BTVacState vstate;
910  BlockNumber num_pages;
911  BlockNumber scanblkno;
912  bool needLock;
913 
914  /*
915  * Reset fields that track information about the entire index now. This
916  * avoids double-counting in the case where a single VACUUM command
917  * requires multiple scans of the index.
918  *
919  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
920  * since they track information about the VACUUM command, and so must last
921  * across each call to btvacuumscan().
922  *
923  * (Note that pages_free is treated as state about the whole index, not
924  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
925  * calls are idempotent, and get repeated for the same deleted pages in
926  * some scenarios. The point for us is to track the number of recyclable
927  * pages in the index at the end of the VACUUM command.)
928  */
929  stats->num_pages = 0;
930  stats->num_index_tuples = 0;
931  stats->pages_deleted = 0;
932  stats->pages_free = 0;
933 
934  /* Set up info to pass down to btvacuumpage */
935  vstate.info = info;
936  vstate.stats = stats;
937  vstate.callback = callback;
938  vstate.callback_state = callback_state;
939  vstate.cycleid = cycleid;
940 
941  /* Create a temporary memory context to run _bt_pagedel in */
943  "_bt_pagedel",
945 
946  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
947  vstate.bufsize = 0;
948  vstate.maxbufsize = 0;
949  vstate.pendingpages = NULL;
950  vstate.npendingpages = 0;
951  /* Consider applying _bt_pendingfsm_finalize optimization */
952  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
953 
954  /*
955  * The outer loop iterates over all index pages except the metapage, in
956  * physical order (we hope the kernel will cooperate in providing
957  * read-ahead for speed). It is critical that we visit all leaf pages,
958  * including ones added after we start the scan, else we might fail to
959  * delete some deletable tuples. Hence, we must repeatedly check the
960  * relation length. We must acquire the relation-extension lock while
961  * doing so to avoid a race condition: if someone else is extending the
962  * relation, there is a window where bufmgr/smgr have created a new
963  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
964  * we manage to scan such a page here, we'll improperly assume it can be
965  * recycled. Taking the lock synchronizes things enough to prevent a
966  * problem: either num_pages won't include the new page, or _bt_getbuf
967  * already has write lock on the buffer and it will be fully initialized
968  * before we can examine it. Also, we need not worry if a page is added
969  * immediately after we look; the page splitting code already has
970  * write-lock on the left page before it adds a right page, so we must
971  * already have processed any tuples due to be moved into such a page.
972  *
973  * We can skip locking for new or temp relations, however, since no one
974  * else could be accessing them.
975  */
976  needLock = !RELATION_IS_LOCAL(rel);
977 
978  scanblkno = BTREE_METAPAGE + 1;
979  for (;;)
980  {
981  /* Get the current relation length */
982  if (needLock)
983  LockRelationForExtension(rel, ExclusiveLock);
984  num_pages = RelationGetNumberOfBlocks(rel);
985  if (needLock)
986  UnlockRelationForExtension(rel, ExclusiveLock);
987 
988  if (info->report_progress)
989  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
990  num_pages);
991 
992  /* Quit if we've scanned the whole relation */
993  if (scanblkno >= num_pages)
994  break;
995  /* Iterate over pages, then loop back to recheck length */
996  for (; scanblkno < num_pages; scanblkno++)
997  {
998  btvacuumpage(&vstate, scanblkno);
999  if (info->report_progress)
1000  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1001  scanblkno);
1002  }
1003  }
1004 
1005  /* Set statistics num_pages field to final size of index */
1006  stats->num_pages = num_pages;
1007 
1008  MemoryContextDelete(vstate.pagedelcontext);
1009 
1010  /*
1011  * If there were any calls to _bt_pagedel() during scan of the index then
1012  * see if any of the resulting pages can be placed in the FSM now. When
1013  * it's not safe we'll have to leave it up to a future VACUUM operation.
1014  *
1015  * Finally, if we placed any pages in the FSM (either just now or during
1016  * the scan), forcibly update the upper-level FSM pages to ensure that
1017  * searchers can find them.
1018  */
1019  _bt_pendingfsm_finalize(rel, &vstate);
1020  if (stats->pages_free > 0)
1021  IndexFreeSpaceMapVacuum(rel);
1022 }
1023 
1024 /*
1025  * btvacuumpage --- VACUUM one page
1026  *
1027  * This processes a single page for btvacuumscan(). In some cases we must
1028  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1029  * a previous call here. This is how we handle page splits (that happened
1030  * after our cycleid was acquired) whose right half page happened to reuse
1031  * a block that we might have processed at some point before it was
1032  * recycled (i.e. before the page split).
1033  */
1034 static void
1035 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1036 {
1037  IndexVacuumInfo *info = vstate->info;
1038  IndexBulkDeleteResult *stats = vstate->stats;
1039  IndexBulkDeleteCallback callback = vstate->callback;
1040  void *callback_state = vstate->callback_state;
1041  Relation rel = info->index;
1042  bool attempt_pagedel;
1043  BlockNumber blkno,
1044  backtrack_to;
1045  Buffer buf;
1046  Page page;
1047  BTPageOpaque opaque;
1048 
1049  blkno = scanblkno;
1050 
1051 backtrack:
1052 
1053  attempt_pagedel = false;
1054  backtrack_to = P_NONE;
1055 
1056  /* call vacuum_delay_point while not holding any buffer lock */
1057  vacuum_delay_point();
1058 
1059  /*
1060  * We can't use _bt_getbuf() here because it always applies
1061  * _bt_checkpage(), which will barf on an all-zero page. We want to
1062  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1063  * buffer access strategy.
1064  */
1065  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1066  info->strategy);
1067  _bt_lockbuf(rel, buf, BT_READ);
1068  page = BufferGetPage(buf);
1069  opaque = NULL;
1070  if (!PageIsNew(page))
1071  {
1072  _bt_checkpage(rel, buf);
1073  opaque = BTPageGetOpaque(page);
1074  }
1075 
1076  Assert(blkno <= scanblkno);
1077  if (blkno != scanblkno)
1078  {
1079  /*
1080  * We're backtracking.
1081  *
1082  * We followed a right link to a sibling leaf page (a page that
1083  * happens to be from a block located before scanblkno). The only
1084  * case we want to do anything with is a live leaf page having the
1085  * current vacuum cycle ID.
1086  *
1087  * The page had better be in a state that's consistent with what we
1088  * expect. Check for conditions that imply corruption in passing. It
1089  * can't be half-dead because only an interrupted VACUUM process can
1090  * leave pages in that state, so we'd definitely have dealt with it
1091  * back when the page was the scanblkno page (half-dead pages are
1092  * always marked fully deleted by _bt_pagedel()). This assumes that
1093  * there can be only one vacuum process running at a time.
1094  */
1095  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1096  {
1097  Assert(false);
1098  ereport(LOG,
1099  (errcode(ERRCODE_INDEX_CORRUPTED),
1100  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1101  blkno, scanblkno, RelationGetRelationName(rel))));
1102  _bt_relbuf(rel, buf);
1103  return;
1104  }
1105 
1106  /*
1107  * We may have already processed the page in an earlier call, when the
1108  * page was scanblkno. This happens when the leaf page split occurred
1109  * after the scan began, but before the right sibling page became the
1110  * scanblkno.
1111  *
1112  * Page may also have been deleted by current btvacuumpage() call,
1113  * since _bt_pagedel() sometimes deletes the right sibling page of
1114  * scanblkno in passing (it does so after we decided where to
1115  * backtrack to). We don't need to process this page as a deleted
1116  * page a second time now (in fact, it would be wrong to count it as a
1117  * deleted page in the bulk delete statistics a second time).
1118  */
1119  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1120  {
1121  /* Done with current scanblkno (and all lower split pages) */
1122  _bt_relbuf(rel, buf);
1123  return;
1124  }
1125  }
1126 
1127  if (!opaque || BTPageIsRecyclable(page))
1128  {
1129  /* Okay to recycle this page (which could be leaf or internal) */
1130  RecordFreeIndexPage(rel, blkno);
1131  stats->pages_deleted++;
1132  stats->pages_free++;
1133  }
1134  else if (P_ISDELETED(opaque))
1135  {
1136  /*
1137  * Already deleted page (which could be leaf or internal). Can't
1138  * recycle yet.
1139  */
1140  stats->pages_deleted++;
1141  }
1142  else if (P_ISHALFDEAD(opaque))
1143  {
1144  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1145  attempt_pagedel = true;
1146 
1147  /*
1148  * _bt_pagedel() will increment both pages_newly_deleted and
1149  * pages_deleted stats in all cases (barring corruption)
1150  */
1151  }
1152  else if (P_ISLEAF(opaque))
1153  {
1154  OffsetNumber deletable[MaxIndexTuplesPerPage];
1155  int ndeletable;
1156  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1157  int nupdatable;
1158  OffsetNumber offnum,
1159  minoff,
1160  maxoff;
1161  int nhtidsdead,
1162  nhtidslive;
1163 
1164  /*
1165  * Trade in the initial read lock for a full cleanup lock on this
1166  * page. We must get such a lock on every leaf page over the course
1167  * of the vacuum scan, whether or not it actually contains any
1168  * deletable tuples --- see nbtree/README.
1169  */
1170  _bt_upgradelockbufcleanup(rel, buf);
1171 
1172  /*
1173  * Check whether we need to backtrack to earlier pages. What we are
1174  * concerned about is a page split that happened since we started the
1175  * vacuum scan. If the split moved tuples on the right half of the
1176  * split (i.e. the tuples that sort high) to a block that we already
1177  * passed over, then we might have missed the tuples. We need to
1178  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1179  * or deleting scanblkno page below!)
1180  */
1181  if (vstate->cycleid != 0 &&
1182  opaque->btpo_cycleid == vstate->cycleid &&
1183  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1184  !P_RIGHTMOST(opaque) &&
1185  opaque->btpo_next < scanblkno)
1186  backtrack_to = opaque->btpo_next;
1187 
1188  ndeletable = 0;
1189  nupdatable = 0;
1190  minoff = P_FIRSTDATAKEY(opaque);
1191  maxoff = PageGetMaxOffsetNumber(page);
1192  nhtidsdead = 0;
1193  nhtidslive = 0;
1194  if (callback)
1195  {
1196  /* btbulkdelete callback tells us what to delete (or update) */
1197  for (offnum = minoff;
1198  offnum <= maxoff;
1199  offnum = OffsetNumberNext(offnum))
1200  {
1201  IndexTuple itup;
1202 
1203  itup = (IndexTuple) PageGetItem(page,
1204  PageGetItemId(page, offnum));
1205 
1206  Assert(!BTreeTupleIsPivot(itup));
1207  if (!BTreeTupleIsPosting(itup))
1208  {
1209  /* Regular tuple, standard table TID representation */
1210  if (callback(&itup->t_tid, callback_state))
1211  {
1212  deletable[ndeletable++] = offnum;
1213  nhtidsdead++;
1214  }
1215  else
1216  nhtidslive++;
1217  }
1218  else
1219  {
1220  BTVacuumPosting vacposting;
1221  int nremaining;
1222 
1223  /* Posting list tuple */
1224  vacposting = btreevacuumposting(vstate, itup, offnum,
1225  &nremaining);
1226  if (vacposting == NULL)
1227  {
1228  /*
1229  * All table TIDs from the posting tuple remain, so no
1230  * delete or update required
1231  */
1232  Assert(nremaining == BTreeTupleGetNPosting(itup));
1233  }
1234  else if (nremaining > 0)
1235  {
1236 
1237  /*
1238  * Store metadata about posting list tuple in
1239  * updatable array for entire page. Existing tuple
1240  * will be updated during the later call to
1241  * _bt_delitems_vacuum().
1242  */
1243  Assert(nremaining < BTreeTupleGetNPosting(itup));
1244  updatable[nupdatable++] = vacposting;
1245  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1246  }
1247  else
1248  {
1249  /*
1250  * All table TIDs from the posting list must be
1251  * deleted. We'll delete the index tuple completely
1252  * (no update required).
1253  */
1254  Assert(nremaining == 0);
1255  deletable[ndeletable++] = offnum;
1256  nhtidsdead += BTreeTupleGetNPosting(itup);
1257  pfree(vacposting);
1258  }
1259 
1260  nhtidslive += nremaining;
1261  }
1262  }
1263  }
1264 
1265  /*
1266  * Apply any needed deletes or updates. We issue just one
1267  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1268  */
1269  if (ndeletable > 0 || nupdatable > 0)
1270  {
1271  Assert(nhtidsdead >= ndeletable + nupdatable);
1272  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1273  nupdatable);
1274 
1275  stats->tuples_removed += nhtidsdead;
1276  /* must recompute maxoff */
1277  maxoff = PageGetMaxOffsetNumber(page);
1278 
1279  /* can't leak memory here */
1280  for (int i = 0; i < nupdatable; i++)
1281  pfree(updatable[i]);
1282  }
1283  else
1284  {
1285  /*
1286  * If the leaf page has been split during this vacuum cycle, it
1287  * seems worth expending a write to clear btpo_cycleid even if we
1288  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1289  * takes care of this.) This ensures we won't process the page
1290  * again.
1291  *
1292  * We treat this like a hint-bit update because there's no need to
1293  * WAL-log it.
1294  */
1295  Assert(nhtidsdead == 0);
1296  if (vstate->cycleid != 0 &&
1297  opaque->btpo_cycleid == vstate->cycleid)
1298  {
1299  opaque->btpo_cycleid = 0;
1300  MarkBufferDirtyHint(buf, true);
1301  }
1302  }
1303 
1304  /*
1305  * If the leaf page is now empty, try to delete it; else count the
1306  * live tuples (live table TIDs in posting lists are counted as
1307  * separate live tuples). We don't delete when backtracking, though,
1308  * since that would require teaching _bt_pagedel() about backtracking
1309  * (doesn't seem worth adding more complexity to deal with that).
1310  *
1311  * We don't count the number of live TIDs during cleanup-only calls to
1312  * btvacuumscan (i.e. when callback is not set). We count the number
1313  * of index tuples directly instead. This avoids the expense of
1314  * directly examining all of the tuples on each page. VACUUM will
1315  * treat num_index_tuples as an estimate in cleanup-only case, so it
1316  * doesn't matter that this underestimates num_index_tuples
1317  * significantly in some cases.
1318  */
1319  if (minoff > maxoff)
1320  attempt_pagedel = (blkno == scanblkno);
1321  else if (callback)
1322  stats->num_index_tuples += nhtidslive;
1323  else
1324  stats->num_index_tuples += maxoff - minoff + 1;
1325 
1326  Assert(!attempt_pagedel || nhtidslive == 0);
1327  }
1328 
1329  if (attempt_pagedel)
1330  {
1331  MemoryContext oldcontext;
1332 
1333  /* Run pagedel in a temp context to avoid memory leakage */
1334  MemoryContextReset(vstate->pagedelcontext);
1335  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1336 
1337  /*
1338  * _bt_pagedel maintains the bulk delete stats on our behalf;
1339  * pages_newly_deleted and pages_deleted are likely to be incremented
1340  * during call
1341  */
1342  Assert(blkno == scanblkno);
1343  _bt_pagedel(rel, buf, vstate);
1344 
1345  MemoryContextSwitchTo(oldcontext);
1346  /* pagedel released buffer, so we shouldn't */
1347  }
1348  else
1349  _bt_relbuf(rel, buf);
1350 
1351  if (backtrack_to != P_NONE)
1352  {
1353  blkno = backtrack_to;
1354  goto backtrack;
1355  }
1356 }
1357 
1358 /*
1359  * btreevacuumposting --- determine TIDs still needed in posting list
1360  *
1361  * Returns metadata describing how to build replacement tuple without the TIDs
1362  * that VACUUM needs to delete. Returned value is NULL in the common case
1363  * where no changes are needed to caller's posting list tuple (we avoid
1364  * allocating memory here as an optimization).
1365  *
1366  * The number of TIDs that should remain in the posting list tuple is set for
1367  * caller in *nremaining.
1368  */
1369 static BTVacuumPosting
1370 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1371  OffsetNumber updatedoffset, int *nremaining)
1372 {
1373  int live = 0;
1374  int nitem = BTreeTupleGetNPosting(posting);
1375  ItemPointer items = BTreeTupleGetPosting(posting);
1376  BTVacuumPosting vacposting = NULL;
1377 
1378  for (int i = 0; i < nitem; i++)
1379  {
1380  if (!vstate->callback(items + i, vstate->callback_state))
1381  {
1382  /* Live table TID */
1383  live++;
1384  }
1385  else if (vacposting == NULL)
1386  {
1387  /*
1388  * First dead table TID encountered.
1389  *
1390  * It's now clear that we need to delete one or more dead table
1391  * TIDs, so start maintaining metadata describing how to update
1392  * existing posting list tuple.
1393  */
1394  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1395  nitem * sizeof(uint16));
1396 
1397  vacposting->itup = posting;
1398  vacposting->updatedoffset = updatedoffset;
1399  vacposting->ndeletedtids = 0;
1400  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1401  }
1402  else
1403  {
1404  /* Second or subsequent dead table TID */
1405  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1406  }
1407  }
1408 
1409  *nremaining = live;
1410  return vacposting;
1411 }
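/*
 * Editor's illustration (not part of nbtree.c): how the metadata returned
 * by btreevacuumposting() is interpreted.  vacposting->deletetids[] holds
 * the ascending 0-based positions, within the original posting list, of the
 * TIDs that must be removed; every other TID survives.  (The replacement
 * tuple itself is built later, when _bt_delitems_vacuum() processes the
 * "updatable" array.)  The dst buffer is assumed to be large enough for the
 * surviving TIDs.
 */
static int
sketch_collect_surviving_tids(BTVacuumPosting vacposting, ItemPointerData *dst)
{
	IndexTuple	origtuple = vacposting->itup;
	ItemPointer src = BTreeTupleGetPosting(origtuple);
	int			nitem = BTreeTupleGetNPosting(origtuple);
	int			d = 0,
				nsurviving = 0;

	for (int i = 0; i < nitem; i++)
	{
		/* deletetids[] is in ascending order, so one cursor suffices */
		if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
		{
			d++;				/* this TID was reported dead; skip it */
			continue;
		}
		dst[nsurviving++] = src[i];
	}

	Assert(nsurviving == nitem - vacposting->ndeletedtids);
	return nsurviving;
}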
1412 
1413 /*
1414  * btcanreturn() -- Check whether btree indexes support index-only scans.
1415  *
1416  * btrees always do, so this is trivial.
1417  */
1418 bool
1419 btcanreturn(Relation index, int attno)
1420 {
1421  return true;
1422 }