1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/relscan.h"
23 #include "access/xloginsert.h"
24 #include "commands/progress.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "nodes/execnodes.h"
28 #include "pgstat.h"
29 #include "storage/bulk_write.h"
30 #include "storage/condition_variable.h"
31 #include "storage/indexfsm.h"
32 #include "storage/ipc.h"
33 #include "storage/lmgr.h"
34 #include "storage/smgr.h"
35 #include "utils/fmgrprotos.h"
36 #include "utils/index_selfuncs.h"
37 #include "utils/memutils.h"
38 
39 
40 /*
41  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42  *
43  * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44  * scan to advance it via another call to _bt_first.
45  *
46  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47  * a new page; others must wait.
48  *
49  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50  * to a new page; some process can start doing that.
51  *
52  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_NEED_PRIMSCAN,
58  BTPARALLEL_ADVANCING,
59  BTPARALLEL_IDLE,
60  BTPARALLEL_DONE,
61 } BTPS_State;
62 
63 /*
64  * BTParallelScanDescData contains btree specific shared information required
65  * for parallel scan.
66  */
67 typedef struct BTParallelScanDescData
68 {
69  BlockNumber btps_scanPage; /* latest or next page to be scanned */
70  BTPS_State btps_pageStatus; /* indicates whether next page is
71  * available for scan. see above for
72  * possible states of parallel scan. */
73  slock_t btps_mutex; /* protects above variables, btps_arrElems */
74  ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 
76  /*
77  * btps_arrElems is used when scans need to schedule another primitive
78  * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79  */
80  int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81 } BTParallelScanDescData;
82 
83 typedef struct BTParallelScanDescData *BTParallelScanDesc;
84 
85 
86 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87  IndexBulkDeleteCallback callback, void *callback_state,
88  BTCycleId cycleid);
89 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91  IndexTuple posting,
92  OffsetNumber updatedoffset,
93  int *nremaining);
94 
95 
96 /*
97  * Btree handler function: return IndexAmRoutine with access method parameters
98  * and callbacks.
99  */
100 Datum
101 bthandler(PG_FUNCTION_ARGS)
102 {
103  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104 
105  amroutine->amstrategies = BTMaxStrategyNumber;
106  amroutine->amsupport = BTNProcs;
107  amroutine->amoptsprocnum = BTOPTIONS_PROC;
108  amroutine->amcanorder = true;
109  amroutine->amcanorderbyop = false;
110  amroutine->amcanbackward = true;
111  amroutine->amcanunique = true;
112  amroutine->amcanmulticol = true;
113  amroutine->amoptionalkey = true;
114  amroutine->amsearcharray = true;
115  amroutine->amsearchnulls = true;
116  amroutine->amstorage = false;
117  amroutine->amclusterable = true;
118  amroutine->ampredlocks = true;
119  amroutine->amcanparallel = true;
120  amroutine->amcanbuildparallel = true;
121  amroutine->amcaninclude = true;
122  amroutine->amusemaintenanceworkmem = false;
123  amroutine->amsummarizing = false;
124  amroutine->amparallelvacuumoptions =
125  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
126  amroutine->amkeytype = InvalidOid;
127 
128  amroutine->ambuild = btbuild;
129  amroutine->ambuildempty = btbuildempty;
130  amroutine->aminsert = btinsert;
131  amroutine->aminsertcleanup = NULL;
132  amroutine->ambulkdelete = btbulkdelete;
133  amroutine->amvacuumcleanup = btvacuumcleanup;
134  amroutine->amcanreturn = btcanreturn;
135  amroutine->amcostestimate = btcostestimate;
136  amroutine->amoptions = btoptions;
137  amroutine->amproperty = btproperty;
138  amroutine->ambuildphasename = btbuildphasename;
139  amroutine->amvalidate = btvalidate;
140  amroutine->amadjustmembers = btadjustmembers;
141  amroutine->ambeginscan = btbeginscan;
142  amroutine->amrescan = btrescan;
143  amroutine->amgettuple = btgettuple;
144  amroutine->amgetbitmap = btgetbitmap;
145  amroutine->amendscan = btendscan;
146  amroutine->ammarkpos = btmarkpos;
147  amroutine->amrestrpos = btrestrpos;
148  amroutine->amestimateparallelscan = btestimateparallelscan;
149  amroutine->aminitparallelscan = btinitparallelscan;
150  amroutine->amparallelrescan = btparallelrescan;
151 
152  PG_RETURN_POINTER(amroutine);
153 }
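
As context for the handler above: core code never calls bthandler() by name. It looks up the access method's handler OID in pg_am and invokes it through the fmgr, roughly as in the simplified sketch below (the real lookup is GetIndexAmRoutine() in src/backend/access/index/amapi.c; the example_ name here is hypothetical).

/*
 * Simplified sketch (not part of nbtree.c): how a caller turns a pg_am
 * handler OID into the IndexAmRoutine that bthandler() builds above.
 */
IndexAmRoutine *
example_get_index_am_routine(Oid amhandler)
{
	Datum		datum;
	IndexAmRoutine *routine;

	/* call the handler function; for btree indexes this reaches bthandler() */
	datum = OidFunctionCall0(amhandler);
	routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index AM handler %u did not return an IndexAmRoutine", amhandler);

	return routine;
}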
154 
155 /*
156  * btbuildempty() -- build an empty btree index in the initialization fork
157  */
158 void
159 btbuildempty(Relation index)
160 {
161  bool allequalimage = _bt_allequalimage(index, false);
162  BulkWriteState *bulkstate;
163  BulkWriteBuffer metabuf;
164 
165  bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 
167  /* Construct metapage. */
168  metabuf = smgr_bulk_get_buf(bulkstate);
169  _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170  smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 
172  smgr_bulk_finish(bulkstate);
173 }
174 
175 /*
176  * btinsert() -- insert an index tuple into a btree.
177  *
178  * Descend the tree recursively, find the appropriate location for our
179  * new tuple, and put it there.
180  */
181 bool
182 btinsert(Relation rel, Datum *values, bool *isnull,
183  ItemPointer ht_ctid, Relation heapRel,
184  IndexUniqueCheck checkUnique,
185  bool indexUnchanged,
186  IndexInfo *indexInfo)
187 {
188  bool result;
189  IndexTuple itup;
190 
191  /* generate an index tuple */
192  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
193  itup->t_tid = *ht_ctid;
194 
195  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 
197  pfree(itup);
198 
199  return result;
200 }
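
For orientation, btinsert() is reached through the generic AM dispatch layer rather than being called directly. The sketch below is a simplified, illustrative rendering of that dispatch (modeled on index_insert() in indexam.c, with its sanity checks omitted; the example_ name is hypothetical).

/*
 * Simplified sketch (not part of nbtree.c): generic dispatch that ends up
 * calling btinsert() for btree indexes.
 */
bool
example_index_insert(Relation indexRelation,
					 Datum *values, bool *isnull,
					 ItemPointer heap_t_ctid, Relation heapRelation,
					 IndexUniqueCheck checkUnique, bool indexUnchanged,
					 IndexInfo *indexInfo)
{
	/* rd_indam holds the IndexAmRoutine returned by the AM's handler */
	return indexRelation->rd_indam->aminsert(indexRelation, values, isnull,
											 heap_t_ctid, heapRelation,
											 checkUnique, indexUnchanged,
											 indexInfo);
}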
201 
202 /*
203  * btgettuple() -- Get the next tuple in the scan.
204  */
205 bool
206 btgettuple(IndexScanDesc scan, ScanDirection dir)
207 {
208  BTScanOpaque so = (BTScanOpaque) scan->opaque;
209  bool res;
210 
211  /* btree indexes are never lossy */
212  scan->xs_recheck = false;
213 
214  /* Each loop iteration performs another primitive index scan */
215  do
216  {
217  /*
218  * If we've already initialized this scan, we can just advance it in
219  * the appropriate direction. If we haven't done so yet, we call
220  * _bt_first() to get the first item in the scan.
221  */
222  if (!BTScanPosIsValid(so->currPos))
223  res = _bt_first(scan, dir);
224  else
225  {
226  /*
227  * Check to see if we should kill the previously-fetched tuple.
228  */
229  if (scan->kill_prior_tuple)
230  {
231  /*
232  * Yes, remember it for later. (We'll deal with all such
233  * tuples at once right before leaving the index page.) The
234  * test for numKilled overrun is not just paranoia: if the
235  * caller reverses direction in the indexscan then the same
236  * item might get entered multiple times. It's not worth
237  * trying to optimize that, so we don't detect it, but instead
238  * just forget any excess entries.
239  */
240  if (so->killedItems == NULL)
241  so->killedItems = (int *)
242  palloc(MaxTIDsPerBTreePage * sizeof(int));
243  if (so->numKilled < MaxTIDsPerBTreePage)
244  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245  }
246 
247  /*
248  * Now continue the scan.
249  */
250  res = _bt_next(scan, dir);
251  }
252 
253  /* If we have a tuple, return it ... */
254  if (res)
255  break;
256  /* ... otherwise see if we need another primitive index scan */
257  } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 
259  return res;
260 }
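
The caller of this entry point is the generic index scan machinery. The sketch below shows, in simplified form, how an amgettuple implementation such as btgettuple() is driven (loosely based on index_getnext_tid() in indexam.c; snapshot assertions and index-only-scan fields are omitted, and the example_ name is hypothetical).

/*
 * Simplified sketch (not part of nbtree.c): driving an amgettuple
 * implementation.
 */
ItemPointer
example_index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
{
	bool		found;

	/* the AM sets scan->xs_heaptid and scan->xs_recheck on success */
	found = scan->indexRelation->rd_indam->amgettuple(scan, direction);

	/* reset kill flag; the caller sets it again before the next fetch */
	scan->kill_prior_tuple = false;

	return found ? &scan->xs_heaptid : NULL;
}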
261 
262 /*
263  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
264  */
265 int64
266 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
267 {
268  BTScanOpaque so = (BTScanOpaque) scan->opaque;
269  int64 ntids = 0;
270  ItemPointer heapTid;
271 
272  /* Each loop iteration performs another primitive index scan */
273  do
274  {
275  /* Fetch the first page & tuple */
276  if (_bt_first(scan, ForwardScanDirection))
277  {
278  /* Save tuple ID, and continue scanning */
279  heapTid = &scan->xs_heaptid;
280  tbm_add_tuples(tbm, heapTid, 1, false);
281  ntids++;
282 
283  for (;;)
284  {
285  /*
286  * Advance to next tuple within page. This is the same as the
287  * easy case in _bt_next().
288  */
289  if (++so->currPos.itemIndex > so->currPos.lastItem)
290  {
291  /* let _bt_next do the heavy lifting */
292  if (!_bt_next(scan, ForwardScanDirection))
293  break;
294  }
295 
296  /* Save tuple ID, and continue scanning */
297  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298  tbm_add_tuples(tbm, heapTid, 1, false);
299  ntids++;
300  }
301  }
302  /* Now see if we need another primitive index scan */
303  } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 
305  return ntids;
306 }
307 
308 /*
309  * btbeginscan() -- start a scan on a btree index
310  */
311 IndexScanDesc
312 btbeginscan(Relation rel, int nkeys, int norderbys)
313 {
314  IndexScanDesc scan;
315  BTScanOpaque so;
316 
317  /* no order by operators allowed */
318  Assert(norderbys == 0);
319 
320  /* get the scan */
321  scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 
323  /* allocate private workspace */
324  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
325  BTScanPosInvalidate(so->currPos);
326  BTScanPosInvalidate(so->markPos);
327  if (scan->numberOfKeys > 0)
328  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329  else
330  so->keyData = NULL;
331 
332  so->needPrimScan = false;
333  so->scanBehind = false;
334  so->arrayKeys = NULL;
335  so->orderProcs = NULL;
336  so->arrayContext = NULL;
337 
338  so->killedItems = NULL; /* until needed */
339  so->numKilled = 0;
340 
341  /*
342  * We don't know yet whether the scan will be index-only, so we do not
343  * allocate the tuple workspace arrays until btrescan. However, we set up
344  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
345  */
346  so->currTuples = so->markTuples = NULL;
347 
348  scan->xs_itupdesc = RelationGetDescr(rel);
349 
350  scan->opaque = so;
351 
352  return scan;
353 }
354 
355 /*
356  * btrescan() -- rescan an index relation
357  */
358 void
359 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
360  ScanKey orderbys, int norderbys)
361 {
362  BTScanOpaque so = (BTScanOpaque) scan->opaque;
363 
364  /* we aren't holding any read locks, but gotta drop the pins */
365  if (BTScanPosIsValid(so->currPos))
366  {
367  /* Before leaving current page, deal with any killed items */
368  if (so->numKilled > 0)
369  _bt_killitems(scan);
370  BTScanPosUnpinIfPinned(so->currPos);
371  BTScanPosInvalidate(so->currPos);
372  }
373 
374  so->markItemIndex = -1;
375  so->needPrimScan = false;
376  so->scanBehind = false;
377  BTScanPosUnpinIfPinned(so->markPos);
378  BTScanPosInvalidate(so->markPos);
379 
380  /*
381  * Allocate tuple workspace arrays, if needed for an index-only scan and
382  * not already done in a previous rescan call. To save on palloc
383  * overhead, both workspaces are allocated as one palloc block; only this
384  * function and btendscan know that.
385  *
386  * NOTE: this data structure also makes it safe to return data from a
387  * "name" column, even though btree name_ops uses an underlying storage
388  * datatype of cstring. The risk there is that "name" is supposed to be
389  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
390  * However, since we only return data out of tuples sitting in the
391  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
392  * data out of the markTuples array --- running off the end of memory for
393  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
394  * adding special-case treatment for name_ops elsewhere.
395  */
396  if (scan->xs_want_itup && so->currTuples == NULL)
397  {
398  so->currTuples = (char *) palloc(BLCKSZ * 2);
399  so->markTuples = so->currTuples + BLCKSZ;
400  }
401 
402  /*
403  * Reset the scan keys
404  */
405  if (scankey && scan->numberOfKeys > 0)
406  memmove(scan->keyData,
407  scankey,
408  scan->numberOfKeys * sizeof(ScanKeyData));
409  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
410  so->numArrayKeys = 0; /* ditto */
411 }
412 
413 /*
414  * btendscan() -- close down a scan
415  */
416 void
417 btendscan(IndexScanDesc scan)
418 {
419  BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 
421  /* we aren't holding any read locks, but gotta drop the pins */
422  if (BTScanPosIsValid(so->currPos))
423  {
424  /* Before leaving current page, deal with any killed items */
425  if (so->numKilled > 0)
426  _bt_killitems(scan);
427  BTScanPosUnpinIfPinned(so->currPos);
428  }
429 
430  so->markItemIndex = -1;
431  BTScanPosUnpinIfPinned(so->markPos);
432 
433  /* No need to invalidate positions, the RAM is about to be freed. */
434 
435  /* Release storage */
436  if (so->keyData != NULL)
437  pfree(so->keyData);
438  /* so->arrayKeys and so->orderProcs are in arrayContext */
439  if (so->arrayContext != NULL)
440  MemoryContextDelete(so->arrayContext);
441  if (so->killedItems != NULL)
442  pfree(so->killedItems);
443  if (so->currTuples != NULL)
444  pfree(so->currTuples);
445  /* so->markTuples should not be pfree'd, see btrescan */
446  pfree(so);
447 }
448 
449 /*
450  * btmarkpos() -- save current scan position
451  */
452 void
453 btmarkpos(IndexScanDesc scan)
454 {
455  BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 
457  /* There may be an old mark with a pin (but no lock). */
458  BTScanPosUnpinIfPinned(so->markPos);
459 
460  /*
461  * Just record the current itemIndex. If we later step to next page
462  * before releasing the marked position, _bt_steppage makes a full copy of
463  * the currPos struct in markPos. If (as often happens) the mark is moved
464  * before we leave the page, we don't have to do that work.
465  */
466  if (BTScanPosIsValid(so->currPos))
467  so->markItemIndex = so->currPos.itemIndex;
468  else
469  {
470  BTScanPosInvalidate(so->markPos);
471  so->markItemIndex = -1;
472  }
473 }
474 
475 /*
476  * btrestrpos() -- restore scan to last saved position
477  */
478 void
479 btrestrpos(IndexScanDesc scan)
480 {
481  BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 
483  if (so->markItemIndex >= 0)
484  {
485  /*
486  * The scan has never moved to a new page since the last mark. Just
487  * restore the itemIndex.
488  *
489  * NB: In this case we can't count on anything in so->markPos to be
490  * accurate.
491  */
492  so->currPos.itemIndex = so->markItemIndex;
493  }
494  else
495  {
496  /*
497  * The scan moved to a new page after last mark or restore, and we are
498  * now restoring to the marked page. We aren't holding any read
499  * locks, but if we're still holding the pin for the current position,
500  * we must drop it.
501  */
502  if (BTScanPosIsValid(so->currPos))
503  {
504  /* Before leaving current page, deal with any killed items */
505  if (so->numKilled > 0)
506  _bt_killitems(scan);
507  BTScanPosUnpinIfPinned(so->currPos);
508  }
509 
510  if (BTScanPosIsValid(so->markPos))
511  {
512  /* bump pin on mark buffer for assignment to current buffer */
513  if (BTScanPosIsPinned(so->markPos))
514  IncrBufferRefCount(so->markPos.buf);
515  memcpy(&so->currPos, &so->markPos,
516  offsetof(BTScanPosData, items[1]) +
517  so->markPos.lastItem * sizeof(BTScanPosItem));
518  if (so->currTuples)
519  memcpy(so->currTuples, so->markTuples,
520  so->markPos.nextTupleOffset);
521  /* Reset the scan's array keys (see _bt_steppage for why) */
522  if (so->numArrayKeys)
523  {
524  _bt_start_array_keys(scan, so->currPos.dir);
525  so->needPrimScan = false;
526  }
527  }
528  else
529  BTScanPosInvalidate(so->currPos);
530  }
531 }
532 
533 /*
534  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
535  */
536 Size
537 btestimateparallelscan(int nkeys, int norderbys)
538 {
539  /* Pessimistically assume all input scankeys will be output with arrays */
540  return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 }
542 
543 /*
544  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
545  */
546 void
547 btinitparallelscan(void *target)
548 {
549  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
550 
551  SpinLockInit(&bt_target->btps_mutex);
552  bt_target->btps_scanPage = InvalidBlockNumber;
553  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
554  ConditionVariableInit(&bt_target->btps_cv);
555 }
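
The two functions above are invoked by the generic parallel index scan setup code. The hypothetical sketch below only illustrates the sizing/initialization pairing; in the backend the shared area actually lives in the parallel query's DSM segment and is located again through ParallelIndexScanDescData.ps_offset, as btparallelrescan() below shows.

/*
 * Illustrative sketch only (hypothetical leader-side code, not part of
 * nbtree.c): size and initialize the btree-specific shared scan area.
 */
void
example_setup_btree_parallel_area(int nkeys, int norderbys)
{
	Size		size = btestimateparallelscan(nkeys, norderbys);
	void	   *area = palloc0(size);	/* stand-in for the DSM allocation */

	btinitparallelscan(area);

	/* ... hand 'area' (really its DSM offset) to the worker processes ... */
}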
556 
557 /*
558  * btparallelrescan() -- reset parallel scan
559  */
560 void
561 btparallelrescan(IndexScanDesc scan)
562 {
563  BTParallelScanDesc btscan;
564  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
565 
566  Assert(parallel_scan);
567 
568  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
569  parallel_scan->ps_offset);
570 
571  /*
572  * In theory, we don't need to acquire the spinlock here, because there
573  * shouldn't be any other workers running at this point, but we do so for
574  * consistency.
575  */
576  SpinLockAcquire(&btscan->btps_mutex);
577  btscan->btps_scanPage = InvalidBlockNumber;
578  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
579  SpinLockRelease(&btscan->btps_mutex);
580 }
581 
582 /*
583  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
584  * page. Other scans must wait until we call _bt_parallel_release()
585  * or _bt_parallel_done().
586  *
587  * The return value is true if we successfully seized the scan and false
588  * if we did not. The latter case occurs if no pages remain.
589  *
590  * If the return value is true, *pageno returns the next or current page
591  * of the scan (depending on the scan direction). An invalid block number
592  * means the scan hasn't yet started, or that caller needs to start the next
593  * primitive index scan (if it's the latter case we'll set so.needPrimScan).
594  * The first time a participating process reaches the last page, it will return
595  * true and set *pageno to P_NONE; after that, further attempts to seize the
596  * scan will return false.
597  *
598  * Callers should ignore the value of pageno if the return value is false.
599  *
600  * Callers that are in a position to start a new primitive index scan must
601  * pass first=true (all other callers pass first=false). We just return false
602  * for first=false callers that require another primitive index scan.
603  */
604 bool
605 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
606 {
607  BTScanOpaque so = (BTScanOpaque) scan->opaque;
608  bool exit_loop = false;
609  bool status = true;
610  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
611  BTParallelScanDesc btscan;
612 
613  *pageno = P_NONE;
614 
615  if (first)
616  {
617  /*
618  * Initialize array related state when called from _bt_first, assuming
619  * that this will either be the first primitive index scan for the
620  * scan, or a previous explicitly scheduled primitive scan.
621  *
622  * Note: so->needPrimScan is only set when a scheduled primitive index
623  * scan is set to be performed in caller's worker process. It should
624  * not be set here by us for the first primitive scan, nor should we
625  * ever set it for a parallel scan that has no array keys.
626  */
627  so->needPrimScan = false;
628  so->scanBehind = false;
629  }
630  else
631  {
632  /*
633  * Don't attempt to seize the scan when backend requires another
634  * primitive index scan unless we're in a position to start it now
635  */
636  if (so->needPrimScan)
637  return false;
638  }
639 
640  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
641  parallel_scan->ps_offset);
642 
643  while (1)
644  {
645  SpinLockAcquire(&btscan->btps_mutex);
646 
647  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
648  {
649  /* We're done with this parallel index scan */
650  status = false;
651  }
652  else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
653  {
654  Assert(so->numArrayKeys);
655 
656  /*
657  * If we can start another primitive scan right away, do so.
658  * Otherwise just wait.
659  */
660  if (first)
661  {
662  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
663  for (int i = 0; i < so->numArrayKeys; i++)
664  {
665  BTArrayKeyInfo *array = &so->arrayKeys[i];
666  ScanKey skey = &so->keyData[array->scan_key];
667 
668  array->cur_elem = btscan->btps_arrElems[i];
669  skey->sk_argument = array->elem_values[array->cur_elem];
670  }
671  so->needPrimScan = true;
672  so->scanBehind = false;
673  *pageno = InvalidBlockNumber;
674  exit_loop = true;
675  }
676  }
677  else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
678  {
679  /*
680  * We have successfully seized control of the scan for the purpose
681  * of advancing it to a new page!
682  */
683  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
684  *pageno = btscan->btps_scanPage;
685  exit_loop = true;
686  }
687  SpinLockRelease(&btscan->btps_mutex);
688  if (exit_loop || !status)
689  break;
690  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
691  }
692  ConditionVariableCancelSleep();
693 
694  return status;
695 }
696 
697 /*
698  * _bt_parallel_release() -- Complete the process of advancing the scan to a
699  * new page. We now have the new value btps_scanPage; some other backend
700  * can now begin advancing the scan.
701  *
702  * Callers whose scan uses array keys must save their scan_page argument so
703  * that it can be passed to _bt_parallel_primscan_schedule, should caller
704  * determine that another primitive index scan is required. If that happens,
705  * scan_page won't be scanned by any backend (unless the next primitive index
706  * scan lands on scan_page).
707  */
708 void
709 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
710 {
711  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
712  BTParallelScanDesc btscan;
713 
714  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
715  parallel_scan->ps_offset);
716 
717  SpinLockAcquire(&btscan->btps_mutex);
718  btscan->btps_scanPage = scan_page;
719  btscan->btps_pageStatus = BTPARALLEL_IDLE;
720  SpinLockRelease(&btscan->btps_mutex);
721  ConditionVariableSignal(&btscan->btps_cv);
722 }
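
To make the seize/release protocol concrete, here is a hedged sketch of the pattern a participating backend follows; it is hypothetical caller code (the example_ name and the placeholder next_blkno are not from nbtree.c), loosely mirroring how _bt_first() and the page-reading code in nbtsearch.c use these routines.

/*
 * Illustrative sketch (hypothetical caller, not part of nbtree.c): the
 * seize/advance/release pattern for a parallel btree scan.
 */
static bool
example_advance_parallel_scan(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno = InvalidBlockNumber;	/* placeholder */

	/* block until we own the right to advance the shared scan position */
	if (!_bt_parallel_seize(scan, &blkno, false))
		return false;			/* parallel scan is already finished */

	if (blkno == P_NONE)
	{
		/* no pages left: mark the scan done so other workers stop waiting */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * Read block blkno here and note its right sibling (btpo_next) as
	 * next_blkno, then immediately let another worker advance the scan
	 * while we process the tuples we just read.
	 */
	_bt_parallel_release(scan, next_blkno);

	return true;
}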
723 
724 /*
725  * _bt_parallel_done() -- Mark the parallel scan as complete.
726  *
727  * When there are no pages left to scan, this function should be called to
728  * notify other workers. Otherwise, they might wait forever for the scan to
729  * advance to the next page.
730  */
731 void
732 _bt_parallel_done(IndexScanDesc scan)
733 {
734  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
735  BTParallelScanDesc btscan;
736  bool status_changed = false;
737 
738  /* Do nothing, for non-parallel scans */
739  if (parallel_scan == NULL)
740  return;
741 
742  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
743  parallel_scan->ps_offset);
744 
745  /*
746  * Mark the parallel scan as done, unless some other process did so
747  * already
748  */
749  SpinLockAcquire(&btscan->btps_mutex);
750  if (btscan->btps_pageStatus != BTPARALLEL_DONE)
751  {
752  btscan->btps_pageStatus = BTPARALLEL_DONE;
753  status_changed = true;
754  }
755  SpinLockRelease(&btscan->btps_mutex);
756 
757  /* wake up all the workers associated with this parallel scan */
758  if (status_changed)
759  ConditionVariableBroadcast(&btscan->btps_cv);
760 }
761 
762 /*
763  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
764  *
765  * Caller passes the block number most recently passed to _bt_parallel_release
766  * by its backend. Caller successfully schedules the next primitive index scan
767  * if the shared parallel state hasn't been seized since caller's backend last
768  * advanced the scan.
769  */
770 void
771 _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
772 {
773  BTScanOpaque so = (BTScanOpaque) scan->opaque;
774  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
775  BTParallelScanDesc btscan;
776 
777  Assert(so->numArrayKeys);
778 
779  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
780  parallel_scan->ps_offset);
781 
782  SpinLockAcquire(&btscan->btps_mutex);
783  if (btscan->btps_scanPage == prev_scan_page &&
784  btscan->btps_pageStatus == BTPARALLEL_IDLE)
785  {
786  btscan->btps_scanPage = InvalidBlockNumber;
787  btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
788 
789  /* Serialize scan's current array keys */
790  for (int i = 0; i < so->numArrayKeys; i++)
791  {
792  BTArrayKeyInfo *array = &so->arrayKeys[i];
793 
794  btscan->btps_arrElems[i] = array->cur_elem;
795  }
796  }
797  SpinLockRelease(&btscan->btps_mutex);
798 }
799 
800 /*
801  * Bulk deletion of all index entries pointing to a set of heap tuples.
802  * The set of target tuples is specified via a callback routine that tells
803  * whether any given heap tuple (identified by ItemPointer) is being deleted.
804  *
805  * Result: a palloc'd struct containing statistical info for VACUUM displays.
806  */
809  IndexBulkDeleteCallback callback, void *callback_state)
810 {
811  Relation rel = info->index;
812  BTCycleId cycleid;
813 
814  /* allocate stats if first time through, else re-use existing struct */
815  if (stats == NULL)
816  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
817 
818  /* Establish the vacuum cycle ID to use for this scan */
819  /* The ENSURE stuff ensures we clean up shared memory on failure */
820  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
821  {
822  cycleid = _bt_start_vacuum(rel);
823 
824  btvacuumscan(info, stats, callback, callback_state, cycleid);
825  }
826  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
827  _bt_end_vacuum(rel);
828 
829  return stats;
830 }
831 
832 /*
833  * Post-VACUUM cleanup.
834  *
835  * Result: a palloc'd struct containing statistical info for VACUUM displays.
836  */
837 IndexBulkDeleteResult *
838 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
839 {
840  BlockNumber num_delpages;
841 
842  /* No-op in ANALYZE ONLY mode */
843  if (info->analyze_only)
844  return stats;
845 
846  /*
847  * If btbulkdelete was called, we need not do anything (we just maintain
848  * the information used within _bt_vacuum_needs_cleanup() by calling
849  * _bt_set_cleanup_info() below).
850  *
851  * If btbulkdelete was _not_ called, then we have a choice to make: we
852  * must decide whether or not a btvacuumscan() call is needed now (i.e.
853  * whether the ongoing VACUUM operation can entirely avoid a physical scan
854  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
855  * now.
856  */
857  if (stats == NULL)
858  {
859  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
860  if (!_bt_vacuum_needs_cleanup(info->index))
861  return NULL;
862 
863  /*
864  * Since we aren't going to actually delete any leaf items, there's no
865  * need to go through all the vacuum-cycle-ID pushups here.
866  *
867  * Posting list tuples are a source of inaccuracy for cleanup-only
868  * scans. btvacuumscan() will assume that the number of index tuples
869  * from each page can be used as num_index_tuples, even though
870  * num_index_tuples is supposed to represent the number of TIDs in the
871  * index. This naive approach can underestimate the number of tuples
872  * in the index significantly.
873  *
874  * We handle the problem by making num_index_tuples an estimate in
875  * cleanup-only case.
876  */
877  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
878  btvacuumscan(info, stats, NULL, NULL, 0);
879  stats->estimated_count = true;
880  }
881 
882  /*
883  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
884  *
885  * num_delpages is the number of deleted pages now in the index that were
886  * not safe to place in the FSM to be recycled just yet. num_delpages is
887  * greater than 0 only when _bt_pagedel() actually deleted pages during
888  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
889  * have failed to place any newly deleted pages in the FSM just moments
890  * ago. (Actually, there are edge cases where recycling of the current
891  * VACUUM's newly deleted pages does not even become safe by the time the
892  * next VACUUM comes around. See nbtree/README.)
893  */
894  Assert(stats->pages_deleted >= stats->pages_free);
895  num_delpages = stats->pages_deleted - stats->pages_free;
896  _bt_set_cleanup_info(info->index, num_delpages);
897 
898  /*
899  * It's quite possible for us to be fooled by concurrent page splits into
900  * double-counting some index tuples, so disbelieve any total that exceeds
901  * the underlying heap's count ... if we know that accurately. Otherwise
902  * this might just make matters worse.
903  */
904  if (!info->estimated_count)
905  {
906  if (stats->num_index_tuples > info->num_heap_tuples)
907  stats->num_index_tuples = info->num_heap_tuples;
908  }
909 
910  return stats;
911 }
912 
913 /*
914  * btvacuumscan --- scan the index for VACUUMing purposes
915  *
916  * This combines the functions of looking for leaf tuples that are deletable
917  * according to the vacuum callback, looking for empty pages that can be
918  * deleted, and looking for old deleted pages that can be recycled. Both
919  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
920  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
921  *
922  * The caller is responsible for initially allocating/zeroing a stats struct
923  * and for obtaining a vacuum cycle ID if necessary.
924  */
925 static void
926 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
927  IndexBulkDeleteCallback callback, void *callback_state,
928  BTCycleId cycleid)
929 {
930  Relation rel = info->index;
931  BTVacState vstate;
932  BlockNumber num_pages;
933  BlockNumber scanblkno;
934  bool needLock;
935 
936  /*
937  * Reset fields that track information about the entire index now. This
938  * avoids double-counting in the case where a single VACUUM command
939  * requires multiple scans of the index.
940  *
941  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
942  * since they track information about the VACUUM command, and so must last
943  * across each call to btvacuumscan().
944  *
945  * (Note that pages_free is treated as state about the whole index, not
946  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
947  * calls are idempotent, and get repeated for the same deleted pages in
948  * some scenarios. The point for us is to track the number of recyclable
949  * pages in the index at the end of the VACUUM command.)
950  */
951  stats->num_pages = 0;
952  stats->num_index_tuples = 0;
953  stats->pages_deleted = 0;
954  stats->pages_free = 0;
955 
956  /* Set up info to pass down to btvacuumpage */
957  vstate.info = info;
958  vstate.stats = stats;
959  vstate.callback = callback;
960  vstate.callback_state = callback_state;
961  vstate.cycleid = cycleid;
962 
963  /* Create a temporary memory context to run _bt_pagedel in */
964  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
965  "_bt_pagedel",
966  ALLOCSET_DEFAULT_SIZES);
967 
968  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
969  vstate.bufsize = 0;
970  vstate.maxbufsize = 0;
971  vstate.pendingpages = NULL;
972  vstate.npendingpages = 0;
973  /* Consider applying _bt_pendingfsm_finalize optimization */
974  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
975 
976  /*
977  * The outer loop iterates over all index pages except the metapage, in
978  * physical order (we hope the kernel will cooperate in providing
979  * read-ahead for speed). It is critical that we visit all leaf pages,
980  * including ones added after we start the scan, else we might fail to
981  * delete some deletable tuples. Hence, we must repeatedly check the
982  * relation length. We must acquire the relation-extension lock while
983  * doing so to avoid a race condition: if someone else is extending the
984  * relation, there is a window where bufmgr/smgr have created a new
985  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
986  * we manage to scan such a page here, we'll improperly assume it can be
987  * recycled. Taking the lock synchronizes things enough to prevent a
988  * problem: either num_pages won't include the new page, or _bt_getbuf
989  * already has write lock on the buffer and it will be fully initialized
990  * before we can examine it. Also, we need not worry if a page is added
991  * immediately after we look; the page splitting code already has
992  * write-lock on the left page before it adds a right page, so we must
993  * already have processed any tuples due to be moved into such a page.
994  *
995  * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
996  * think the use of the extension lock is still required.
997  *
998  * We can skip locking for new or temp relations, however, since no one
999  * else could be accessing them.
1000  */
1001  needLock = !RELATION_IS_LOCAL(rel);
1002 
1003  scanblkno = BTREE_METAPAGE + 1;
1004  for (;;)
1005  {
1006  /* Get the current relation length */
1007  if (needLock)
1008  LockRelationForExtension(rel, ExclusiveLock);
1009  num_pages = RelationGetNumberOfBlocks(rel);
1010  if (needLock)
1011  UnlockRelationForExtension(rel, ExclusiveLock);
1012 
1013  if (info->report_progress)
1014  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1015  num_pages);
1016 
1017  /* Quit if we've scanned the whole relation */
1018  if (scanblkno >= num_pages)
1019  break;
1020  /* Iterate over pages, then loop back to recheck length */
1021  for (; scanblkno < num_pages; scanblkno++)
1022  {
1023  btvacuumpage(&vstate, scanblkno);
1024  if (info->report_progress)
1025  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1026  scanblkno);
1027  }
1028  }
1029 
1030  /* Set statistics num_pages field to final size of index */
1031  stats->num_pages = num_pages;
1032 
1033  MemoryContextDelete(vstate.pagedelcontext);
1034 
1035  /*
1036  * If there were any calls to _bt_pagedel() during scan of the index then
1037  * see if any of the resulting pages can be placed in the FSM now. When
1038  * it's not safe we'll have to leave it up to a future VACUUM operation.
1039  *
1040  * Finally, if we placed any pages in the FSM (either just now or during
1041  * the scan), forcibly update the upper-level FSM pages to ensure that
1042  * searchers can find them.
1043  */
1044  _bt_pendingfsm_finalize(rel, &vstate);
1045  if (stats->pages_free > 0)
1046  IndexFreeSpaceMapVacuum(rel);
1047 }
1048 
1049 /*
1050  * btvacuumpage --- VACUUM one page
1051  *
1052  * This processes a single page for btvacuumscan(). In some cases we must
1053  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1054  * a previous call here. This is how we handle page splits (that happened
1055  * after our cycleid was acquired) whose right half page happened to reuse
1056  * a block that we might have processed at some point before it was
1057  * recycled (i.e. before the page split).
1058  */
1059 static void
1060 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1061 {
1062  IndexVacuumInfo *info = vstate->info;
1063  IndexBulkDeleteResult *stats = vstate->stats;
1064  IndexBulkDeleteCallback callback = vstate->callback;
1065  void *callback_state = vstate->callback_state;
1066  Relation rel = info->index;
1067  Relation heaprel = info->heaprel;
1068  bool attempt_pagedel;
1069  BlockNumber blkno,
1070  backtrack_to;
1071  Buffer buf;
1072  Page page;
1073  BTPageOpaque opaque;
1074 
1075  blkno = scanblkno;
1076 
1077 backtrack:
1078 
1079  attempt_pagedel = false;
1080  backtrack_to = P_NONE;
1081 
1082  /* call vacuum_delay_point while not holding any buffer lock */
1083  vacuum_delay_point();
1084 
1085  /*
1086  * We can't use _bt_getbuf() here because it always applies
1087  * _bt_checkpage(), which will barf on an all-zero page. We want to
1088  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1089  * buffer access strategy.
1090  */
1091  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1092  info->strategy);
1093  _bt_lockbuf(rel, buf, BT_READ);
1094  page = BufferGetPage(buf);
1095  opaque = NULL;
1096  if (!PageIsNew(page))
1097  {
1098  _bt_checkpage(rel, buf);
1099  opaque = BTPageGetOpaque(page);
1100  }
1101 
1102  Assert(blkno <= scanblkno);
1103  if (blkno != scanblkno)
1104  {
1105  /*
1106  * We're backtracking.
1107  *
1108  * We followed a right link to a sibling leaf page (a page that
1109  * happens to be from a block located before scanblkno). The only
1110  * case we want to do anything with is a live leaf page having the
1111  * current vacuum cycle ID.
1112  *
1113  * The page had better be in a state that's consistent with what we
1114  * expect. Check for conditions that imply corruption in passing. It
1115  * can't be half-dead because only an interrupted VACUUM process can
1116  * leave pages in that state, so we'd definitely have dealt with it
1117  * back when the page was the scanblkno page (half-dead pages are
1118  * always marked fully deleted by _bt_pagedel(), barring corruption).
1119  */
1120  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1121  {
1122  Assert(false);
1123  ereport(LOG,
1124  (errcode(ERRCODE_INDEX_CORRUPTED),
1125  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1126  blkno, scanblkno, RelationGetRelationName(rel))));
1127  _bt_relbuf(rel, buf);
1128  return;
1129  }
1130 
1131  /*
1132  * We may have already processed the page in an earlier call, when the
1133  * page was scanblkno. This happens when the leaf page split occurred
1134  * after the scan began, but before the right sibling page became the
1135  * scanblkno.
1136  *
1137  * Page may also have been deleted by current btvacuumpage() call,
1138  * since _bt_pagedel() sometimes deletes the right sibling page of
1139  * scanblkno in passing (it does so after we decided where to
1140  * backtrack to). We don't need to process this page as a deleted
1141  * page a second time now (in fact, it would be wrong to count it as a
1142  * deleted page in the bulk delete statistics a second time).
1143  */
1144  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1145  {
1146  /* Done with current scanblkno (and all lower split pages) */
1147  _bt_relbuf(rel, buf);
1148  return;
1149  }
1150  }
1151 
1152  if (!opaque || BTPageIsRecyclable(page, heaprel))
1153  {
1154  /* Okay to recycle this page (which could be leaf or internal) */
1155  RecordFreeIndexPage(rel, blkno);
1156  stats->pages_deleted++;
1157  stats->pages_free++;
1158  }
1159  else if (P_ISDELETED(opaque))
1160  {
1161  /*
1162  * Already deleted page (which could be leaf or internal). Can't
1163  * recycle yet.
1164  */
1165  stats->pages_deleted++;
1166  }
1167  else if (P_ISHALFDEAD(opaque))
1168  {
1169  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1170  attempt_pagedel = true;
1171 
1172  /*
1173  * _bt_pagedel() will increment both pages_newly_deleted and
1174  * pages_deleted stats in all cases (barring corruption)
1175  */
1176  }
1177  else if (P_ISLEAF(opaque))
1178  {
1179  OffsetNumber deletable[MaxIndexTuplesPerPage];
1180  int ndeletable;
1181  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1182  int nupdatable;
1183  OffsetNumber offnum,
1184  minoff,
1185  maxoff;
1186  int nhtidsdead,
1187  nhtidslive;
1188 
1189  /*
1190  * Trade in the initial read lock for a full cleanup lock on this
1191  * page. We must get such a lock on every leaf page over the course
1192  * of the vacuum scan, whether or not it actually contains any
1193  * deletable tuples --- see nbtree/README.
1194  */
1195  _bt_upgradelockbufcleanup(rel, buf);
1196 
1197  /*
1198  * Check whether we need to backtrack to earlier pages. What we are
1199  * concerned about is a page split that happened since we started the
1200  * vacuum scan. If the split moved tuples on the right half of the
1201  * split (i.e. the tuples that sort high) to a block that we already
1202  * passed over, then we might have missed the tuples. We need to
1203  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1204  * or deleting scanblkno page below!)
1205  */
1206  if (vstate->cycleid != 0 &&
1207  opaque->btpo_cycleid == vstate->cycleid &&
1208  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1209  !P_RIGHTMOST(opaque) &&
1210  opaque->btpo_next < scanblkno)
1211  backtrack_to = opaque->btpo_next;
1212 
1213  ndeletable = 0;
1214  nupdatable = 0;
1215  minoff = P_FIRSTDATAKEY(opaque);
1216  maxoff = PageGetMaxOffsetNumber(page);
1217  nhtidsdead = 0;
1218  nhtidslive = 0;
1219  if (callback)
1220  {
1221  /* btbulkdelete callback tells us what to delete (or update) */
1222  for (offnum = minoff;
1223  offnum <= maxoff;
1224  offnum = OffsetNumberNext(offnum))
1225  {
1226  IndexTuple itup;
1227 
1228  itup = (IndexTuple) PageGetItem(page,
1229  PageGetItemId(page, offnum));
1230 
1231  Assert(!BTreeTupleIsPivot(itup));
1232  if (!BTreeTupleIsPosting(itup))
1233  {
1234  /* Regular tuple, standard table TID representation */
1235  if (callback(&itup->t_tid, callback_state))
1236  {
1237  deletable[ndeletable++] = offnum;
1238  nhtidsdead++;
1239  }
1240  else
1241  nhtidslive++;
1242  }
1243  else
1244  {
1245  BTVacuumPosting vacposting;
1246  int nremaining;
1247 
1248  /* Posting list tuple */
1249  vacposting = btreevacuumposting(vstate, itup, offnum,
1250  &nremaining);
1251  if (vacposting == NULL)
1252  {
1253  /*
1254  * All table TIDs from the posting tuple remain, so no
1255  * delete or update required
1256  */
1257  Assert(nremaining == BTreeTupleGetNPosting(itup));
1258  }
1259  else if (nremaining > 0)
1260  {
1261 
1262  /*
1263  * Store metadata about posting list tuple in
1264  * updatable array for entire page. Existing tuple
1265  * will be updated during the later call to
1266  * _bt_delitems_vacuum().
1267  */
1268  Assert(nremaining < BTreeTupleGetNPosting(itup));
1269  updatable[nupdatable++] = vacposting;
1270  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1271  }
1272  else
1273  {
1274  /*
1275  * All table TIDs from the posting list must be
1276  * deleted. We'll delete the index tuple completely
1277  * (no update required).
1278  */
1279  Assert(nremaining == 0);
1280  deletable[ndeletable++] = offnum;
1281  nhtidsdead += BTreeTupleGetNPosting(itup);
1282  pfree(vacposting);
1283  }
1284 
1285  nhtidslive += nremaining;
1286  }
1287  }
1288  }
1289 
1290  /*
1291  * Apply any needed deletes or updates. We issue just one
1292  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1293  */
1294  if (ndeletable > 0 || nupdatable > 0)
1295  {
1296  Assert(nhtidsdead >= ndeletable + nupdatable);
1297  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1298  nupdatable);
1299 
1300  stats->tuples_removed += nhtidsdead;
1301  /* must recompute maxoff */
1302  maxoff = PageGetMaxOffsetNumber(page);
1303 
1304  /* can't leak memory here */
1305  for (int i = 0; i < nupdatable; i++)
1306  pfree(updatable[i]);
1307  }
1308  else
1309  {
1310  /*
1311  * If the leaf page has been split during this vacuum cycle, it
1312  * seems worth expending a write to clear btpo_cycleid even if we
1313  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1314  * takes care of this.) This ensures we won't process the page
1315  * again.
1316  *
1317  * We treat this like a hint-bit update because there's no need to
1318  * WAL-log it.
1319  */
1320  Assert(nhtidsdead == 0);
1321  if (vstate->cycleid != 0 &&
1322  opaque->btpo_cycleid == vstate->cycleid)
1323  {
1324  opaque->btpo_cycleid = 0;
1325  MarkBufferDirtyHint(buf, true);
1326  }
1327  }
1328 
1329  /*
1330  * If the leaf page is now empty, try to delete it; else count the
1331  * live tuples (live table TIDs in posting lists are counted as
1332  * separate live tuples). We don't delete when backtracking, though,
1333  * since that would require teaching _bt_pagedel() about backtracking
1334  * (doesn't seem worth adding more complexity to deal with that).
1335  *
1336  * We don't count the number of live TIDs during cleanup-only calls to
1337  * btvacuumscan (i.e. when callback is not set). We count the number
1338  * of index tuples directly instead. This avoids the expense of
1339  * directly examining all of the tuples on each page. VACUUM will
1340  * treat num_index_tuples as an estimate in cleanup-only case, so it
1341  * doesn't matter that this underestimates num_index_tuples
1342  * significantly in some cases.
1343  */
1344  if (minoff > maxoff)
1345  attempt_pagedel = (blkno == scanblkno);
1346  else if (callback)
1347  stats->num_index_tuples += nhtidslive;
1348  else
1349  stats->num_index_tuples += maxoff - minoff + 1;
1350 
1351  Assert(!attempt_pagedel || nhtidslive == 0);
1352  }
1353 
1354  if (attempt_pagedel)
1355  {
1356  MemoryContext oldcontext;
1357 
1358  /* Run pagedel in a temp context to avoid memory leakage */
1359  MemoryContextReset(vstate->pagedelcontext);
1360  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1361 
1362  /*
1363  * _bt_pagedel maintains the bulk delete stats on our behalf;
1364  * pages_newly_deleted and pages_deleted are likely to be incremented
1365  * during call
1366  */
1367  Assert(blkno == scanblkno);
1368  _bt_pagedel(rel, buf, vstate);
1369 
1370  MemoryContextSwitchTo(oldcontext);
1371  /* pagedel released buffer, so we shouldn't */
1372  }
1373  else
1374  _bt_relbuf(rel, buf);
1375 
1376  if (backtrack_to != P_NONE)
1377  {
1378  blkno = backtrack_to;
1379  goto backtrack;
1380  }
1381 }
1382 
1383 /*
1384  * btreevacuumposting --- determine TIDs still needed in posting list
1385  *
1386  * Returns metadata describing how to build replacement tuple without the TIDs
1387  * that VACUUM needs to delete. Returned value is NULL in the common case
1388  * where no changes are needed to caller's posting list tuple (we avoid
1389  * allocating memory here as an optimization).
1390  *
1391  * The number of TIDs that should remain in the posting list tuple is set for
1392  * caller in *nremaining.
1393  */
1394 static BTVacuumPosting
1395 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1396  OffsetNumber updatedoffset, int *nremaining)
1397 {
1398  int live = 0;
1399  int nitem = BTreeTupleGetNPosting(posting);
1400  ItemPointer items = BTreeTupleGetPosting(posting);
1401  BTVacuumPosting vacposting = NULL;
1402 
1403  for (int i = 0; i < nitem; i++)
1404  {
1405  if (!vstate->callback(items + i, vstate->callback_state))
1406  {
1407  /* Live table TID */
1408  live++;
1409  }
1410  else if (vacposting == NULL)
1411  {
1412  /*
1413  * First dead table TID encountered.
1414  *
1415  * It's now clear that we need to delete one or more dead table
1416  * TIDs, so start maintaining metadata describing how to update
1417  * existing posting list tuple.
1418  */
1419  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1420  nitem * sizeof(uint16));
1421 
1422  vacposting->itup = posting;
1423  vacposting->updatedoffset = updatedoffset;
1424  vacposting->ndeletedtids = 0;
1425  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1426  }
1427  else
1428  {
1429  /* Second or subsequent dead table TID */
1430  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1431  }
1432  }
1433 
1434  *nremaining = live;
1435  return vacposting;
1436 }
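
A small hedged helper, for illustration only (the example_ name is hypothetical, not part of nbtree.c): it spells out the invariant maintained by btreevacuumposting() above, namely that when metadata is returned, the replacement posting list keeps the original TID count minus the TIDs recorded in deletetids.

/*
 * Illustrative helper (hypothetical): TIDs remaining after VACUUM processes
 * one posting list tuple.
 */
static int
example_posting_tids_remaining(IndexTuple posting, BTVacuumPosting vacposting)
{
	if (vacposting == NULL)
		return BTreeTupleGetNPosting(posting);	/* no dead TIDs found */

	return BTreeTupleGetNPosting(posting) - vacposting->ndeletedtids;
}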
1437 
1438 /*
1439  * btcanreturn() -- Check whether btree indexes support index-only scans.
1440  *
1441  * btrees always do, so this is trivial.
1442  */
1443 bool
1444 btcanreturn(Relation index, int attno)
1445 {
1446  return true;
1447 }