PostgreSQL Source Code  git master
nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/relscan.h"
23 #include "access/xloginsert.h"
24 #include "commands/progress.h"
25 #include "commands/vacuum.h"
26 #include "miscadmin.h"
27 #include "nodes/execnodes.h"
28 #include "pgstat.h"
29 #include "storage/bulk_write.h"
30 #include "storage/condition_variable.h"
31 #include "storage/indexfsm.h"
32 #include "storage/ipc.h"
33 #include "storage/lmgr.h"
34 #include "storage/smgr.h"
35 #include "utils/fmgrprotos.h"
36 #include "utils/index_selfuncs.h"
37 #include "utils/memutils.h"
38 
39 
40 /*
41  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42  *
43  * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44  * scan to advance it via another call to _bt_first.
45  *
46  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47  * a new page; others must wait.
48  *
49  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50  * to a new page; some process can start doing that.
51  *
52  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_NEED_PRIMSCAN,
58  BTPARALLEL_ADVANCING,
59  BTPARALLEL_IDLE,
60  BTPARALLEL_DONE,
61 } BTPS_State;
62 
63 /*
64  * BTParallelScanDescData contains btree specific shared information required
65  * for parallel scan.
66  */
67 typedef struct BTParallelScanDescData
68 {
69  BlockNumber btps_scanPage; /* latest or next page to be scanned */
70  BTPS_State btps_pageStatus; /* indicates whether next page is
71  * available for scan. see above for
72  * possible states of parallel scan. */
73  slock_t btps_mutex; /* protects above variables, btps_arrElems */
74  ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 
76  /*
77  * btps_arrElems is used when scans need to schedule another primitive
78  * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79  */
80  int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81 } BTParallelScanDescData;
82 
83 typedef struct BTParallelScanDescData *BTParallelScanDesc;
84 
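/*
 * [Editor's note, not part of nbtree.c] A rough sketch of how the states
 * above are traversed, pieced together from the routines later in this file
 * (_bt_parallel_seize, _bt_parallel_release, _bt_parallel_done and
 * _bt_parallel_primscan_schedule); no new names are introduced here:
 *
 *   BTPARALLEL_NOT_INITIALIZED --seize--> BTPARALLEL_ADVANCING
 *   BTPARALLEL_ADVANCING --release--> BTPARALLEL_IDLE
 *   BTPARALLEL_ADVANCING --done--> BTPARALLEL_DONE (terminal)
 *   BTPARALLEL_IDLE --seize--> BTPARALLEL_ADVANCING
 *   BTPARALLEL_IDLE --primscan_schedule--> BTPARALLEL_NEED_PRIMSCAN
 *   BTPARALLEL_NEED_PRIMSCAN --seize (only from _bt_first)--> BTPARALLEL_ADVANCING
 */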
85 
86 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87  IndexBulkDeleteCallback callback, void *callback_state,
88  BTCycleId cycleid);
89 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91  IndexTuple posting,
92  OffsetNumber updatedoffset,
93  int *nremaining);
94 
95 
96 /*
97  * Btree handler function: return IndexAmRoutine with access method parameters
98  * and callbacks.
99  */
100 Datum
101 bthandler(PG_FUNCTION_ARGS)
102 {
103  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104 
105  amroutine->amstrategies = BTMaxStrategyNumber;
106  amroutine->amsupport = BTNProcs;
107  amroutine->amoptsprocnum = BTOPTIONS_PROC;
108  amroutine->amcanorder = true;
109  amroutine->amcanorderbyop = false;
110  amroutine->amcanbackward = true;
111  amroutine->amcanunique = true;
112  amroutine->amcanmulticol = true;
113  amroutine->amoptionalkey = true;
114  amroutine->amsearcharray = true;
115  amroutine->amsearchnulls = true;
116  amroutine->amstorage = false;
117  amroutine->amclusterable = true;
118  amroutine->ampredlocks = true;
119  amroutine->amcanparallel = true;
120  amroutine->amcanbuildparallel = true;
121  amroutine->amcaninclude = true;
122  amroutine->amusemaintenanceworkmem = false;
123  amroutine->amsummarizing = false;
124  amroutine->amparallelvacuumoptions =
125  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
126  amroutine->amkeytype = InvalidOid;
127 
128  amroutine->ambuild = btbuild;
129  amroutine->ambuildempty = btbuildempty;
130  amroutine->aminsert = btinsert;
131  amroutine->aminsertcleanup = NULL;
132  amroutine->ambulkdelete = btbulkdelete;
133  amroutine->amvacuumcleanup = btvacuumcleanup;
134  amroutine->amcanreturn = btcanreturn;
135  amroutine->amcostestimate = btcostestimate;
136  amroutine->amgettreeheight = btgettreeheight;
137  amroutine->amoptions = btoptions;
138  amroutine->amproperty = btproperty;
139  amroutine->ambuildphasename = btbuildphasename;
140  amroutine->amvalidate = btvalidate;
141  amroutine->amadjustmembers = btadjustmembers;
142  amroutine->ambeginscan = btbeginscan;
143  amroutine->amrescan = btrescan;
144  amroutine->amgettuple = btgettuple;
145  amroutine->amgetbitmap = btgetbitmap;
146  amroutine->amendscan = btendscan;
147  amroutine->ammarkpos = btmarkpos;
148  amroutine->amrestrpos = btrestrpos;
149  amroutine->amestimateparallelscan = btestimateparallelscan;
150  amroutine->aminitparallelscan = btinitparallelscan;
151  amroutine->amparallelrescan = btparallelrescan;
152 
153  PG_RETURN_POINTER(amroutine);
154 }
155 
156 /*
157  * btbuildempty() -- build an empty btree index in the initialization fork
158  */
159 void
160 btbuildempty(Relation index)
161 {
162  bool allequalimage = _bt_allequalimage(index, false);
163  BulkWriteState *bulkstate;
164  BulkWriteBuffer metabuf;
165 
166  bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
167 
168  /* Construct metapage. */
169  metabuf = smgr_bulk_get_buf(bulkstate);
170  _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
171  smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
172 
173  smgr_bulk_finish(bulkstate);
174 }
175 
176 /*
177  * btinsert() -- insert an index tuple into a btree.
178  *
179  * Descend the tree recursively, find the appropriate location for our
180  * new tuple, and put it there.
181  */
182 bool
183 btinsert(Relation rel, Datum *values, bool *isnull,
184  ItemPointer ht_ctid, Relation heapRel,
185  IndexUniqueCheck checkUnique,
186  bool indexUnchanged,
187  IndexInfo *indexInfo)
188 {
189  bool result;
190  IndexTuple itup;
191 
192  /* generate an index tuple */
193  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
194  itup->t_tid = *ht_ctid;
195 
196  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
197 
198  pfree(itup);
199 
200  return result;
201 }
202 
203 /*
204  * btgettuple() -- Get the next tuple in the scan.
205  */
206 bool
207 btgettuple(IndexScanDesc scan, ScanDirection dir)
208 {
209  BTScanOpaque so = (BTScanOpaque) scan->opaque;
210  bool res;
211 
212  /* btree indexes are never lossy */
213  scan->xs_recheck = false;
214 
215  /* Each loop iteration performs another primitive index scan */
216  do
217  {
218  /*
219  * If we've already initialized this scan, we can just advance it in
220  * the appropriate direction. If we haven't done so yet, we call
221  * _bt_first() to get the first item in the scan.
222  */
223  if (!BTScanPosIsValid(so->currPos))
224  res = _bt_first(scan, dir);
225  else
226  {
227  /*
228  * Check to see if we should kill the previously-fetched tuple.
229  */
230  if (scan->kill_prior_tuple)
231  {
232  /*
233  * Yes, remember it for later. (We'll deal with all such
234  * tuples at once right before leaving the index page.) The
235  * test for numKilled overrun is not just paranoia: if the
236  * caller reverses direction in the indexscan then the same
237  * item might get entered multiple times. It's not worth
238  * trying to optimize that, so we don't detect it, but instead
239  * just forget any excess entries.
240  */
241  if (so->killedItems == NULL)
242  so->killedItems = (int *)
243  palloc(MaxTIDsPerBTreePage * sizeof(int));
244  if (so->numKilled < MaxTIDsPerBTreePage)
245  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
246  }
247 
248  /*
249  * Now continue the scan.
250  */
251  res = _bt_next(scan, dir);
252  }
253 
254  /* If we have a tuple, return it ... */
255  if (res)
256  break;
257  /* ... otherwise see if we need another primitive index scan */
258  } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
259 
260  return res;
261 }
262 
263 /*
264  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
265  */
266 int64
267 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
268 {
269  BTScanOpaque so = (BTScanOpaque) scan->opaque;
270  int64 ntids = 0;
271  ItemPointer heapTid;
272 
273  /* Each loop iteration performs another primitive index scan */
274  do
275  {
276  /* Fetch the first page & tuple */
277  if (_bt_first(scan, ForwardScanDirection))
278  {
279  /* Save tuple ID, and continue scanning */
280  heapTid = &scan->xs_heaptid;
281  tbm_add_tuples(tbm, heapTid, 1, false);
282  ntids++;
283 
284  for (;;)
285  {
286  /*
287  * Advance to next tuple within page. This is the same as the
288  * easy case in _bt_next().
289  */
290  if (++so->currPos.itemIndex > so->currPos.lastItem)
291  {
292  /* let _bt_next do the heavy lifting */
293  if (!_bt_next(scan, ForwardScanDirection))
294  break;
295  }
296 
297  /* Save tuple ID, and continue scanning */
298  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
299  tbm_add_tuples(tbm, heapTid, 1, false);
300  ntids++;
301  }
302  }
303  /* Now see if we need another primitive index scan */
304  } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
305 
306  return ntids;
307 }
308 
309 /*
310  * btbeginscan() -- start a scan on a btree index
311  */
312 IndexScanDesc
313 btbeginscan(Relation rel, int nkeys, int norderbys)
314 {
315  IndexScanDesc scan;
316  BTScanOpaque so;
317 
318  /* no order by operators allowed */
319  Assert(norderbys == 0);
320 
321  /* get the scan */
322  scan = RelationGetIndexScan(rel, nkeys, norderbys);
323 
324  /* allocate private workspace */
325  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
326  BTScanPosInvalidate(so->currPos);
327  BTScanPosInvalidate(so->markPos);
328  if (scan->numberOfKeys > 0)
329  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
330  else
331  so->keyData = NULL;
332 
333  so->needPrimScan = false;
334  so->scanBehind = false;
335  so->arrayKeys = NULL;
336  so->orderProcs = NULL;
337  so->arrayContext = NULL;
338 
339  so->killedItems = NULL; /* until needed */
340  so->numKilled = 0;
341 
342  /*
343  * We don't know yet whether the scan will be index-only, so we do not
344  * allocate the tuple workspace arrays until btrescan. However, we set up
345  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
346  */
347  so->currTuples = so->markTuples = NULL;
348 
349  scan->xs_itupdesc = RelationGetDescr(rel);
350 
351  scan->opaque = so;
352 
353  return scan;
354 }
355 
356 /*
357  * btrescan() -- rescan an index relation
358  */
359 void
360 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
361  ScanKey orderbys, int norderbys)
362 {
363  BTScanOpaque so = (BTScanOpaque) scan->opaque;
364 
365  /* we aren't holding any read locks, but gotta drop the pins */
366  if (BTScanPosIsValid(so->currPos))
367  {
368  /* Before leaving current page, deal with any killed items */
369  if (so->numKilled > 0)
370  _bt_killitems(scan);
371  BTScanPosUnpinIfPinned(so->currPos);
372  BTScanPosInvalidate(so->currPos);
373  }
374 
375  so->markItemIndex = -1;
376  so->needPrimScan = false;
377  so->scanBehind = false;
378  BTScanPosUnpinIfPinned(so->markPos);
379  BTScanPosInvalidate(so->markPos);
380 
381  /*
382  * Allocate tuple workspace arrays, if needed for an index-only scan and
383  * not already done in a previous rescan call. To save on palloc
384  * overhead, both workspaces are allocated as one palloc block; only this
385  * function and btendscan know that.
386  *
387  * NOTE: this data structure also makes it safe to return data from a
388  * "name" column, even though btree name_ops uses an underlying storage
389  * datatype of cstring. The risk there is that "name" is supposed to be
390  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
391  * However, since we only return data out of tuples sitting in the
392  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
393  * data out of the markTuples array --- running off the end of memory for
394  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
395  * adding special-case treatment for name_ops elsewhere.
396  */
397  if (scan->xs_want_itup && so->currTuples == NULL)
398  {
399  so->currTuples = (char *) palloc(BLCKSZ * 2);
400  so->markTuples = so->currTuples + BLCKSZ;
401  }
402 
403  /*
404  * Reset the scan keys
405  */
406  if (scankey && scan->numberOfKeys > 0)
407  memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
408  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
409  so->numArrayKeys = 0; /* ditto */
410 }
411 
412 /*
413  * btendscan() -- close down a scan
414  */
415 void
416 btendscan(IndexScanDesc scan)
417 {
418  BTScanOpaque so = (BTScanOpaque) scan->opaque;
419 
420  /* we aren't holding any read locks, but gotta drop the pins */
421  if (BTScanPosIsValid(so->currPos))
422  {
423  /* Before leaving current page, deal with any killed items */
424  if (so->numKilled > 0)
425  _bt_killitems(scan);
426  BTScanPosUnpinIfPinned(so->currPos);
427  }
428 
429  so->markItemIndex = -1;
430  BTScanPosUnpinIfPinned(so->markPos);
431 
432  /* No need to invalidate positions, the RAM is about to be freed. */
433 
434  /* Release storage */
435  if (so->keyData != NULL)
436  pfree(so->keyData);
437  /* so->arrayKeys and so->orderProcs are in arrayContext */
438  if (so->arrayContext != NULL)
439  MemoryContextDelete(so->arrayContext);
440  if (so->killedItems != NULL)
441  pfree(so->killedItems);
442  if (so->currTuples != NULL)
443  pfree(so->currTuples);
444  /* so->markTuples should not be pfree'd, see btrescan */
445  pfree(so);
446 }
447 
448 /*
449  * btmarkpos() -- save current scan position
450  */
451 void
452 btmarkpos(IndexScanDesc scan)
453 {
454  BTScanOpaque so = (BTScanOpaque) scan->opaque;
455 
456  /* There may be an old mark with a pin (but no lock). */
457  BTScanPosUnpinIfPinned(so->markPos);
458 
459  /*
460  * Just record the current itemIndex. If we later step to next page
461  * before releasing the marked position, _bt_steppage makes a full copy of
462  * the currPos struct in markPos. If (as often happens) the mark is moved
463  * before we leave the page, we don't have to do that work.
464  */
465  if (BTScanPosIsValid(so->currPos))
466  so->markItemIndex = so->currPos.itemIndex;
467  else
468  {
469  BTScanPosInvalidate(so->markPos);
470  so->markItemIndex = -1;
471  }
472 }
473 
474 /*
475  * btrestrpos() -- restore scan to last saved position
476  */
477 void
478 btrestrpos(IndexScanDesc scan)
479 {
480  BTScanOpaque so = (BTScanOpaque) scan->opaque;
481 
482  if (so->markItemIndex >= 0)
483  {
484  /*
485  * The scan has never moved to a new page since the last mark. Just
486  * restore the itemIndex.
487  *
488  * NB: In this case we can't count on anything in so->markPos to be
489  * accurate.
490  */
491  so->currPos.itemIndex = so->markItemIndex;
492  }
493  else
494  {
495  /*
496  * The scan moved to a new page after last mark or restore, and we are
497  * now restoring to the marked page. We aren't holding any read
498  * locks, but if we're still holding the pin for the current position,
499  * we must drop it.
500  */
501  if (BTScanPosIsValid(so->currPos))
502  {
503  /* Before leaving current page, deal with any killed items */
504  if (so->numKilled > 0)
505  _bt_killitems(scan);
506  BTScanPosUnpinIfPinned(so->currPos);
507  }
508 
509  if (BTScanPosIsValid(so->markPos))
510  {
511  /* bump pin on mark buffer for assignment to current buffer */
512  if (BTScanPosIsPinned(so->markPos))
513  IncrBufferRefCount(so->markPos.buf);
514  memcpy(&so->currPos, &so->markPos,
515  offsetof(BTScanPosData, items[1]) +
516  so->markPos.lastItem * sizeof(BTScanPosItem));
517  if (so->currTuples)
518  memcpy(so->currTuples, so->markTuples,
519  so->markPos.nextTupleOffset);
520  /* Reset the scan's array keys (see _bt_steppage for why) */
521  if (so->numArrayKeys)
522  {
523  _bt_start_array_keys(scan, so->currPos.dir);
524  so->needPrimScan = false;
525  }
526  }
527  else
528  BTScanPosInvalidate(so->currPos);
529  }
530 }
531 
532 /*
533  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
534  */
535 Size
536 btestimateparallelscan(int nkeys, int norderbys)
537 {
538  /* Pessimistically assume all input scankeys will be output with arrays */
539  return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
540 }
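/*
 * [Editor's illustration, not part of nbtree.c] btestimateparallelscan()
 * above sizes a struct that ends in a flexible array member as
 * offsetof(struct, array) plus room for the trailing elements.  The
 * stand-alone struct and main() below are hypothetical and exist only to
 * make that arithmetic concrete; they mirror the shape of
 * offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct ExampleSharedState
{
	int		nelems;			/* fixed-size header field */
	int		elems[];		/* flexible array member, like btps_arrElems */
} ExampleSharedState;

int
main(void)
{
	int		nkeys = 4;
	size_t	sz;

	/* header bytes up to the array, plus one int per scan key */
	sz = offsetof(ExampleSharedState, elems) + sizeof(int) * nkeys;
	printf("%zu bytes\n", sz);
	return 0;
}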
541 
542 /*
543  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
544  */
545 void
546 btinitparallelscan(void *target)
547 {
548  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
549 
550  SpinLockInit(&bt_target->btps_mutex);
551  bt_target->btps_scanPage = InvalidBlockNumber;
552  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
553  ConditionVariableInit(&bt_target->btps_cv);
554 }
555 
556 /*
557  * btparallelrescan() -- reset parallel scan
558  */
559 void
560 btparallelrescan(IndexScanDesc scan)
561 {
562  BTParallelScanDesc btscan;
563  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
564 
565  Assert(parallel_scan);
566 
567  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
568  parallel_scan->ps_offset);
569 
570  /*
571  * In theory, we don't need to acquire the spinlock here, because there
572  * shouldn't be any other workers running at this point, but we do so for
573  * consistency.
574  */
575  SpinLockAcquire(&btscan->btps_mutex);
576  btscan->btps_scanPage = InvalidBlockNumber;
577  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
578  SpinLockRelease(&btscan->btps_mutex);
579 }
580 
581 /*
582  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
583  * page. Other scans must wait until we call _bt_parallel_release()
584  * or _bt_parallel_done().
585  *
586  * The return value is true if we successfully seized the scan and false
587  * if we did not. The latter case occurs when no pages remain, or when
588  * another primitive index scan is scheduled that caller's backend cannot
589  * start just yet (only backends that call from _bt_first are capable of
590  * starting primitive index scans, which they indicate by passing first=true).
591  *
592  * If the return value is true, *pageno returns the next or current page
593  * of the scan (depending on the scan direction). An invalid block number
594  * means the scan hasn't yet started, or that caller needs to start the next
595  * primitive index scan (if it's the latter case we'll set so.needPrimScan).
596  * The first time a participating process reaches the last page, it will return
597  * true and set *pageno to P_NONE; after that, further attempts to seize the
598  * scan will return false.
599  *
600  * Callers should ignore the value of pageno if the return value is false.
601  */
602 bool
603 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
604 {
605  BTScanOpaque so = (BTScanOpaque) scan->opaque;
606  bool exit_loop = false;
607  bool status = true;
608  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
609  BTParallelScanDesc btscan;
610 
611  *pageno = P_NONE;
612 
613  if (first)
614  {
615  /*
616  * Initialize array related state when called from _bt_first, assuming
617  * that this will be the first primitive index scan for the scan
618  */
619  so->needPrimScan = false;
620  so->scanBehind = false;
621  }
622  else
623  {
624  /*
625  * Don't attempt to seize the scan when it requires another primitive
626  * index scan, since caller's backend cannot start it right now
627  */
628  if (so->needPrimScan)
629  return false;
630  }
631 
632  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
633  parallel_scan->ps_offset);
634 
635  while (1)
636  {
637  SpinLockAcquire(&btscan->btps_mutex);
638 
639  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
640  {
641  /* We're done with this parallel index scan */
642  status = false;
643  }
644  else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
645  {
646  Assert(so->numArrayKeys);
647 
648  if (first)
649  {
650  /* Can start scheduled primitive scan right away, so do so */
651  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
652  for (int i = 0; i < so->numArrayKeys; i++)
653  {
654  BTArrayKeyInfo *array = &so->arrayKeys[i];
655  ScanKey skey = &so->keyData[array->scan_key];
656 
657  array->cur_elem = btscan->btps_arrElems[i];
658  skey->sk_argument = array->elem_values[array->cur_elem];
659  }
660  *pageno = InvalidBlockNumber;
661  exit_loop = true;
662  }
663  else
664  {
665  /*
666  * Don't attempt to seize the scan when it requires another
667  * primitive index scan, since caller's backend cannot start
668  * it right now
669  */
670  status = false;
671  }
672 
673  /*
674  * Either way, update backend local state to indicate that a
675  * pending primitive scan is required
676  */
677  so->needPrimScan = true;
678  so->scanBehind = false;
679  }
680  else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
681  {
682  /*
683  * We have successfully seized control of the scan for the purpose
684  * of advancing it to a new page!
685  */
686  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
687  *pageno = btscan->btps_scanPage;
688  exit_loop = true;
689  }
690  SpinLockRelease(&btscan->btps_mutex);
691  if (exit_loop || !status)
692  break;
693  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
694  }
695  ConditionVariableCancelSleep();
696 
697  return status;
698 }
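/*
 * [Editor's note, not part of nbtree.c] The calling convention described in
 * the header comment of _bt_parallel_seize(), reduced to a sketch.  The
 * "read and process" step is a placeholder for work done elsewhere (in the
 * scan code); only the _bt_parallel_* calls below appear in this file.
 *
 *     if (_bt_parallel_seize(scan, &pageno, first))
 *     {
 *         ... read and process pageno ...
 *         if (another page should be scanned)
 *             _bt_parallel_release(scan, next_pageno);
 *         else
 *             _bt_parallel_done(scan);
 *     }
 */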
699 
700 /*
701  * _bt_parallel_release() -- Complete the process of advancing the scan to a
702  * new page. We now have the new value btps_scanPage; some other backend
703  * can now begin advancing the scan.
704  *
705  * Callers whose scan uses array keys must save their scan_page argument so
706  * that it can be passed to _bt_parallel_primscan_schedule, should caller
707  * determine that another primitive index scan is required. If that happens,
708  * scan_page won't be scanned by any backend (unless the next primitive index
709  * scan lands on scan_page).
710  */
711 void
712 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
713 {
714  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
715  BTParallelScanDesc btscan;
716 
717  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
718  parallel_scan->ps_offset);
719 
720  SpinLockAcquire(&btscan->btps_mutex);
721  btscan->btps_scanPage = scan_page;
722  btscan->btps_pageStatus = BTPARALLEL_IDLE;
723  SpinLockRelease(&btscan->btps_mutex);
724  ConditionVariableSignal(&btscan->btps_cv);
725 }
726 
727 /*
728  * _bt_parallel_done() -- Mark the parallel scan as complete.
729  *
730  * When there are no pages left to scan, this function should be called to
731  * notify other workers. Otherwise, they might wait forever for the scan to
732  * advance to the next page.
733  */
734 void
735 _bt_parallel_done(IndexScanDesc scan)
736 {
737  BTScanOpaque so = (BTScanOpaque) scan->opaque;
738  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
739  BTParallelScanDesc btscan;
740  bool status_changed = false;
741 
742  /* Do nothing, for non-parallel scans */
743  if (parallel_scan == NULL)
744  return;
745 
746  /*
747  * Should not mark parallel scan done when there's still a pending
748  * primitive index scan
749  */
750  if (so->needPrimScan)
751  return;
752 
753  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
754  parallel_scan->ps_offset);
755 
756  /*
757  * Mark the parallel scan as done, unless some other process did so
758  * already
759  */
760  SpinLockAcquire(&btscan->btps_mutex);
761  Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
762  if (btscan->btps_pageStatus != BTPARALLEL_DONE)
763  {
764  btscan->btps_pageStatus = BTPARALLEL_DONE;
765  status_changed = true;
766  }
767  SpinLockRelease(&btscan->btps_mutex);
768 
769  /* wake up all the workers associated with this parallel scan */
770  if (status_changed)
771  ConditionVariableBroadcast(&btscan->btps_cv);
772 }
773 
774 /*
775  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
776  *
777  * Caller passes the block number most recently passed to _bt_parallel_release
778  * by its backend. Caller successfully schedules the next primitive index scan
779  * if the shared parallel state hasn't been seized since caller's backend last
780  * advanced the scan.
781  */
782 void
783 _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
784 {
785  BTScanOpaque so = (BTScanOpaque) scan->opaque;
786  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
787  BTParallelScanDesc btscan;
788 
789  Assert(so->numArrayKeys);
790 
791  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
792  parallel_scan->ps_offset);
793 
794  SpinLockAcquire(&btscan->btps_mutex);
795  if (btscan->btps_scanPage == prev_scan_page &&
796  btscan->btps_pageStatus == BTPARALLEL_IDLE)
797  {
798  btscan->btps_scanPage = InvalidBlockNumber;
799  btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
800 
801  /* Serialize scan's current array keys */
802  for (int i = 0; i < so->numArrayKeys; i++)
803  {
804  BTArrayKeyInfo *array = &so->arrayKeys[i];
805 
806  btscan->btps_arrElems[i] = array->cur_elem;
807  }
808  }
809  SpinLockRelease(&btscan->btps_mutex);
810 }
811 
812 /*
813  * Bulk deletion of all index entries pointing to a set of heap tuples.
814  * The set of target tuples is specified via a callback routine that tells
815  * whether any given heap tuple (identified by ItemPointer) is being deleted.
816  *
817  * Result: a palloc'd struct containing statistical info for VACUUM displays.
818  */
819 IndexBulkDeleteResult *
820 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
821  IndexBulkDeleteCallback callback, void *callback_state)
822 {
823  Relation rel = info->index;
824  BTCycleId cycleid;
825 
826  /* allocate stats if first time through, else re-use existing struct */
827  if (stats == NULL)
828  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
829 
830  /* Establish the vacuum cycle ID to use for this scan */
831  /* The ENSURE stuff ensures we clean up shared memory on failure */
832  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
833  {
834  cycleid = _bt_start_vacuum(rel);
835 
836  btvacuumscan(info, stats, callback, callback_state, cycleid);
837  }
838  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
839  _bt_end_vacuum(rel);
840 
841  return stats;
842 }
843 
844 /*
845  * Post-VACUUM cleanup.
846  *
847  * Result: a palloc'd struct containing statistical info for VACUUM displays.
848  */
849 IndexBulkDeleteResult *
850 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
851 {
852  BlockNumber num_delpages;
853 
854  /* No-op in ANALYZE ONLY mode */
855  if (info->analyze_only)
856  return stats;
857 
858  /*
859  * If btbulkdelete was called, we need not do anything (we just maintain
860  * the information used within _bt_vacuum_needs_cleanup() by calling
861  * _bt_set_cleanup_info() below).
862  *
863  * If btbulkdelete was _not_ called, then we have a choice to make: we
864  * must decide whether or not a btvacuumscan() call is needed now (i.e.
865  * whether the ongoing VACUUM operation can entirely avoid a physical scan
866  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
867  * now.
868  */
869  if (stats == NULL)
870  {
871  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
872  if (!_bt_vacuum_needs_cleanup(info->index))
873  return NULL;
874 
875  /*
876  * Since we aren't going to actually delete any leaf items, there's no
877  * need to go through all the vacuum-cycle-ID pushups here.
878  *
879  * Posting list tuples are a source of inaccuracy for cleanup-only
880  * scans. btvacuumscan() will assume that the number of index tuples
881  * from each page can be used as num_index_tuples, even though
882  * num_index_tuples is supposed to represent the number of TIDs in the
883  * index. This naive approach can underestimate the number of tuples
884  * in the index significantly.
885  *
886  * We handle the problem by making num_index_tuples an estimate in
887  * cleanup-only case.
888  */
889  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
890  btvacuumscan(info, stats, NULL, NULL, 0);
891  stats->estimated_count = true;
892  }
893 
894  /*
895  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
896  *
897  * num_delpages is the number of deleted pages now in the index that were
898  * not safe to place in the FSM to be recycled just yet. num_delpages is
899  * greater than 0 only when _bt_pagedel() actually deleted pages during
900  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
901  * have failed to place any newly deleted pages in the FSM just moments
902  * ago. (Actually, there are edge cases where recycling of the current
903  * VACUUM's newly deleted pages does not even become safe by the time the
904  * next VACUUM comes around. See nbtree/README.)
905  */
906  Assert(stats->pages_deleted >= stats->pages_free);
907  num_delpages = stats->pages_deleted - stats->pages_free;
908  _bt_set_cleanup_info(info->index, num_delpages);
909 
910  /*
911  * It's quite possible for us to be fooled by concurrent page splits into
912  * double-counting some index tuples, so disbelieve any total that exceeds
913  * the underlying heap's count ... if we know that accurately. Otherwise
914  * this might just make matters worse.
915  */
916  if (!info->estimated_count)
917  {
918  if (stats->num_index_tuples > info->num_heap_tuples)
919  stats->num_index_tuples = info->num_heap_tuples;
920  }
921 
922  return stats;
923 }
924 
925 /*
926  * btvacuumscan --- scan the index for VACUUMing purposes
927  *
928  * This combines the functions of looking for leaf tuples that are deletable
929  * according to the vacuum callback, looking for empty pages that can be
930  * deleted, and looking for old deleted pages that can be recycled. Both
931  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
932  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
933  *
934  * The caller is responsible for initially allocating/zeroing a stats struct
935  * and for obtaining a vacuum cycle ID if necessary.
936  */
937 static void
938 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
939  IndexBulkDeleteCallback callback, void *callback_state,
940  BTCycleId cycleid)
941 {
942  Relation rel = info->index;
943  BTVacState vstate;
944  BlockNumber num_pages;
945  BlockNumber scanblkno;
946  bool needLock;
947 
948  /*
949  * Reset fields that track information about the entire index now. This
950  * avoids double-counting in the case where a single VACUUM command
951  * requires multiple scans of the index.
952  *
953  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
954  * since they track information about the VACUUM command, and so must last
955  * across each call to btvacuumscan().
956  *
957  * (Note that pages_free is treated as state about the whole index, not
958  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
959  * calls are idempotent, and get repeated for the same deleted pages in
960  * some scenarios. The point for us is to track the number of recyclable
961  * pages in the index at the end of the VACUUM command.)
962  */
963  stats->num_pages = 0;
964  stats->num_index_tuples = 0;
965  stats->pages_deleted = 0;
966  stats->pages_free = 0;
967 
968  /* Set up info to pass down to btvacuumpage */
969  vstate.info = info;
970  vstate.stats = stats;
971  vstate.callback = callback;
972  vstate.callback_state = callback_state;
973  vstate.cycleid = cycleid;
974 
975  /* Create a temporary memory context to run _bt_pagedel in */
976  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
977  "_bt_pagedel",
978  ALLOCSET_DEFAULT_SIZES);
979 
980  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
981  vstate.bufsize = 0;
982  vstate.maxbufsize = 0;
983  vstate.pendingpages = NULL;
984  vstate.npendingpages = 0;
985  /* Consider applying _bt_pendingfsm_finalize optimization */
986  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
987 
988  /*
989  * The outer loop iterates over all index pages except the metapage, in
990  * physical order (we hope the kernel will cooperate in providing
991  * read-ahead for speed). It is critical that we visit all leaf pages,
992  * including ones added after we start the scan, else we might fail to
993  * delete some deletable tuples. Hence, we must repeatedly check the
994  * relation length. We must acquire the relation-extension lock while
995  * doing so to avoid a race condition: if someone else is extending the
996  * relation, there is a window where bufmgr/smgr have created a new
997  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
998  * we manage to scan such a page here, we'll improperly assume it can be
999  * recycled. Taking the lock synchronizes things enough to prevent a
1000  * problem: either num_pages won't include the new page, or _bt_getbuf
1001  * already has write lock on the buffer and it will be fully initialized
1002  * before we can examine it. Also, we need not worry if a page is added
1003  * immediately after we look; the page splitting code already has
1004  * write-lock on the left page before it adds a right page, so we must
1005  * already have processed any tuples due to be moved into such a page.
1006  *
1007  * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1008  * think the use of the extension lock is still required.
1009  *
1010  * We can skip locking for new or temp relations, however, since no one
1011  * else could be accessing them.
1012  */
1013  needLock = !RELATION_IS_LOCAL(rel);
1014 
1015  scanblkno = BTREE_METAPAGE + 1;
1016  for (;;)
1017  {
1018  /* Get the current relation length */
1019  if (needLock)
1020  LockRelationForExtension(rel, ExclusiveLock);
1021  num_pages = RelationGetNumberOfBlocks(rel);
1022  if (needLock)
1023  UnlockRelationForExtension(rel, ExclusiveLock);
1024 
1025  if (info->report_progress)
1026  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1027  num_pages);
1028 
1029  /* Quit if we've scanned the whole relation */
1030  if (scanblkno >= num_pages)
1031  break;
1032  /* Iterate over pages, then loop back to recheck length */
1033  for (; scanblkno < num_pages; scanblkno++)
1034  {
1035  btvacuumpage(&vstate, scanblkno);
1036  if (info->report_progress)
1037  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1038  scanblkno);
1039  }
1040  }
1041 
1042  /* Set statistics num_pages field to final size of index */
1043  stats->num_pages = num_pages;
1044 
1045  MemoryContextDelete(vstate.pagedelcontext);
1046 
1047  /*
1048  * If there were any calls to _bt_pagedel() during scan of the index then
1049  * see if any of the resulting pages can be placed in the FSM now. When
1050  * it's not safe we'll have to leave it up to a future VACUUM operation.
1051  *
1052  * Finally, if we placed any pages in the FSM (either just now or during
1053  * the scan), forcibly update the upper-level FSM pages to ensure that
1054  * searchers can find them.
1055  */
1056  _bt_pendingfsm_finalize(rel, &vstate);
1057  if (stats->pages_free > 0)
1058  IndexFreeSpaceMapVacuum(rel);
1059 }
1060 
1061 /*
1062  * btvacuumpage --- VACUUM one page
1063  *
1064  * This processes a single page for btvacuumscan(). In some cases we must
1065  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1066  * a previous call here. This is how we handle page splits (that happened
1067  * after our cycleid was acquired) whose right half page happened to reuse
1068  * a block that we might have processed at some point before it was
1069  * recycled (i.e. before the page split).
1070  */
1071 static void
1072 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1073 {
1074  IndexVacuumInfo *info = vstate->info;
1075  IndexBulkDeleteResult *stats = vstate->stats;
1076  IndexBulkDeleteCallback callback = vstate->callback;
1077  void *callback_state = vstate->callback_state;
1078  Relation rel = info->index;
1079  Relation heaprel = info->heaprel;
1080  bool attempt_pagedel;
1081  BlockNumber blkno,
1082  backtrack_to;
1083  Buffer buf;
1084  Page page;
1085  BTPageOpaque opaque;
1086 
1087  blkno = scanblkno;
1088 
1089 backtrack:
1090 
1091  attempt_pagedel = false;
1092  backtrack_to = P_NONE;
1093 
1094  /* call vacuum_delay_point while not holding any buffer lock */
1095  vacuum_delay_point();
1096 
1097  /*
1098  * We can't use _bt_getbuf() here because it always applies
1099  * _bt_checkpage(), which will barf on an all-zero page. We want to
1100  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1101  * buffer access strategy.
1102  */
1103  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1104  info->strategy);
1105  _bt_lockbuf(rel, buf, BT_READ);
1106  page = BufferGetPage(buf);
1107  opaque = NULL;
1108  if (!PageIsNew(page))
1109  {
1110  _bt_checkpage(rel, buf);
1111  opaque = BTPageGetOpaque(page);
1112  }
1113 
1114  Assert(blkno <= scanblkno);
1115  if (blkno != scanblkno)
1116  {
1117  /*
1118  * We're backtracking.
1119  *
1120  * We followed a right link to a sibling leaf page (a page that
1121  * happens to be from a block located before scanblkno). The only
1122  * case we want to do anything with is a live leaf page having the
1123  * current vacuum cycle ID.
1124  *
1125  * The page had better be in a state that's consistent with what we
1126  * expect. Check for conditions that imply corruption in passing. It
1127  * can't be half-dead because only an interrupted VACUUM process can
1128  * leave pages in that state, so we'd definitely have dealt with it
1129  * back when the page was the scanblkno page (half-dead pages are
1130  * always marked fully deleted by _bt_pagedel(), barring corruption).
1131  */
1132  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1133  {
1134  Assert(false);
1135  ereport(LOG,
1136  (errcode(ERRCODE_INDEX_CORRUPTED),
1137  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1138  blkno, scanblkno, RelationGetRelationName(rel))));
1139  _bt_relbuf(rel, buf);
1140  return;
1141  }
1142 
1143  /*
1144  * We may have already processed the page in an earlier call, when the
1145  * page was scanblkno. This happens when the leaf page split occurred
1146  * after the scan began, but before the right sibling page became the
1147  * scanblkno.
1148  *
1149  * Page may also have been deleted by current btvacuumpage() call,
1150  * since _bt_pagedel() sometimes deletes the right sibling page of
1151  * scanblkno in passing (it does so after we decided where to
1152  * backtrack to). We don't need to process this page as a deleted
1153  * page a second time now (in fact, it would be wrong to count it as a
1154  * deleted page in the bulk delete statistics a second time).
1155  */
1156  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1157  {
1158  /* Done with current scanblkno (and all lower split pages) */
1159  _bt_relbuf(rel, buf);
1160  return;
1161  }
1162  }
1163 
1164  if (!opaque || BTPageIsRecyclable(page, heaprel))
1165  {
1166  /* Okay to recycle this page (which could be leaf or internal) */
1167  RecordFreeIndexPage(rel, blkno);
1168  stats->pages_deleted++;
1169  stats->pages_free++;
1170  }
1171  else if (P_ISDELETED(opaque))
1172  {
1173  /*
1174  * Already deleted page (which could be leaf or internal). Can't
1175  * recycle yet.
1176  */
1177  stats->pages_deleted++;
1178  }
1179  else if (P_ISHALFDEAD(opaque))
1180  {
1181  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1182  attempt_pagedel = true;
1183 
1184  /*
1185  * _bt_pagedel() will increment both pages_newly_deleted and
1186  * pages_deleted stats in all cases (barring corruption)
1187  */
1188  }
1189  else if (P_ISLEAF(opaque))
1190  {
1191  OffsetNumber deletable[MaxIndexTuplesPerPage];
1192  int ndeletable;
1193  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1194  int nupdatable;
1195  OffsetNumber offnum,
1196  minoff,
1197  maxoff;
1198  int nhtidsdead,
1199  nhtidslive;
1200 
1201  /*
1202  * Trade in the initial read lock for a full cleanup lock on this
1203  * page. We must get such a lock on every leaf page over the course
1204  * of the vacuum scan, whether or not it actually contains any
1205  * deletable tuples --- see nbtree/README.
1206  */
1207  _bt_upgradelockbufcleanup(rel, buf);
1208 
1209  /*
1210  * Check whether we need to backtrack to earlier pages. What we are
1211  * concerned about is a page split that happened since we started the
1212  * vacuum scan. If the split moved tuples on the right half of the
1213  * split (i.e. the tuples that sort high) to a block that we already
1214  * passed over, then we might have missed the tuples. We need to
1215  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1216  * or deleting scanblkno page below!)
1217  */
1218  if (vstate->cycleid != 0 &&
1219  opaque->btpo_cycleid == vstate->cycleid &&
1220  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1221  !P_RIGHTMOST(opaque) &&
1222  opaque->btpo_next < scanblkno)
1223  backtrack_to = opaque->btpo_next;
1224 
1225  ndeletable = 0;
1226  nupdatable = 0;
1227  minoff = P_FIRSTDATAKEY(opaque);
1228  maxoff = PageGetMaxOffsetNumber(page);
1229  nhtidsdead = 0;
1230  nhtidslive = 0;
1231  if (callback)
1232  {
1233  /* btbulkdelete callback tells us what to delete (or update) */
1234  for (offnum = minoff;
1235  offnum <= maxoff;
1236  offnum = OffsetNumberNext(offnum))
1237  {
1238  IndexTuple itup;
1239 
1240  itup = (IndexTuple) PageGetItem(page,
1241  PageGetItemId(page, offnum));
1242 
1243  Assert(!BTreeTupleIsPivot(itup));
1244  if (!BTreeTupleIsPosting(itup))
1245  {
1246  /* Regular tuple, standard table TID representation */
1247  if (callback(&itup->t_tid, callback_state))
1248  {
1249  deletable[ndeletable++] = offnum;
1250  nhtidsdead++;
1251  }
1252  else
1253  nhtidslive++;
1254  }
1255  else
1256  {
1257  BTVacuumPosting vacposting;
1258  int nremaining;
1259 
1260  /* Posting list tuple */
1261  vacposting = btreevacuumposting(vstate, itup, offnum,
1262  &nremaining);
1263  if (vacposting == NULL)
1264  {
1265  /*
1266  * All table TIDs from the posting tuple remain, so no
1267  * delete or update required
1268  */
1269  Assert(nremaining == BTreeTupleGetNPosting(itup));
1270  }
1271  else if (nremaining > 0)
1272  {
1273 
1274  /*
1275  * Store metadata about posting list tuple in
1276  * updatable array for entire page. Existing tuple
1277  * will be updated during the later call to
1278  * _bt_delitems_vacuum().
1279  */
1280  Assert(nremaining < BTreeTupleGetNPosting(itup));
1281  updatable[nupdatable++] = vacposting;
1282  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1283  }
1284  else
1285  {
1286  /*
1287  * All table TIDs from the posting list must be
1288  * deleted. We'll delete the index tuple completely
1289  * (no update required).
1290  */
1291  Assert(nremaining == 0);
1292  deletable[ndeletable++] = offnum;
1293  nhtidsdead += BTreeTupleGetNPosting(itup);
1294  pfree(vacposting);
1295  }
1296 
1297  nhtidslive += nremaining;
1298  }
1299  }
1300  }
1301 
1302  /*
1303  * Apply any needed deletes or updates. We issue just one
1304  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1305  */
1306  if (ndeletable > 0 || nupdatable > 0)
1307  {
1308  Assert(nhtidsdead >= ndeletable + nupdatable);
1309  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1310  nupdatable);
1311 
1312  stats->tuples_removed += nhtidsdead;
1313  /* must recompute maxoff */
1314  maxoff = PageGetMaxOffsetNumber(page);
1315 
1316  /* can't leak memory here */
1317  for (int i = 0; i < nupdatable; i++)
1318  pfree(updatable[i]);
1319  }
1320  else
1321  {
1322  /*
1323  * If the leaf page has been split during this vacuum cycle, it
1324  * seems worth expending a write to clear btpo_cycleid even if we
1325  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1326  * takes care of this.) This ensures we won't process the page
1327  * again.
1328  *
1329  * We treat this like a hint-bit update because there's no need to
1330  * WAL-log it.
1331  */
1332  Assert(nhtidsdead == 0);
1333  if (vstate->cycleid != 0 &&
1334  opaque->btpo_cycleid == vstate->cycleid)
1335  {
1336  opaque->btpo_cycleid = 0;
1337  MarkBufferDirtyHint(buf, true);
1338  }
1339  }
1340 
1341  /*
1342  * If the leaf page is now empty, try to delete it; else count the
1343  * live tuples (live table TIDs in posting lists are counted as
1344  * separate live tuples). We don't delete when backtracking, though,
1345  * since that would require teaching _bt_pagedel() about backtracking
1346  * (doesn't seem worth adding more complexity to deal with that).
1347  *
1348  * We don't count the number of live TIDs during cleanup-only calls to
1349  * btvacuumscan (i.e. when callback is not set). We count the number
1350  * of index tuples directly instead. This avoids the expense of
1351  * directly examining all of the tuples on each page. VACUUM will
1352  * treat num_index_tuples as an estimate in cleanup-only case, so it
1353  * doesn't matter that this underestimates num_index_tuples
1354  * significantly in some cases.
1355  */
1356  if (minoff > maxoff)
1357  attempt_pagedel = (blkno == scanblkno);
1358  else if (callback)
1359  stats->num_index_tuples += nhtidslive;
1360  else
1361  stats->num_index_tuples += maxoff - minoff + 1;
1362 
1363  Assert(!attempt_pagedel || nhtidslive == 0);
1364  }
1365 
1366  if (attempt_pagedel)
1367  {
1368  MemoryContext oldcontext;
1369 
1370  /* Run pagedel in a temp context to avoid memory leakage */
1371  MemoryContextReset(vstate->pagedelcontext);
1372  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1373 
1374  /*
1375  * _bt_pagedel maintains the bulk delete stats on our behalf;
1376  * pages_newly_deleted and pages_deleted are likely to be incremented
1377  * during call
1378  */
1379  Assert(blkno == scanblkno);
1380  _bt_pagedel(rel, buf, vstate);
1381 
1382  MemoryContextSwitchTo(oldcontext);
1383  /* pagedel released buffer, so we shouldn't */
1384  }
1385  else
1386  _bt_relbuf(rel, buf);
1387 
1388  if (backtrack_to != P_NONE)
1389  {
1390  blkno = backtrack_to;
1391  goto backtrack;
1392  }
1393 }
1394 
1395 /*
1396  * btreevacuumposting --- determine TIDs still needed in posting list
1397  *
1398  * Returns metadata describing how to build replacement tuple without the TIDs
1399  * that VACUUM needs to delete. Returned value is NULL in the common case
1400  * where no changes are needed to caller's posting list tuple (we avoid
1401  * allocating memory here as an optimization).
1402  *
1403  * The number of TIDs that should remain in the posting list tuple is set for
1404  * caller in *nremaining.
1405  */
1406 static BTVacuumPosting
1407 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1408  OffsetNumber updatedoffset, int *nremaining)
1409 {
1410  int live = 0;
1411  int nitem = BTreeTupleGetNPosting(posting);
1412  ItemPointer items = BTreeTupleGetPosting(posting);
1413  BTVacuumPosting vacposting = NULL;
1414 
1415  for (int i = 0; i < nitem; i++)
1416  {
1417  if (!vstate->callback(items + i, vstate->callback_state))
1418  {
1419  /* Live table TID */
1420  live++;
1421  }
1422  else if (vacposting == NULL)
1423  {
1424  /*
1425  * First dead table TID encountered.
1426  *
1427  * It's now clear that we need to delete one or more dead table
1428  * TIDs, so start maintaining metadata describing how to update
1429  * existing posting list tuple.
1430  */
1431  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1432  nitem * sizeof(uint16));
1433 
1434  vacposting->itup = posting;
1435  vacposting->updatedoffset = updatedoffset;
1436  vacposting->ndeletedtids = 0;
1437  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1438  }
1439  else
1440  {
1441  /* Second or subsequent dead table TID */
1442  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1443  }
1444  }
1445 
1446  *nremaining = live;
1447  return vacposting;
1448 }
1449 
1450 /*
1451  * btcanreturn() -- Check whether btree indexes support index-only scans.
1452  *
1453  * btrees always do, so this is trivial.
1454  */
1455 bool
1456 btcanreturn(Relation index, int attno)
1457 {
1458  return true;
1459 }
1460 
1461 /*
1462  * btgettreeheight() -- Compute tree height for use by btcostestimate().
1463  */
1464 int
1465 btgettreeheight(Relation rel)
1466 {
1467  return _bt_getrootheight(rel);
1468 }
Definition: vacuum.h:55