nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "pgstat.h"
31 #include "postmaster/autovacuum.h"
32 #include "storage/condition_variable.h"
33 #include "storage/indexfsm.h"
34 #include "storage/ipc.h"
35 #include "storage/lmgr.h"
36 #include "storage/smgr.h"
37 #include "utils/builtins.h"
38 #include "utils/index_selfuncs.h"
39 #include "utils/memutils.h"
40 
41 
42 /*
43  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44  *
45  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
46  * a new page; others must wait.
47  *
48  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
49  * to a new page; some process can start doing that.
50  *
51  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
52  * We reach this state once for every distinct combination of array keys.
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_ADVANCING,
58  BTPARALLEL_IDLE,
59  BTPARALLEL_DONE,
60 } BTPS_State;
61 
62 /*
63  * BTParallelScanDescData contains btree specific shared information required
64  * for parallel scan.
65  */
66 typedef struct BTParallelScanDescData
67 {
68  BlockNumber btps_scanPage; /* latest or next page to be scanned */
69  BTPS_State btps_pageStatus; /* indicates whether next page is
70  * available for scan. see above for
71  * possible states of parallel scan. */
72  int btps_arrayKeyCount; /* count indicating number of array scan
73  * keys processed by parallel scan */
74  slock_t btps_mutex; /* protects above variables */
75  ConditionVariable btps_cv; /* used to synchronize parallel scan */
76 } BTParallelScanDescData;
77 
78 typedef struct BTParallelScanDescData *BTParallelScanDesc;
79 
80 
81 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
82  IndexBulkDeleteCallback callback, void *callback_state,
83  BTCycleId cycleid);
84 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
85 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
86  IndexTuple posting,
87  OffsetNumber updatedoffset,
88  int *nremaining);
89 
90 
91 /*
92  * Btree handler function: return IndexAmRoutine with access method parameters
93  * and callbacks.
94  */
95 Datum
96 bthandler(PG_FUNCTION_ARGS)
97 {
98  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
99 
100  amroutine->amstrategies = BTMaxStrategyNumber;
101  amroutine->amsupport = BTNProcs;
102  amroutine->amoptsprocnum = BTOPTIONS_PROC;
103  amroutine->amcanorder = true;
104  amroutine->amcanorderbyop = false;
105  amroutine->amcanbackward = true;
106  amroutine->amcanunique = true;
107  amroutine->amcanmulticol = true;
108  amroutine->amoptionalkey = true;
109  amroutine->amsearcharray = true;
110  amroutine->amsearchnulls = true;
111  amroutine->amstorage = false;
112  amroutine->amclusterable = true;
113  amroutine->ampredlocks = true;
114  amroutine->amcanparallel = true;
115  amroutine->amcanbuildparallel = true;
116  amroutine->amcaninclude = true;
117  amroutine->amusemaintenanceworkmem = false;
118  amroutine->amsummarizing = false;
119  amroutine->amparallelvacuumoptions =
120  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
121  amroutine->amkeytype = InvalidOid;
122 
123  amroutine->ambuild = btbuild;
124  amroutine->ambuildempty = btbuildempty;
125  amroutine->aminsert = btinsert;
126  amroutine->aminsertcleanup = NULL;
127  amroutine->ambulkdelete = btbulkdelete;
128  amroutine->amvacuumcleanup = btvacuumcleanup;
129  amroutine->amcanreturn = btcanreturn;
130  amroutine->amcostestimate = btcostestimate;
131  amroutine->amoptions = btoptions;
132  amroutine->amproperty = btproperty;
133  amroutine->ambuildphasename = btbuildphasename;
134  amroutine->amvalidate = btvalidate;
135  amroutine->amadjustmembers = btadjustmembers;
136  amroutine->ambeginscan = btbeginscan;
137  amroutine->amrescan = btrescan;
138  amroutine->amgettuple = btgettuple;
139  amroutine->amgetbitmap = btgetbitmap;
140  amroutine->amendscan = btendscan;
141  amroutine->ammarkpos = btmarkpos;
142  amroutine->amrestrpos = btrestrpos;
143  amroutine->amestimateparallelscan = btestimateparallelscan;
144  amroutine->aminitparallelscan = btinitparallelscan;
145  amroutine->amparallelrescan = btparallelrescan;
146 
147  PG_RETURN_POINTER(amroutine);
148 }
149 
150 /*
151  * btbuildempty() -- build an empty btree index in the initialization fork
152  */
153 void
154 btbuildempty(Relation index)
155 {
156  bool allequalimage = _bt_allequalimage(index, false);
157  Buffer metabuf;
158  Page metapage;
159 
160  /*
161  * Initialize the metapage.
162  *
163  * Regular index build bypasses the buffer manager and uses smgr functions
164  * directly, with an smgrimmedsync() call at the end. That makes sense
165  * when the index is large, but for an empty index, it's better to use the
166  * buffer cache to avoid the smgrimmedsync().
167  */
170  _bt_lockbuf(index, metabuf, BT_WRITE);
171 
172  START_CRIT_SECTION();
173 
174  metapage = BufferGetPage(metabuf);
175  _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
176  MarkBufferDirty(metabuf);
177  log_newpage_buffer(metabuf, true);
178 
179  END_CRIT_SECTION();
180 
181  _bt_unlockbuf(index, metabuf);
182  ReleaseBuffer(metabuf);
183 }
184 
185 /*
186  * btinsert() -- insert an index tuple into a btree.
187  *
188  * Descend the tree recursively, find the appropriate location for our
189  * new tuple, and put it there.
190  */
191 bool
192 btinsert(Relation rel, Datum *values, bool *isnull,
193  ItemPointer ht_ctid, Relation heapRel,
194  IndexUniqueCheck checkUnique,
195  bool indexUnchanged,
196  IndexInfo *indexInfo)
197 {
198  bool result;
199  IndexTuple itup;
200 
201  /* generate an index tuple */
202  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
203  itup->t_tid = *ht_ctid;
204 
205  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
206 
207  pfree(itup);
208 
209  return result;
210 }
211 
212 /*
213  * btgettuple() -- Get the next tuple in the scan.
214  */
215 bool
216 btgettuple(IndexScanDesc scan, ScanDirection dir)
217 {
218  BTScanOpaque so = (BTScanOpaque) scan->opaque;
219  bool res;
220 
221  /* btree indexes are never lossy */
222  scan->xs_recheck = false;
223 
224  /*
225  * If we have any array keys, initialize them during first call for a
226  * scan. We can't do this in btrescan because we don't know the scan
227  * direction at that time.
228  */
229  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
230  {
231  /* punt if we have any unsatisfiable array keys */
232  if (so->numArrayKeys < 0)
233  return false;
234 
235  _bt_start_array_keys(scan, dir);
236  }
237 
238  /* This loop handles advancing to the next array elements, if any */
239  do
240  {
241  /*
242  * If we've already initialized this scan, we can just advance it in
243  * the appropriate direction. If we haven't done so yet, we call
244  * _bt_first() to get the first item in the scan.
245  */
246  if (!BTScanPosIsValid(so->currPos))
247  res = _bt_first(scan, dir);
248  else
249  {
250  /*
251  * Check to see if we should kill the previously-fetched tuple.
252  */
253  if (scan->kill_prior_tuple)
254  {
255  /*
256  * Yes, remember it for later. (We'll deal with all such
257  * tuples at once right before leaving the index page.) The
258  * test for numKilled overrun is not just paranoia: if the
259  * caller reverses direction in the indexscan then the same
260  * item might get entered multiple times. It's not worth
261  * trying to optimize that, so we don't detect it, but instead
262  * just forget any excess entries.
263  */
264  if (so->killedItems == NULL)
265  so->killedItems = (int *)
266  palloc(MaxTIDsPerBTreePage * sizeof(int));
267  if (so->numKilled < MaxTIDsPerBTreePage)
268  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269  }
270 
271  /*
272  * Now continue the scan.
273  */
274  res = _bt_next(scan, dir);
275  }
276 
277  /* If we have a tuple, return it ... */
278  if (res)
279  break;
280  /* ... otherwise see if we have more array keys to deal with */
281  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
282 
283  return res;
284 }
285 
286 /*
287  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288  */
289 int64
290 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291 {
292  BTScanOpaque so = (BTScanOpaque) scan->opaque;
293  int64 ntids = 0;
294  ItemPointer heapTid;
295 
296  /*
297  * If we have any array keys, initialize them.
298  */
299  if (so->numArrayKeys)
300  {
301  /* punt if we have any unsatisfiable array keys */
302  if (so->numArrayKeys < 0)
303  return ntids;
304 
305  _bt_start_array_keys(scan, ForwardScanDirection);
306  }
307 
308  /* This loop handles advancing to the next array elements, if any */
309  do
310  {
311  /* Fetch the first page & tuple */
312  if (_bt_first(scan, ForwardScanDirection))
313  {
314  /* Save tuple ID, and continue scanning */
315  heapTid = &scan->xs_heaptid;
316  tbm_add_tuples(tbm, heapTid, 1, false);
317  ntids++;
318 
319  for (;;)
320  {
321  /*
322  * Advance to next tuple within page. This is the same as the
323  * easy case in _bt_next().
324  */
325  if (++so->currPos.itemIndex > so->currPos.lastItem)
326  {
327  /* let _bt_next do the heavy lifting */
328  if (!_bt_next(scan, ForwardScanDirection))
329  break;
330  }
331 
332  /* Save tuple ID, and continue scanning */
333  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
334  tbm_add_tuples(tbm, heapTid, 1, false);
335  ntids++;
336  }
337  }
338  /* Now see if we have more array keys to deal with */
339  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
340 
341  return ntids;
342 }
343 
344 /*
345  * btbeginscan() -- start a scan on a btree index
346  */
347 IndexScanDesc
348 btbeginscan(Relation rel, int nkeys, int norderbys)
349 {
350  IndexScanDesc scan;
351  BTScanOpaque so;
352 
353  /* no order by operators allowed */
354  Assert(norderbys == 0);
355 
356  /* get the scan */
357  scan = RelationGetIndexScan(rel, nkeys, norderbys);
358 
359  /* allocate private workspace */
360  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
361  BTScanPosInvalidate(so->currPos);
362  BTScanPosInvalidate(so->markPos);
363  if (scan->numberOfKeys > 0)
364  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
365  else
366  so->keyData = NULL;
367 
368  so->arrayKeyData = NULL; /* assume no array keys for now */
369  so->arraysStarted = false;
370  so->numArrayKeys = 0;
371  so->arrayKeys = NULL;
372  so->arrayContext = NULL;
373 
374  so->killedItems = NULL; /* until needed */
375  so->numKilled = 0;
376 
377  /*
378  * We don't know yet whether the scan will be index-only, so we do not
379  * allocate the tuple workspace arrays until btrescan. However, we set up
380  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
381  */
382  so->currTuples = so->markTuples = NULL;
383 
384  scan->xs_itupdesc = RelationGetDescr(rel);
385 
386  scan->opaque = so;
387 
388  return scan;
389 }
390 
391 /*
392  * btrescan() -- rescan an index relation
393  */
394 void
395 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
396  ScanKey orderbys, int norderbys)
397 {
398  BTScanOpaque so = (BTScanOpaque) scan->opaque;
399 
400  /* we aren't holding any read locks, but gotta drop the pins */
401  if (BTScanPosIsValid(so->currPos))
402  {
403  /* Before leaving current page, deal with any killed items */
404  if (so->numKilled > 0)
405  _bt_killitems(scan);
406  BTScanPosUnpinIfPinned(so->currPos);
407  BTScanPosInvalidate(so->currPos);
408  }
409 
410  so->markItemIndex = -1;
411  so->arrayKeyCount = 0;
412  BTScanPosUnpinIfPinned(so->markPos);
413  BTScanPosInvalidate(so->markPos);
414 
415  /*
416  * Allocate tuple workspace arrays, if needed for an index-only scan and
417  * not already done in a previous rescan call. To save on palloc
418  * overhead, both workspaces are allocated as one palloc block; only this
419  * function and btendscan know that.
420  *
421  * NOTE: this data structure also makes it safe to return data from a
422  * "name" column, even though btree name_ops uses an underlying storage
423  * datatype of cstring. The risk there is that "name" is supposed to be
424  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
425  * However, since we only return data out of tuples sitting in the
426  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
427  * data out of the markTuples array --- running off the end of memory for
428  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
429  * adding special-case treatment for name_ops elsewhere.
430  */
431  if (scan->xs_want_itup && so->currTuples == NULL)
432  {
433  so->currTuples = (char *) palloc(BLCKSZ * 2);
434  so->markTuples = so->currTuples + BLCKSZ;
435  }
436 
437  /*
438  * Reset the scan keys
439  */
440  if (scankey && scan->numberOfKeys > 0)
441  memmove(scan->keyData,
442  scankey,
443  scan->numberOfKeys * sizeof(ScanKeyData));
444  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
445 
446  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
447  _bt_preprocess_array_keys(scan);
448 }
449 
450 /*
451  * btendscan() -- close down a scan
452  */
453 void
454 btendscan(IndexScanDesc scan)
455 {
456  BTScanOpaque so = (BTScanOpaque) scan->opaque;
457 
458  /* we aren't holding any read locks, but gotta drop the pins */
459  if (BTScanPosIsValid(so->currPos))
460  {
461  /* Before leaving current page, deal with any killed items */
462  if (so->numKilled > 0)
463  _bt_killitems(scan);
464  BTScanPosUnpinIfPinned(so->currPos);
465  }
466 
467  so->markItemIndex = -1;
468  BTScanPosUnpinIfPinned(so->markPos);
469 
470  /* No need to invalidate positions, the RAM is about to be freed. */
471 
472  /* Release storage */
473  if (so->keyData != NULL)
474  pfree(so->keyData);
475  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
476  if (so->arrayContext != NULL)
477  MemoryContextDelete(so->arrayContext);
478  if (so->killedItems != NULL)
479  pfree(so->killedItems);
480  if (so->currTuples != NULL)
481  pfree(so->currTuples);
482  /* so->markTuples should not be pfree'd, see btrescan */
483  pfree(so);
484 }
485 
486 /*
487  * btmarkpos() -- save current scan position
488  */
489 void
490 btmarkpos(IndexScanDesc scan)
491 {
492  BTScanOpaque so = (BTScanOpaque) scan->opaque;
493 
494  /* There may be an old mark with a pin (but no lock). */
495  BTScanPosUnpinIfPinned(so->markPos);
496 
497  /*
498  * Just record the current itemIndex. If we later step to next page
499  * before releasing the marked position, _bt_steppage makes a full copy of
500  * the currPos struct in markPos. If (as often happens) the mark is moved
501  * before we leave the page, we don't have to do that work.
502  */
503  if (BTScanPosIsValid(so->currPos))
504  so->markItemIndex = so->currPos.itemIndex;
505  else
506  {
507  BTScanPosInvalidate(so->markPos);
508  so->markItemIndex = -1;
509  }
510 
511  /* Also record the current positions of any array keys */
512  if (so->numArrayKeys)
513  _bt_mark_array_keys(scan);
514 }
515 
516 /*
517  * btrestrpos() -- restore scan to last saved position
518  */
519 void
520 btrestrpos(IndexScanDesc scan)
521 {
522  BTScanOpaque so = (BTScanOpaque) scan->opaque;
523 
524  /* Restore the marked positions of any array keys */
525  if (so->numArrayKeys)
526  _bt_restore_array_keys(scan);
527 
528  if (so->markItemIndex >= 0)
529  {
530  /*
531  * The scan has never moved to a new page since the last mark. Just
532  * restore the itemIndex.
533  *
534  * NB: In this case we can't count on anything in so->markPos to be
535  * accurate.
536  */
537  so->currPos.itemIndex = so->markItemIndex;
538  }
539  else
540  {
541  /*
542  * The scan moved to a new page after last mark or restore, and we are
543  * now restoring to the marked page. We aren't holding any read
544  * locks, but if we're still holding the pin for the current position,
545  * we must drop it.
546  */
547  if (BTScanPosIsValid(so->currPos))
548  {
549  /* Before leaving current page, deal with any killed items */
550  if (so->numKilled > 0)
551  _bt_killitems(scan);
552  BTScanPosUnpinIfPinned(so->currPos);
553  }
554 
555  if (BTScanPosIsValid(so->markPos))
556  {
557  /* bump pin on mark buffer for assignment to current buffer */
558  if (BTScanPosIsPinned(so->markPos))
559  IncrBufferRefCount(so->markPos.buf);
560  memcpy(&so->currPos, &so->markPos,
561  offsetof(BTScanPosData, items[1]) +
562  so->markPos.lastItem * sizeof(BTScanPosItem));
563  if (so->currTuples)
564  memcpy(so->currTuples, so->markTuples,
565  so->markPos.nextTupleOffset);
566  }
567  else
568  BTScanPosInvalidate(so->currPos);
569  }
570 }
571 
572 /*
573  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
574  */
575 Size
576 btestimateparallelscan(void)
577 {
578  return sizeof(BTParallelScanDescData);
579 }
580 
581 /*
582  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
583  */
584 void
585 btinitparallelscan(void *target)
586 {
587  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
588 
589  SpinLockInit(&bt_target->btps_mutex);
590  bt_target->btps_scanPage = InvalidBlockNumber;
591  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
592  bt_target->btps_arrayKeyCount = 0;
593  ConditionVariableInit(&bt_target->btps_cv);
594 }
595 
596 /*
597  * btparallelrescan() -- reset parallel scan
598  */
599 void
600 btparallelrescan(IndexScanDesc scan)
601 {
602  BTParallelScanDesc btscan;
603  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
604 
605  Assert(parallel_scan);
606 
607  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
608  parallel_scan->ps_offset);
609 
610  /*
611  * In theory, we don't need to acquire the spinlock here, because there
612  * shouldn't be any other workers running at this point, but we do so for
613  * consistency.
614  */
615  SpinLockAcquire(&btscan->btps_mutex);
616  btscan->btps_scanPage = InvalidBlockNumber;
617  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
618  btscan->btps_arrayKeyCount = 0;
619  SpinLockRelease(&btscan->btps_mutex);
620 }
621 
622 /*
623  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
624  * page. Other scans must wait until we call _bt_parallel_release()
625  * or _bt_parallel_done().
626  *
627  * The return value is true if we successfully seized the scan and false
628  * if we did not. The latter case occurs if no pages remain for the current
629  * set of scankeys.
630  *
631  * If the return value is true, *pageno returns the next or current page
632  * of the scan (depending on the scan direction). An invalid block number
633  * means the scan hasn't yet started, and P_NONE means we've reached the end.
634  * The first time a participating process reaches the last page, it will return
635  * true and set *pageno to P_NONE; after that, further attempts to seize the
636  * scan will return false.
637  *
638  * Callers should ignore the value of pageno if the return value is false.
639  */
640 bool
641 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
642 {
643  BTScanOpaque so = (BTScanOpaque) scan->opaque;
644  BTPS_State pageStatus;
645  bool exit_loop = false;
646  bool status = true;
647  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
648  BTParallelScanDesc btscan;
649 
650  *pageno = P_NONE;
651 
652  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
653  parallel_scan->ps_offset);
654 
655  while (1)
656  {
657  SpinLockAcquire(&btscan->btps_mutex);
658  pageStatus = btscan->btps_pageStatus;
659 
660  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
661  {
662  /* Parallel scan has already advanced to a new set of scankeys. */
663  status = false;
664  }
665  else if (pageStatus == BTPARALLEL_DONE)
666  {
667  /*
668  * We're done with this set of scankeys. This may be the end, or
669  * there could be more sets to try.
670  */
671  status = false;
672  }
673  else if (pageStatus != BTPARALLEL_ADVANCING)
674  {
675  /*
676  * We have successfully seized control of the scan for the purpose
677  * of advancing it to a new page!
678  */
679  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
680  *pageno = btscan->btps_scanPage;
681  exit_loop = true;
682  }
683  SpinLockRelease(&btscan->btps_mutex);
684  if (exit_loop || !status)
685  break;
686  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
687  }
688  ConditionVariableCancelSleep();
689 
690  return status;
691 }
692 
693 /*
694  * _bt_parallel_release() -- Complete the process of advancing the scan to a
695  * new page. We now have the new value btps_scanPage; some other backend
696  * can now begin advancing the scan.
697  */
698 void
699 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
700 {
701  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
702  BTParallelScanDesc btscan;
703 
704  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
705  parallel_scan->ps_offset);
706 
707  SpinLockAcquire(&btscan->btps_mutex);
708  btscan->btps_scanPage = scan_page;
709  btscan->btps_pageStatus = BTPARALLEL_IDLE;
710  SpinLockRelease(&btscan->btps_mutex);
711  ConditionVariableSignal(&btscan->btps_cv);
712 }
713 
714 /*
715  * _bt_parallel_done() -- Mark the parallel scan as complete.
716  *
717  * When there are no pages left to scan, this function should be called to
718  * notify other workers. Otherwise, they might wait forever for the scan to
719  * advance to the next page.
720  */
721 void
722 _bt_parallel_done(IndexScanDesc scan)
723 {
724  BTScanOpaque so = (BTScanOpaque) scan->opaque;
725  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
726  BTParallelScanDesc btscan;
727  bool status_changed = false;
728 
729  /* Do nothing, for non-parallel scans */
730  if (parallel_scan == NULL)
731  return;
732 
733  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
734  parallel_scan->ps_offset);
735 
736  /*
737  * Mark the parallel scan as done for this combination of scan keys,
738  * unless some other process already did so. See also
739  * _bt_advance_array_keys.
740  */
741  SpinLockAcquire(&btscan->btps_mutex);
742  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
743  btscan->btps_pageStatus != BTPARALLEL_DONE)
744  {
745  btscan->btps_pageStatus = BTPARALLEL_DONE;
746  status_changed = true;
747  }
748  SpinLockRelease(&btscan->btps_mutex);
749 
750  /* wake up all the workers associated with this parallel scan */
751  if (status_changed)
752  ConditionVariableBroadcast(&btscan->btps_cv);
753 }
754 
755 /*
756  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
757  * keys.
758  *
759  * Updates the count of array keys processed for both local and parallel
760  * scans.
761  */
762 void
763 _bt_parallel_advance_array_keys(IndexScanDesc scan)
764 {
765  BTScanOpaque so = (BTScanOpaque) scan->opaque;
766  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
767  BTParallelScanDesc btscan;
768 
769  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
770  parallel_scan->ps_offset);
771 
772  so->arrayKeyCount++;
773  SpinLockAcquire(&btscan->btps_mutex);
774  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
775  {
776  btscan->btps_scanPage = InvalidBlockNumber;
777  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
778  btscan->btps_arrayKeyCount++;
779  }
780  SpinLockRelease(&btscan->btps_mutex);
781 }
782 
783 /*
784  * Bulk deletion of all index entries pointing to a set of heap tuples.
785  * The set of target tuples is specified via a callback routine that tells
786  * whether any given heap tuple (identified by ItemPointer) is being deleted.
787  *
788  * Result: a palloc'd struct containing statistical info for VACUUM displays.
789  */
790 IndexBulkDeleteResult *
791 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
792  IndexBulkDeleteCallback callback, void *callback_state)
793 {
794  Relation rel = info->index;
795  BTCycleId cycleid;
796 
797  /* allocate stats if first time through, else re-use existing struct */
798  if (stats == NULL)
799  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
800 
801  /* Establish the vacuum cycle ID to use for this scan */
802  /* The ENSURE stuff ensures we clean up shared memory on failure */
803  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
804  {
805  cycleid = _bt_start_vacuum(rel);
806 
807  btvacuumscan(info, stats, callback, callback_state, cycleid);
808  }
809  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
810  _bt_end_vacuum(rel);
811 
812  return stats;
813 }
814 
815 /*
816  * Post-VACUUM cleanup.
817  *
818  * Result: a palloc'd struct containing statistical info for VACUUM displays.
819  */
820 IndexBulkDeleteResult *
821 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
822 {
823  BlockNumber num_delpages;
824 
825  /* No-op in ANALYZE ONLY mode */
826  if (info->analyze_only)
827  return stats;
828 
829  /*
830  * If btbulkdelete was called, we need not do anything (we just maintain
831  * the information used within _bt_vacuum_needs_cleanup() by calling
832  * _bt_set_cleanup_info() below).
833  *
834  * If btbulkdelete was _not_ called, then we have a choice to make: we
835  * must decide whether or not a btvacuumscan() call is needed now (i.e.
836  * whether the ongoing VACUUM operation can entirely avoid a physical scan
837  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
838  * now.
839  */
840  if (stats == NULL)
841  {
842  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
843  if (!_bt_vacuum_needs_cleanup(info->index))
844  return NULL;
845 
846  /*
847  * Since we aren't going to actually delete any leaf items, there's no
848  * need to go through all the vacuum-cycle-ID pushups here.
849  *
850  * Posting list tuples are a source of inaccuracy for cleanup-only
851  * scans. btvacuumscan() will assume that the number of index tuples
852  * from each page can be used as num_index_tuples, even though
853  * num_index_tuples is supposed to represent the number of TIDs in the
854  * index. This naive approach can underestimate the number of tuples
855  * in the index significantly.
856  *
857  * We handle the problem by making num_index_tuples an estimate in
858  * cleanup-only case.
859  */
860  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
861  btvacuumscan(info, stats, NULL, NULL, 0);
862  stats->estimated_count = true;
863  }
864 
865  /*
866  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
867  *
868  * num_delpages is the number of deleted pages now in the index that were
869  * not safe to place in the FSM to be recycled just yet. num_delpages is
870  * greater than 0 only when _bt_pagedel() actually deleted pages during
871  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
872  * have failed to place any newly deleted pages in the FSM just moments
873  * ago. (Actually, there are edge cases where recycling of the current
874  * VACUUM's newly deleted pages does not even become safe by the time the
875  * next VACUUM comes around. See nbtree/README.)
876  */
877  Assert(stats->pages_deleted >= stats->pages_free);
878  num_delpages = stats->pages_deleted - stats->pages_free;
879  _bt_set_cleanup_info(info->index, num_delpages);
880 
881  /*
882  * It's quite possible for us to be fooled by concurrent page splits into
883  * double-counting some index tuples, so disbelieve any total that exceeds
884  * the underlying heap's count ... if we know that accurately. Otherwise
885  * this might just make matters worse.
886  */
887  if (!info->estimated_count)
888  {
889  if (stats->num_index_tuples > info->num_heap_tuples)
890  stats->num_index_tuples = info->num_heap_tuples;
891  }
892 
893  return stats;
894 }
895 
896 /*
897  * btvacuumscan --- scan the index for VACUUMing purposes
898  *
899  * This combines the functions of looking for leaf tuples that are deletable
900  * according to the vacuum callback, looking for empty pages that can be
901  * deleted, and looking for old deleted pages that can be recycled. Both
902  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
903  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
904  *
905  * The caller is responsible for initially allocating/zeroing a stats struct
906  * and for obtaining a vacuum cycle ID if necessary.
907  */
908 static void
909 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
910  IndexBulkDeleteCallback callback, void *callback_state,
911  BTCycleId cycleid)
912 {
913  Relation rel = info->index;
914  BTVacState vstate;
915  BlockNumber num_pages;
916  BlockNumber scanblkno;
917  bool needLock;
918 
919  /*
920  * Reset fields that track information about the entire index now. This
921  * avoids double-counting in the case where a single VACUUM command
922  * requires multiple scans of the index.
923  *
924  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
925  * since they track information about the VACUUM command, and so must last
926  * across each call to btvacuumscan().
927  *
928  * (Note that pages_free is treated as state about the whole index, not
929  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
930  * calls are idempotent, and get repeated for the same deleted pages in
931  * some scenarios. The point for us is to track the number of recyclable
932  * pages in the index at the end of the VACUUM command.)
933  */
934  stats->num_pages = 0;
935  stats->num_index_tuples = 0;
936  stats->pages_deleted = 0;
937  stats->pages_free = 0;
938 
939  /* Set up info to pass down to btvacuumpage */
940  vstate.info = info;
941  vstate.stats = stats;
942  vstate.callback = callback;
943  vstate.callback_state = callback_state;
944  vstate.cycleid = cycleid;
945 
946  /* Create a temporary memory context to run _bt_pagedel in */
947  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
948  "_bt_pagedel",
949  ALLOCSET_DEFAULT_SIZES);
950 
951  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
952  vstate.bufsize = 0;
953  vstate.maxbufsize = 0;
954  vstate.pendingpages = NULL;
955  vstate.npendingpages = 0;
956  /* Consider applying _bt_pendingfsm_finalize optimization */
957  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
958 
959  /*
960  * The outer loop iterates over all index pages except the metapage, in
961  * physical order (we hope the kernel will cooperate in providing
962  * read-ahead for speed). It is critical that we visit all leaf pages,
963  * including ones added after we start the scan, else we might fail to
964  * delete some deletable tuples. Hence, we must repeatedly check the
965  * relation length. We must acquire the relation-extension lock while
966  * doing so to avoid a race condition: if someone else is extending the
967  * relation, there is a window where bufmgr/smgr have created a new
968  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
969  * we manage to scan such a page here, we'll improperly assume it can be
970  * recycled. Taking the lock synchronizes things enough to prevent a
971  * problem: either num_pages won't include the new page, or _bt_getbuf
972  * already has write lock on the buffer and it will be fully initialized
973  * before we can examine it. Also, we need not worry if a page is added
974  * immediately after we look; the page splitting code already has
975  * write-lock on the left page before it adds a right page, so we must
976  * already have processed any tuples due to be moved into such a page.
977  *
978  * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
979  * think the use of the extension lock is still required.
980  *
981  * We can skip locking for new or temp relations, however, since no one
982  * else could be accessing them.
983  */
984  needLock = !RELATION_IS_LOCAL(rel);
985 
986  scanblkno = BTREE_METAPAGE + 1;
987  for (;;)
988  {
989  /* Get the current relation length */
990  if (needLock)
991  LockRelationForExtension(rel, ExclusiveLock);
992  num_pages = RelationGetNumberOfBlocks(rel);
993  if (needLock)
994  UnlockRelationForExtension(rel, ExclusiveLock);
995 
996  if (info->report_progress)
997  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
998  num_pages);
999 
1000  /* Quit if we've scanned the whole relation */
1001  if (scanblkno >= num_pages)
1002  break;
1003  /* Iterate over pages, then loop back to recheck length */
1004  for (; scanblkno < num_pages; scanblkno++)
1005  {
1006  btvacuumpage(&vstate, scanblkno);
1007  if (info->report_progress)
1008  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1009  scanblkno);
1010  }
1011  }
1012 
1013  /* Set statistics num_pages field to final size of index */
1014  stats->num_pages = num_pages;
1015 
1016  MemoryContextDelete(vstate.pagedelcontext);
1017 
1018  /*
1019  * If there were any calls to _bt_pagedel() during scan of the index then
1020  * see if any of the resulting pages can be placed in the FSM now. When
1021  * it's not safe we'll have to leave it up to a future VACUUM operation.
1022  *
1023  * Finally, if we placed any pages in the FSM (either just now or during
1024  * the scan), forcibly update the upper-level FSM pages to ensure that
1025  * searchers can find them.
1026  */
1027  _bt_pendingfsm_finalize(rel, &vstate);
1028  if (stats->pages_free > 0)
1029  IndexFreeSpaceMapVacuum(rel);
1030 }
1031 
1032 /*
1033  * btvacuumpage --- VACUUM one page
1034  *
1035  * This processes a single page for btvacuumscan(). In some cases we must
1036  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1037  * a previous call here. This is how we handle page splits (that happened
1038  * after our cycleid was acquired) whose right half page happened to reuse
1039  * a block that we might have processed at some point before it was
1040  * recycled (i.e. before the page split).
1041  */
1042 static void
1043 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1044 {
1045  IndexVacuumInfo *info = vstate->info;
1046  IndexBulkDeleteResult *stats = vstate->stats;
1047  IndexBulkDeleteCallback callback = vstate->callback;
1048  void *callback_state = vstate->callback_state;
1049  Relation rel = info->index;
1050  Relation heaprel = info->heaprel;
1051  bool attempt_pagedel;
1052  BlockNumber blkno,
1053  backtrack_to;
1054  Buffer buf;
1055  Page page;
1056  BTPageOpaque opaque;
1057 
1058  blkno = scanblkno;
1059 
1060 backtrack:
1061 
1062  attempt_pagedel = false;
1063  backtrack_to = P_NONE;
1064 
1065  /* call vacuum_delay_point while not holding any buffer lock */
1066  vacuum_delay_point();
1067 
1068  /*
1069  * We can't use _bt_getbuf() here because it always applies
1070  * _bt_checkpage(), which will barf on an all-zero page. We want to
1071  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1072  * buffer access strategy.
1073  */
1074  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1075  info->strategy);
1076  _bt_lockbuf(rel, buf, BT_READ);
1077  page = BufferGetPage(buf);
1078  opaque = NULL;
1079  if (!PageIsNew(page))
1080  {
1081  _bt_checkpage(rel, buf);
1082  opaque = BTPageGetOpaque(page);
1083  }
1084 
1085  Assert(blkno <= scanblkno);
1086  if (blkno != scanblkno)
1087  {
1088  /*
1089  * We're backtracking.
1090  *
1091  * We followed a right link to a sibling leaf page (a page that
1092  * happens to be from a block located before scanblkno). The only
1093  * case we want to do anything with is a live leaf page having the
1094  * current vacuum cycle ID.
1095  *
1096  * The page had better be in a state that's consistent with what we
1097  * expect. Check for conditions that imply corruption in passing. It
1098  * can't be half-dead because only an interrupted VACUUM process can
1099  * leave pages in that state, so we'd definitely have dealt with it
1100  * back when the page was the scanblkno page (half-dead pages are
1101  * always marked fully deleted by _bt_pagedel(), barring corruption).
1102  */
1103  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1104  {
1105  Assert(false);
1106  ereport(LOG,
1107  (errcode(ERRCODE_INDEX_CORRUPTED),
1108  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1109  blkno, scanblkno, RelationGetRelationName(rel))));
1110  _bt_relbuf(rel, buf);
1111  return;
1112  }
1113 
1114  /*
1115  * We may have already processed the page in an earlier call, when the
1116  * page was scanblkno. This happens when the leaf page split occurred
1117  * after the scan began, but before the right sibling page became the
1118  * scanblkno.
1119  *
1120  * Page may also have been deleted by current btvacuumpage() call,
1121  * since _bt_pagedel() sometimes deletes the right sibling page of
1122  * scanblkno in passing (it does so after we decided where to
1123  * backtrack to). We don't need to process this page as a deleted
1124  * page a second time now (in fact, it would be wrong to count it as a
1125  * deleted page in the bulk delete statistics a second time).
1126  */
1127  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1128  {
1129  /* Done with current scanblkno (and all lower split pages) */
1130  _bt_relbuf(rel, buf);
1131  return;
1132  }
1133  }
1134 
1135  if (!opaque || BTPageIsRecyclable(page, heaprel))
1136  {
1137  /* Okay to recycle this page (which could be leaf or internal) */
1138  RecordFreeIndexPage(rel, blkno);
1139  stats->pages_deleted++;
1140  stats->pages_free++;
1141  }
1142  else if (P_ISDELETED(opaque))
1143  {
1144  /*
1145  * Already deleted page (which could be leaf or internal). Can't
1146  * recycle yet.
1147  */
1148  stats->pages_deleted++;
1149  }
1150  else if (P_ISHALFDEAD(opaque))
1151  {
1152  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1153  attempt_pagedel = true;
1154 
1155  /*
1156  * _bt_pagedel() will increment both pages_newly_deleted and
1157  * pages_deleted stats in all cases (barring corruption)
1158  */
1159  }
1160  else if (P_ISLEAF(opaque))
1161  {
1162  OffsetNumber deletable[MaxIndexTuplesPerPage];
1163  int ndeletable;
1164  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1165  int nupdatable;
1166  OffsetNumber offnum,
1167  minoff,
1168  maxoff;
1169  int nhtidsdead,
1170  nhtidslive;
1171 
1172  /*
1173  * Trade in the initial read lock for a full cleanup lock on this
1174  * page. We must get such a lock on every leaf page over the course
1175  * of the vacuum scan, whether or not it actually contains any
1176  * deletable tuples --- see nbtree/README.
1177  */
1178  _bt_upgradelockbufcleanup(rel, buf);
1179 
1180  /*
1181  * Check whether we need to backtrack to earlier pages. What we are
1182  * concerned about is a page split that happened since we started the
1183  * vacuum scan. If the split moved tuples on the right half of the
1184  * split (i.e. the tuples that sort high) to a block that we already
1185  * passed over, then we might have missed the tuples. We need to
1186  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1187  * or deleting scanblkno page below!)
1188  */
1189  if (vstate->cycleid != 0 &&
1190  opaque->btpo_cycleid == vstate->cycleid &&
1191  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1192  !P_RIGHTMOST(opaque) &&
1193  opaque->btpo_next < scanblkno)
1194  backtrack_to = opaque->btpo_next;
1195 
1196  ndeletable = 0;
1197  nupdatable = 0;
1198  minoff = P_FIRSTDATAKEY(opaque);
1199  maxoff = PageGetMaxOffsetNumber(page);
1200  nhtidsdead = 0;
1201  nhtidslive = 0;
1202  if (callback)
1203  {
1204  /* btbulkdelete callback tells us what to delete (or update) */
1205  for (offnum = minoff;
1206  offnum <= maxoff;
1207  offnum = OffsetNumberNext(offnum))
1208  {
1209  IndexTuple itup;
1210 
1211  itup = (IndexTuple) PageGetItem(page,
1212  PageGetItemId(page, offnum));
1213 
1214  Assert(!BTreeTupleIsPivot(itup));
1215  if (!BTreeTupleIsPosting(itup))
1216  {
1217  /* Regular tuple, standard table TID representation */
1218  if (callback(&itup->t_tid, callback_state))
1219  {
1220  deletable[ndeletable++] = offnum;
1221  nhtidsdead++;
1222  }
1223  else
1224  nhtidslive++;
1225  }
1226  else
1227  {
1228  BTVacuumPosting vacposting;
1229  int nremaining;
1230 
1231  /* Posting list tuple */
1232  vacposting = btreevacuumposting(vstate, itup, offnum,
1233  &nremaining);
1234  if (vacposting == NULL)
1235  {
1236  /*
1237  * All table TIDs from the posting tuple remain, so no
1238  * delete or update required
1239  */
1240  Assert(nremaining == BTreeTupleGetNPosting(itup));
1241  }
1242  else if (nremaining > 0)
1243  {
1244 
1245  /*
1246  * Store metadata about posting list tuple in
1247  * updatable array for entire page. Existing tuple
1248  * will be updated during the later call to
1249  * _bt_delitems_vacuum().
1250  */
1251  Assert(nremaining < BTreeTupleGetNPosting(itup));
1252  updatable[nupdatable++] = vacposting;
1253  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1254  }
1255  else
1256  {
1257  /*
1258  * All table TIDs from the posting list must be
1259  * deleted. We'll delete the index tuple completely
1260  * (no update required).
1261  */
1262  Assert(nremaining == 0);
1263  deletable[ndeletable++] = offnum;
1264  nhtidsdead += BTreeTupleGetNPosting(itup);
1265  pfree(vacposting);
1266  }
1267 
1268  nhtidslive += nremaining;
1269  }
1270  }
1271  }
1272 
1273  /*
1274  * Apply any needed deletes or updates. We issue just one
1275  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1276  */
1277  if (ndeletable > 0 || nupdatable > 0)
1278  {
1279  Assert(nhtidsdead >= ndeletable + nupdatable);
1280  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1281  nupdatable);
1282 
1283  stats->tuples_removed += nhtidsdead;
1284  /* must recompute maxoff */
1285  maxoff = PageGetMaxOffsetNumber(page);
1286 
1287  /* can't leak memory here */
1288  for (int i = 0; i < nupdatable; i++)
1289  pfree(updatable[i]);
1290  }
1291  else
1292  {
1293  /*
1294  * If the leaf page has been split during this vacuum cycle, it
1295  * seems worth expending a write to clear btpo_cycleid even if we
1296  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1297  * takes care of this.) This ensures we won't process the page
1298  * again.
1299  *
1300  * We treat this like a hint-bit update because there's no need to
1301  * WAL-log it.
1302  */
1303  Assert(nhtidsdead == 0);
1304  if (vstate->cycleid != 0 &&
1305  opaque->btpo_cycleid == vstate->cycleid)
1306  {
1307  opaque->btpo_cycleid = 0;
1308  MarkBufferDirtyHint(buf, true);
1309  }
1310  }
1311 
1312  /*
1313  * If the leaf page is now empty, try to delete it; else count the
1314  * live tuples (live table TIDs in posting lists are counted as
1315  * separate live tuples). We don't delete when backtracking, though,
1316  * since that would require teaching _bt_pagedel() about backtracking
1317  * (doesn't seem worth adding more complexity to deal with that).
1318  *
1319  * We don't count the number of live TIDs during cleanup-only calls to
1320  * btvacuumscan (i.e. when callback is not set). We count the number
1321  * of index tuples directly instead. This avoids the expense of
1322  * directly examining all of the tuples on each page. VACUUM will
1323  * treat num_index_tuples as an estimate in cleanup-only case, so it
1324  * doesn't matter that this underestimates num_index_tuples
1325  * significantly in some cases.
1326  */
1327  if (minoff > maxoff)
1328  attempt_pagedel = (blkno == scanblkno);
1329  else if (callback)
1330  stats->num_index_tuples += nhtidslive;
1331  else
1332  stats->num_index_tuples += maxoff - minoff + 1;
1333 
1334  Assert(!attempt_pagedel || nhtidslive == 0);
1335  }
1336 
1337  if (attempt_pagedel)
1338  {
1339  MemoryContext oldcontext;
1340 
1341  /* Run pagedel in a temp context to avoid memory leakage */
1342  MemoryContextReset(vstate->pagedelcontext);
1343  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1344 
1345  /*
1346  * _bt_pagedel maintains the bulk delete stats on our behalf;
1347  * pages_newly_deleted and pages_deleted are likely to be incremented
1348  * during call
1349  */
1350  Assert(blkno == scanblkno);
1351  _bt_pagedel(rel, buf, vstate);
1352 
1353  MemoryContextSwitchTo(oldcontext);
1354  /* pagedel released buffer, so we shouldn't */
1355  }
1356  else
1357  _bt_relbuf(rel, buf);
1358 
1359  if (backtrack_to != P_NONE)
1360  {
1361  blkno = backtrack_to;
1362  goto backtrack;
1363  }
1364 }
1365 
1366 /*
1367  * btreevacuumposting --- determine TIDs still needed in posting list
1368  *
1369  * Returns metadata describing how to build replacement tuple without the TIDs
1370  * that VACUUM needs to delete. Returned value is NULL in the common case
1371  * where no changes are needed to caller's posting list tuple (we avoid
1372  * allocating memory here as an optimization).
1373  *
1374  * The number of TIDs that should remain in the posting list tuple is set for
1375  * caller in *nremaining.
1376  */
1377 static BTVacuumPosting
1378 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1379  OffsetNumber updatedoffset, int *nremaining)
1380 {
1381  int live = 0;
1382  int nitem = BTreeTupleGetNPosting(posting);
1383  ItemPointer items = BTreeTupleGetPosting(posting);
1384  BTVacuumPosting vacposting = NULL;
1385 
1386  for (int i = 0; i < nitem; i++)
1387  {
1388  if (!vstate->callback(items + i, vstate->callback_state))
1389  {
1390  /* Live table TID */
1391  live++;
1392  }
1393  else if (vacposting == NULL)
1394  {
1395  /*
1396  * First dead table TID encountered.
1397  *
1398  * It's now clear that we need to delete one or more dead table
1399  * TIDs, so start maintaining metadata describing how to update
1400  * existing posting list tuple.
1401  */
1402  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1403  nitem * sizeof(uint16));
1404 
1405  vacposting->itup = posting;
1406  vacposting->updatedoffset = updatedoffset;
1407  vacposting->ndeletedtids = 0;
1408  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1409  }
1410  else
1411  {
1412  /* Second or subsequent dead table TID */
1413  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1414  }
1415  }
1416 
1417  *nremaining = live;
1418  return vacposting;
1419 }
1420 
1421 /*
1422  * btcanreturn() -- Check whether btree indexes support index-only scans.
1423  *
1424  * btrees always do, so this is trivial.
1425  */
1426 bool
1427 btcanreturn(Relation index, int attno)
1428 {
1429  return true;
1430 }
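
The IndexAmRoutine that bthandler() fills in above is the dispatch table through which the rest of the server reaches every function in this file. As a rough, self-contained illustration (not part of nbtree.c; it assumes only the standard postgres.h, fmgr, and amapi headers, and the helper name is hypothetical), the sketch below calls the handler directly and logs a few of the capability flags bthandler() sets:

#include "postgres.h"

#include "access/amapi.h"
#include "fmgr.h"
#include "utils/fmgrprotos.h"

/*
 * Sketch only: fetch the btree IndexAmRoutine by invoking bthandler()
 * directly, then log a few of the flags assigned in bthandler() above.
 */
static void
show_btree_am_flags(void)
{
	IndexAmRoutine *amroutine;

	/* bthandler() ignores its argument, so a dummy Datum is fine here */
	amroutine = (IndexAmRoutine *) DatumGetPointer(
		DirectFunctionCall1(bthandler, PointerGetDatum(NULL)));

	elog(LOG, "btree AM: amstrategies=%d amcanorder=%d amcanunique=%d amcaninclude=%d",
		 amroutine->amstrategies, amroutine->amcanorder,
		 amroutine->amcanunique, amroutine->amcaninclude);

	pfree(amroutine);
}

In the server proper this lookup normally goes through GetIndexAmRoutine() and the pg_am catalog rather than a direct handler call; the direct call is used here only to keep the sketch self-contained.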
Definition: xloginsert.c:1238