PostgreSQL Source Code (git master)
nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "commands/progress.h"
27 #include "commands/vacuum.h"
28 #include "miscadmin.h"
29 #include "nodes/execnodes.h"
30 #include "pgstat.h"
31 #include "postmaster/autovacuum.h"
32 #include "storage/condition_variable.h"
33 #include "storage/indexfsm.h"
34 #include "storage/ipc.h"
35 #include "storage/lmgr.h"
36 #include "storage/smgr.h"
37 #include "utils/builtins.h"
38 #include "utils/index_selfuncs.h"
39 #include "utils/memutils.h"
40 
41 
42 /*
43  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44  *
45  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
46  * a new page; others must wait.
47  *
48  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
49  * to a new page; some process can start doing that.
50  *
51  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
52  * We reach this state once for every distinct combination of array keys.
53  */
54 typedef enum
55 {
56  BTPARALLEL_NOT_INITIALIZED,
57  BTPARALLEL_ADVANCING,
58  BTPARALLEL_IDLE,
59  BTPARALLEL_DONE
60 } BTPS_State;
61 
62 /*
63  * BTParallelScanDescData contains btree specific shared information required
64  * for parallel scan.
65  */
66 typedef struct BTParallelScanDescData
67 {
68  BlockNumber btps_scanPage; /* latest or next page to be scanned */
69  BTPS_State btps_pageStatus; /* indicates whether next page is
70  * available for scan. see above for
71  * possible states of parallel scan. */
72  int btps_arrayKeyCount; /* count indicating number of array scan
73  * keys processed by parallel scan */
74  slock_t btps_mutex; /* protects above variables */
75  ConditionVariable btps_cv; /* used to synchronize parallel scan */
76 } BTParallelScanDescData;
77 
78 typedef struct BTParallelScanDescData *BTParallelScanDesc;
79 
80 
81 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
82  IndexBulkDeleteCallback callback, void *callback_state,
83  BTCycleId cycleid);
84 static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
85 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
86  IndexTuple posting,
87  OffsetNumber updatedoffset,
88  int *nremaining);
89 
90 
91 /*
92  * Btree handler function: return IndexAmRoutine with access method parameters
93  * and callbacks.
94  */
95 Datum
96 bthandler(PG_FUNCTION_ARGS)
97 {
98  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
99 
100  amroutine->amstrategies = BTMaxStrategyNumber;
101  amroutine->amsupport = BTNProcs;
102  amroutine->amoptsprocnum = BTOPTIONS_PROC;
103  amroutine->amcanorder = true;
104  amroutine->amcanorderbyop = false;
105  amroutine->amcanbackward = true;
106  amroutine->amcanunique = true;
107  amroutine->amcanmulticol = true;
108  amroutine->amoptionalkey = true;
109  amroutine->amsearcharray = true;
110  amroutine->amsearchnulls = true;
111  amroutine->amstorage = false;
112  amroutine->amclusterable = true;
113  amroutine->ampredlocks = true;
114  amroutine->amcanparallel = true;
115  amroutine->amcaninclude = true;
116  amroutine->amusemaintenanceworkmem = false;
117  amroutine->amsummarizing = false;
118  amroutine->amparallelvacuumoptions =
119  VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
120  amroutine->amkeytype = InvalidOid;
121 
122  amroutine->ambuild = btbuild;
123  amroutine->ambuildempty = btbuildempty;
124  amroutine->aminsert = btinsert;
125  amroutine->ambulkdelete = btbulkdelete;
126  amroutine->amvacuumcleanup = btvacuumcleanup;
127  amroutine->amcanreturn = btcanreturn;
128  amroutine->amcostestimate = btcostestimate;
129  amroutine->amoptions = btoptions;
130  amroutine->amproperty = btproperty;
131  amroutine->ambuildphasename = btbuildphasename;
132  amroutine->amvalidate = btvalidate;
133  amroutine->amadjustmembers = btadjustmembers;
134  amroutine->ambeginscan = btbeginscan;
135  amroutine->amrescan = btrescan;
136  amroutine->amgettuple = btgettuple;
137  amroutine->amgetbitmap = btgetbitmap;
138  amroutine->amendscan = btendscan;
139  amroutine->ammarkpos = btmarkpos;
140  amroutine->amrestrpos = btrestrpos;
141  amroutine->amestimateparallelscan = btestimateparallelscan;
142  amroutine->aminitparallelscan = btinitparallelscan;
143  amroutine->amparallelrescan = btparallelrescan;
144 
145  PG_RETURN_POINTER(amroutine);
146 }
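/*
 * Editorial sketch (not part of nbtree.c): roughly how core code turns the
 * pg_am "amhandler" OID into the IndexAmRoutine that bthandler() builds
 * above.  This loosely mirrors GetIndexAmRoutine() in amapi.c, simplified;
 * the function name below is invented for illustration.
 */
static IndexAmRoutine *
get_index_am_routine_sketch(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index access method handler %u did not return an IndexAmRoutine struct",
			 amhandler);

	return routine;
}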
147 
148 /*
149  * btbuildempty() -- build an empty btree index in the initialization fork
150  */
151 void
152 btbuildempty(Relation index)
153 {
154  bool allequalimage = _bt_allequalimage(index, false);
155  Buffer metabuf;
156  Page metapage;
157 
158  /*
159  * Initialize the metapage.
160  *
161  * Regular index build bypasses the buffer manager and uses smgr functions
162  * directly, with an smgrimmedsync() call at the end. That makes sense
163  * when the index is large, but for an empty index, it's better to use the
164  * buffer cache to avoid the smgrimmedsync().
165  */
166  metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
167  Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
168  _bt_lockbuf(index, metabuf, BT_WRITE);
169 
170  START_CRIT_SECTION();
171 
172  metapage = BufferGetPage(metabuf);
173  _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
174  MarkBufferDirty(metabuf);
175  log_newpage_buffer(metabuf, true);
176 
177  END_CRIT_SECTION();
178 
179  _bt_unlockbuf(index, metabuf);
180  ReleaseBuffer(metabuf);
181 }
182 
183 /*
184  * btinsert() -- insert an index tuple into a btree.
185  *
186  * Descend the tree recursively, find the appropriate location for our
187  * new tuple, and put it there.
188  */
189 bool
190 btinsert(Relation rel, Datum *values, bool *isnull,
191  ItemPointer ht_ctid, Relation heapRel,
192  IndexUniqueCheck checkUnique,
193  bool indexUnchanged,
194  IndexInfo *indexInfo)
195 {
196  bool result;
197  IndexTuple itup;
198 
199  /* generate an index tuple */
200  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
201  itup->t_tid = *ht_ctid;
202 
203  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
204 
205  pfree(itup);
206 
207  return result;
208 }
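/*
 * Editorial sketch (not part of nbtree.c): btinsert() is not called directly;
 * it is reached through the aminsert callback installed by bthandler().  The
 * shape of that dispatch, loosely following index_insert() in indexam.c:
 */
static bool
index_insert_dispatch_sketch(Relation indexRelation, Datum *values, bool *isnull,
							 ItemPointer heap_t_ctid, Relation heapRelation,
							 IndexUniqueCheck checkUnique, bool indexUnchanged,
							 IndexInfo *indexInfo)
{
	/* for a btree index, rd_indam->aminsert points at btinsert() above */
	return indexRelation->rd_indam->aminsert(indexRelation, values, isnull,
											 heap_t_ctid, heapRelation,
											 checkUnique, indexUnchanged,
											 indexInfo);
}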
209 
210 /*
211  * btgettuple() -- Get the next tuple in the scan.
212  */
213 bool
214 btgettuple(IndexScanDesc scan, ScanDirection dir)
215 {
216  BTScanOpaque so = (BTScanOpaque) scan->opaque;
217  bool res;
218 
219  /* btree indexes are never lossy */
220  scan->xs_recheck = false;
221 
222  /*
223  * If we have any array keys, initialize them during first call for a
224  * scan. We can't do this in btrescan because we don't know the scan
225  * direction at that time.
226  */
227  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
228  {
229  /* punt if we have any unsatisfiable array keys */
230  if (so->numArrayKeys < 0)
231  return false;
232 
233  _bt_start_array_keys(scan, dir);
234  }
235 
236  /* This loop handles advancing to the next array elements, if any */
237  do
238  {
239  /*
240  * If we've already initialized this scan, we can just advance it in
241  * the appropriate direction. If we haven't done so yet, we call
242  * _bt_first() to get the first item in the scan.
243  */
244  if (!BTScanPosIsValid(so->currPos))
245  res = _bt_first(scan, dir);
246  else
247  {
248  /*
249  * Check to see if we should kill the previously-fetched tuple.
250  */
251  if (scan->kill_prior_tuple)
252  {
253  /*
254  * Yes, remember it for later. (We'll deal with all such
255  * tuples at once right before leaving the index page.) The
256  * test for numKilled overrun is not just paranoia: if the
257  * caller reverses direction in the indexscan then the same
258  * item might get entered multiple times. It's not worth
259  * trying to optimize that, so we don't detect it, but instead
260  * just forget any excess entries.
261  */
262  if (so->killedItems == NULL)
263  so->killedItems = (int *)
264  palloc(MaxTIDsPerBTreePage * sizeof(int));
265  if (so->numKilled < MaxTIDsPerBTreePage)
266  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267  }
268 
269  /*
270  * Now continue the scan.
271  */
272  res = _bt_next(scan, dir);
273  }
274 
275  /* If we have a tuple, return it ... */
276  if (res)
277  break;
278  /* ... otherwise see if we have more array keys to deal with */
279  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
280 
281  return res;
282 }
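/*
 * Editorial sketch (not part of nbtree.c): the shape of the loop that drives
 * btgettuple() through the amgettuple callback, loosely modeled on
 * index_getnext_tid() callers.  Heap access and visibility checks are
 * omitted; the function name is invented for illustration.
 */
static int64
count_matches_sketch(IndexScanDesc scan, ScanDirection dir)
{
	int64		nmatches = 0;

	while (scan->indexRelation->rd_indam->amgettuple(scan, dir))
	{
		/* scan->xs_heaptid now identifies the matching table row */
		nmatches++;

		/*
		 * A real caller would fetch and test the heap tuple here, and set
		 * scan->kill_prior_tuple = true when the row is dead to everyone, so
		 * that btgettuple() can remember it in so->killedItems.
		 */
		scan->kill_prior_tuple = false;
	}

	return nmatches;
}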
283 
284 /*
285  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
286  */
287 int64
288 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
289 {
290  BTScanOpaque so = (BTScanOpaque) scan->opaque;
291  int64 ntids = 0;
292  ItemPointer heapTid;
293 
294  /*
295  * If we have any array keys, initialize them.
296  */
297  if (so->numArrayKeys)
298  {
299  /* punt if we have any unsatisfiable array keys */
300  if (so->numArrayKeys < 0)
301  return ntids;
302 
303  _bt_start_array_keys(scan, ForwardScanDirection);
304  }
305 
306  /* This loop handles advancing to the next array elements, if any */
307  do
308  {
309  /* Fetch the first page & tuple */
310  if (_bt_first(scan, ForwardScanDirection))
311  {
312  /* Save tuple ID, and continue scanning */
313  heapTid = &scan->xs_heaptid;
314  tbm_add_tuples(tbm, heapTid, 1, false);
315  ntids++;
316 
317  for (;;)
318  {
319  /*
320  * Advance to next tuple within page. This is the same as the
321  * easy case in _bt_next().
322  */
323  if (++so->currPos.itemIndex > so->currPos.lastItem)
324  {
325  /* let _bt_next do the heavy lifting */
326  if (!_bt_next(scan, ForwardScanDirection))
327  break;
328  }
329 
330  /* Save tuple ID, and continue scanning */
331  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
332  tbm_add_tuples(tbm, heapTid, 1, false);
333  ntids++;
334  }
335  }
336  /* Now see if we have more array keys to deal with */
337  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
338 
339  return ntids;
340 }
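/*
 * Editorial sketch (not part of nbtree.c): how a bitmap index scan uses the
 * amgetbitmap callback above.  Loosely follows nodeBitmapIndexscan.c; the
 * bitmap heap scan's iteration over the result is omitted, and the function
 * name is invented for illustration.
 */
static int64
collect_bitmap_sketch(IndexScanDesc scan, long work_mem_bytes)
{
	TIDBitmap  *tbm = tbm_create(work_mem_bytes, NULL);
	int64		ntids;

	/* for a btree index, amgetbitmap points at btgetbitmap() above */
	ntids = scan->indexRelation->rd_indam->amgetbitmap(scan, tbm);

	/* ... hand tbm to the bitmap heap scan ... */
	tbm_free(tbm);

	return ntids;
}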
341 
342 /*
343  * btbeginscan() -- start a scan on a btree index
344  */
345 IndexScanDesc
346 btbeginscan(Relation rel, int nkeys, int norderbys)
347 {
348  IndexScanDesc scan;
349  BTScanOpaque so;
350 
351  /* no order by operators allowed */
352  Assert(norderbys == 0);
353 
354  /* get the scan */
355  scan = RelationGetIndexScan(rel, nkeys, norderbys);
356 
357  /* allocate private workspace */
358  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
359  BTScanPosInvalidate(so->currPos);
360  BTScanPosInvalidate(so->markPos);
361  if (scan->numberOfKeys > 0)
362  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
363  else
364  so->keyData = NULL;
365 
366  so->arrayKeyData = NULL; /* assume no array keys for now */
367  so->numArrayKeys = 0;
368  so->arrayKeys = NULL;
369  so->arrayContext = NULL;
370 
371  so->killedItems = NULL; /* until needed */
372  so->numKilled = 0;
373 
374  /*
375  * We don't know yet whether the scan will be index-only, so we do not
376  * allocate the tuple workspace arrays until btrescan. However, we set up
377  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
378  */
379  so->currTuples = so->markTuples = NULL;
380 
381  scan->xs_itupdesc = RelationGetDescr(rel);
382 
383  scan->opaque = so;
384 
385  return scan;
386 }
387 
388 /*
389  * btrescan() -- rescan an index relation
390  */
391 void
392 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
393  ScanKey orderbys, int norderbys)
394 {
395  BTScanOpaque so = (BTScanOpaque) scan->opaque;
396 
397  /* we aren't holding any read locks, but gotta drop the pins */
398  if (BTScanPosIsValid(so->currPos))
399  {
400  /* Before leaving current page, deal with any killed items */
401  if (so->numKilled > 0)
402  _bt_killitems(scan);
403  BTScanPosUnpinIfPinned(so->currPos);
404  BTScanPosInvalidate(so->currPos);
405  }
406 
407  so->markItemIndex = -1;
408  so->arrayKeyCount = 0;
409  BTScanPosUnpinIfPinned(so->markPos);
410  BTScanPosInvalidate(so->markPos);
411 
412  /*
413  * Allocate tuple workspace arrays, if needed for an index-only scan and
414  * not already done in a previous rescan call. To save on palloc
415  * overhead, both workspaces are allocated as one palloc block; only this
416  * function and btendscan know that.
417  *
418  * NOTE: this data structure also makes it safe to return data from a
419  * "name" column, even though btree name_ops uses an underlying storage
420  * datatype of cstring. The risk there is that "name" is supposed to be
421  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
422  * However, since we only return data out of tuples sitting in the
423  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
424  * data out of the markTuples array --- running off the end of memory for
425  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
426  * adding special-case treatment for name_ops elsewhere.
427  */
428  if (scan->xs_want_itup && so->currTuples == NULL)
429  {
430  so->currTuples = (char *) palloc(BLCKSZ * 2);
431  so->markTuples = so->currTuples + BLCKSZ;
432  }
433 
434  /*
435  * Reset the scan keys
436  */
437  if (scankey && scan->numberOfKeys > 0)
438  memmove(scan->keyData,
439  scankey,
440  scan->numberOfKeys * sizeof(ScanKeyData));
441  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
442 
443  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
444  _bt_preprocess_array_keys(scan);
445 }
446 
447 /*
448  * btendscan() -- close down a scan
449  */
450 void
451 btendscan(IndexScanDesc scan)
452 {
453  BTScanOpaque so = (BTScanOpaque) scan->opaque;
454 
455  /* we aren't holding any read locks, but gotta drop the pins */
456  if (BTScanPosIsValid(so->currPos))
457  {
458  /* Before leaving current page, deal with any killed items */
459  if (so->numKilled > 0)
460  _bt_killitems(scan);
461  BTScanPosUnpinIfPinned(so->currPos);
462  }
463 
464  so->markItemIndex = -1;
465  BTScanPosUnpinIfPinned(so->markPos);
466 
467  /* No need to invalidate positions, the RAM is about to be freed. */
468 
469  /* Release storage */
470  if (so->keyData != NULL)
471  pfree(so->keyData);
472  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
473  if (so->arrayContext != NULL)
474  MemoryContextDelete(so->arrayContext);
475  if (so->killedItems != NULL)
476  pfree(so->killedItems);
477  if (so->currTuples != NULL)
478  pfree(so->currTuples);
479  /* so->markTuples should not be pfree'd, see btrescan */
480  pfree(so);
481 }
482 
483 /*
484  * btmarkpos() -- save current scan position
485  */
486 void
487 btmarkpos(IndexScanDesc scan)
488 {
489  BTScanOpaque so = (BTScanOpaque) scan->opaque;
490 
491  /* There may be an old mark with a pin (but no lock). */
492  BTScanPosUnpinIfPinned(so->markPos);
493 
494  /*
495  * Just record the current itemIndex. If we later step to next page
496  * before releasing the marked position, _bt_steppage makes a full copy of
497  * the currPos struct in markPos. If (as often happens) the mark is moved
498  * before we leave the page, we don't have to do that work.
499  */
500  if (BTScanPosIsValid(so->currPos))
501  so->markItemIndex = so->currPos.itemIndex;
502  else
503  {
504  BTScanPosInvalidate(so->markPos);
505  so->markItemIndex = -1;
506  }
507 
508  /* Also record the current positions of any array keys */
509  if (so->numArrayKeys)
510  _bt_mark_array_keys(scan);
511 }
512 
513 /*
514  * btrestrpos() -- restore scan to last saved position
515  */
516 void
517 btrestrpos(IndexScanDesc scan)
518 {
519  BTScanOpaque so = (BTScanOpaque) scan->opaque;
520 
521  /* Restore the marked positions of any array keys */
522  if (so->numArrayKeys)
523  _bt_restore_array_keys(scan);
524 
525  if (so->markItemIndex >= 0)
526  {
527  /*
528  * The scan has never moved to a new page since the last mark. Just
529  * restore the itemIndex.
530  *
531  * NB: In this case we can't count on anything in so->markPos to be
532  * accurate.
533  */
534  so->currPos.itemIndex = so->markItemIndex;
535  }
536  else
537  {
538  /*
539  * The scan moved to a new page after last mark or restore, and we are
540  * now restoring to the marked page. We aren't holding any read
541  * locks, but if we're still holding the pin for the current position,
542  * we must drop it.
543  */
544  if (BTScanPosIsValid(so->currPos))
545  {
546  /* Before leaving current page, deal with any killed items */
547  if (so->numKilled > 0)
548  _bt_killitems(scan);
549  BTScanPosUnpinIfPinned(so->currPos);
550  }
551 
552  if (BTScanPosIsValid(so->markPos))
553  {
554  /* bump pin on mark buffer for assignment to current buffer */
555  if (BTScanPosIsPinned(so->markPos))
556  IncrBufferRefCount(so->markPos.buf);
557  memcpy(&so->currPos, &so->markPos,
558  offsetof(BTScanPosData, items[1]) +
559  so->markPos.lastItem * sizeof(BTScanPosItem));
560  if (so->currTuples)
561  memcpy(so->currTuples, so->markTuples,
562  so->markPos.nextTupleOffset);
563  }
564  else
565  BTScanPosInvalidate(so->currPos);
566 }
567 }
568 
569 /*
570  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
571  */
572 Size
573 btestimateparallelscan(void)
574 {
575  return sizeof(BTParallelScanDescData);
576 }
577 
578 /*
579  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
580  */
581 void
582 btinitparallelscan(void *target)
583 {
584  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
585 
586  SpinLockInit(&bt_target->btps_mutex);
587  bt_target->btps_scanPage = InvalidBlockNumber;
588  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
589  bt_target->btps_arrayKeyCount = 0;
590  ConditionVariableInit(&bt_target->btps_cv);
591 }
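/*
 * Editorial sketch (not part of nbtree.c): how btestimateparallelscan() and
 * btinitparallelscan() fit together when a parallel index scan is set up.
 * Loosely follows index_parallelscan_estimate()/_initialize() in indexam.c;
 * the DSM offset bookkeeping is simplified and the function name is invented.
 */
static void
parallel_btscan_setup_sketch(Relation indexrel, ParallelIndexScanDesc target)
{
	void	   *amtarget = OffsetToPointer(target, target->ps_offset);

	/*
	 * The leader reserved rd_indam->amestimateparallelscan() bytes at
	 * amtarget inside the shared parallel-scan area; for btree that space
	 * holds a BTParallelScanDescData, which aminitparallelscan() (i.e.
	 * btinitparallelscan() above) initializes here.
	 */
	indexrel->rd_indam->aminitparallelscan(amtarget);
}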
592 
593 /*
594  * btparallelrescan() -- reset parallel scan
595  */
596 void
597 btparallelrescan(IndexScanDesc scan)
598 {
599  BTParallelScanDesc btscan;
600  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
601 
602  Assert(parallel_scan);
603 
604  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
605  parallel_scan->ps_offset);
606 
607  /*
608  * In theory, we don't need to acquire the spinlock here, because there
609  * shouldn't be any other workers running at this point, but we do so for
610  * consistency.
611  */
612  SpinLockAcquire(&btscan->btps_mutex);
613  btscan->btps_scanPage = InvalidBlockNumber;
614  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
615  btscan->btps_arrayKeyCount = 0;
616  SpinLockRelease(&btscan->btps_mutex);
617 }
618 
619 /*
620  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
621  * page. Other scans must wait until we call _bt_parallel_release()
622  * or _bt_parallel_done().
623  *
624  * The return value is true if we successfully seized the scan and false
625  * if we did not. The latter case occurs if no pages remain for the current
626  * set of scankeys.
627  *
628  * If the return value is true, *pageno returns the next or current page
629  * of the scan (depending on the scan direction). An invalid block number
630  * means the scan hasn't yet started, and P_NONE means we've reached the end.
631  * The first time a participating process reaches the last page, it will return
632  * true and set *pageno to P_NONE; after that, further attempts to seize the
633  * scan will return false.
634  *
635  * Callers should ignore the value of pageno if the return value is false.
636  */
637 bool
638 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
639 {
640  BTScanOpaque so = (BTScanOpaque) scan->opaque;
641  BTPS_State pageStatus;
642  bool exit_loop = false;
643  bool status = true;
644  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
645  BTParallelScanDesc btscan;
646 
647  *pageno = P_NONE;
648 
649  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
650  parallel_scan->ps_offset);
651 
652  while (1)
653  {
654  SpinLockAcquire(&btscan->btps_mutex);
655  pageStatus = btscan->btps_pageStatus;
656 
657  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
658  {
659  /* Parallel scan has already advanced to a new set of scankeys. */
660  status = false;
661  }
662  else if (pageStatus == BTPARALLEL_DONE)
663  {
664  /*
665  * We're done with this set of scankeys. This may be the end, or
666  * there could be more sets to try.
667  */
668  status = false;
669  }
670  else if (pageStatus != BTPARALLEL_ADVANCING)
671  {
672  /*
673  * We have successfully seized control of the scan for the purpose
674  * of advancing it to a new page!
675  */
676  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
677  *pageno = btscan->btps_scanPage;
678  exit_loop = true;
679  }
680  SpinLockRelease(&btscan->btps_mutex);
681  if (exit_loop || !status)
682  break;
683  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
684  }
685  ConditionVariableCancelSleep();
686 
687  return status;
688 }
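/*
 * Editorial sketch (not part of nbtree.c): the calling pattern for
 * _bt_parallel_seize()/_bt_parallel_release()/_bt_parallel_done(), loosely
 * following how _bt_first() and _bt_readnextpage() in nbtsearch.c use them.
 * The function name is invented and the page-reading work is elided.
 */
static bool
parallel_advance_sketch(IndexScanDesc scan)
{
	BlockNumber blkno;

	/* Block until this worker owns the right to advance the shared scan */
	if (!_bt_parallel_seize(scan, &blkno))
		return false;			/* nothing left for this set of array keys */

	if (blkno == P_NONE)
	{
		/* reached the end of the index; let the other workers know */
		_bt_parallel_done(scan);
		return false;
	}

	if (blkno == InvalidBlockNumber)
	{
		/* scan not started: descend from the root to find the first leaf */
	}
	else
	{
		/* otherwise read block blkno and gather its matching tuples */
	}

	/*
	 * Publish the page the scan should visit next (the leaf's right sibling
	 * for a forward scan), waking one waiting worker.
	 */
	_bt_parallel_release(scan, blkno /* really the next page's block */ );

	return true;
}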
689 
690 /*
691  * _bt_parallel_release() -- Complete the process of advancing the scan to a
692  * new page. We now have the new value btps_scanPage; some other backend
693  * can now begin advancing the scan.
694  */
695 void
696 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
697 {
698  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
699  BTParallelScanDesc btscan;
700 
701  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
702  parallel_scan->ps_offset);
703 
704  SpinLockAcquire(&btscan->btps_mutex);
705  btscan->btps_scanPage = scan_page;
706  btscan->btps_pageStatus = BTPARALLEL_IDLE;
707  SpinLockRelease(&btscan->btps_mutex);
708  ConditionVariableSignal(&btscan->btps_cv);
709 }
710 
711 /*
712  * _bt_parallel_done() -- Mark the parallel scan as complete.
713  *
714  * When there are no pages left to scan, this function should be called to
715  * notify other workers. Otherwise, they might wait forever for the scan to
716  * advance to the next page.
717  */
718 void
719 _bt_parallel_done(IndexScanDesc scan)
720 {
721  BTScanOpaque so = (BTScanOpaque) scan->opaque;
722  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
723  BTParallelScanDesc btscan;
724  bool status_changed = false;
725 
726  /* Do nothing, for non-parallel scans */
727  if (parallel_scan == NULL)
728  return;
729 
730  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
731  parallel_scan->ps_offset);
732 
733  /*
734  * Mark the parallel scan as done for this combination of scan keys,
735  * unless some other process already did so. See also
736  * _bt_advance_array_keys.
737  */
738  SpinLockAcquire(&btscan->btps_mutex);
739  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
740  btscan->btps_pageStatus != BTPARALLEL_DONE)
741  {
742  btscan->btps_pageStatus = BTPARALLEL_DONE;
743  status_changed = true;
744  }
745  SpinLockRelease(&btscan->btps_mutex);
746 
747  /* wake up all the workers associated with this parallel scan */
748  if (status_changed)
749  ConditionVariableBroadcast(&btscan->btps_cv);
750 }
751 
752 /*
753  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
754  * keys.
755  *
756  * Updates the count of array keys processed for both local and parallel
757  * scans.
758  */
759 void
760 _bt_parallel_advance_array_keys(IndexScanDesc scan)
761 {
762  BTScanOpaque so = (BTScanOpaque) scan->opaque;
763  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
764  BTParallelScanDesc btscan;
765 
766  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
767  parallel_scan->ps_offset);
768 
769  so->arrayKeyCount++;
770  SpinLockAcquire(&btscan->btps_mutex);
771  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
772  {
773  btscan->btps_scanPage = InvalidBlockNumber;
774  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
775  btscan->btps_arrayKeyCount++;
776  }
777  SpinLockRelease(&btscan->btps_mutex);
778 }
779 
780 /*
781  * Bulk deletion of all index entries pointing to a set of heap tuples.
782  * The set of target tuples is specified via a callback routine that tells
783  * whether any given heap tuple (identified by ItemPointer) is being deleted.
784  *
785  * Result: a palloc'd struct containing statistical info for VACUUM displays.
786  */
787 IndexBulkDeleteResult *
788 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
789  IndexBulkDeleteCallback callback, void *callback_state)
790 {
791  Relation rel = info->index;
792  BTCycleId cycleid;
793 
794  /* allocate stats if first time through, else re-use existing struct */
795  if (stats == NULL)
796  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
797 
798  /* Establish the vacuum cycle ID to use for this scan */
799  /* The ENSURE stuff ensures we clean up shared memory on failure */
800  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
801  {
802  cycleid = _bt_start_vacuum(rel);
803 
804  btvacuumscan(info, stats, callback, callback_state, cycleid);
805  }
806  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
807  _bt_end_vacuum(rel);
808 
809  return stats;
810 }
811 
812 /*
813  * Post-VACUUM cleanup.
814  *
815  * Result: a palloc'd struct containing statistical info for VACUUM displays.
816  */
817 IndexBulkDeleteResult *
818 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
819 {
820  BlockNumber num_delpages;
821 
822  /* No-op in ANALYZE ONLY mode */
823  if (info->analyze_only)
824  return stats;
825 
826  /*
827  * If btbulkdelete was called, we need not do anything (we just maintain
828  * the information used within _bt_vacuum_needs_cleanup() by calling
829  * _bt_set_cleanup_info() below).
830  *
831  * If btbulkdelete was _not_ called, then we have a choice to make: we
832  * must decide whether or not a btvacuumscan() call is needed now (i.e.
833  * whether the ongoing VACUUM operation can entirely avoid a physical scan
834  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
835  * now.
836  */
837  if (stats == NULL)
838  {
839  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
840  if (!_bt_vacuum_needs_cleanup(info->index))
841  return NULL;
842 
843  /*
844  * Since we aren't going to actually delete any leaf items, there's no
845  * need to go through all the vacuum-cycle-ID pushups here.
846  *
847  * Posting list tuples are a source of inaccuracy for cleanup-only
848  * scans. btvacuumscan() will assume that the number of index tuples
849  * from each page can be used as num_index_tuples, even though
850  * num_index_tuples is supposed to represent the number of TIDs in the
851  * index. This naive approach can underestimate the number of tuples
852  * in the index significantly.
853  *
854  * We handle the problem by making num_index_tuples an estimate in
855  * cleanup-only case.
856  */
857  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
858  btvacuumscan(info, stats, NULL, NULL, 0);
859  stats->estimated_count = true;
860  }
861 
862  /*
863  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
864  *
865  * num_delpages is the number of deleted pages now in the index that were
866  * not safe to place in the FSM to be recycled just yet. num_delpages is
867  * greater than 0 only when _bt_pagedel() actually deleted pages during
868  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
869  * have failed to place any newly deleted pages in the FSM just moments
870  * ago. (Actually, there are edge cases where recycling of the current
871  * VACUUM's newly deleted pages does not even become safe by the time the
872  * next VACUUM comes around. See nbtree/README.)
873  */
874  Assert(stats->pages_deleted >= stats->pages_free);
875  num_delpages = stats->pages_deleted - stats->pages_free;
876  _bt_set_cleanup_info(info->index, num_delpages);
877 
878  /*
879  * It's quite possible for us to be fooled by concurrent page splits into
880  * double-counting some index tuples, so disbelieve any total that exceeds
881  * the underlying heap's count ... if we know that accurately. Otherwise
882  * this might just make matters worse.
883  */
884  if (!info->estimated_count)
885  {
886  if (stats->num_index_tuples > info->num_heap_tuples)
887  stats->num_index_tuples = info->num_heap_tuples;
888  }
889 
890  return stats;
891 }
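/*
 * Editorial example (not part of nbtree.c): the num_delpages arithmetic
 * above with concrete numbers.  The helper below is invented purely for
 * illustration.
 */
static BlockNumber
num_delpages_example_sketch(void)
{
	BlockNumber pages_deleted = 12; /* pages the scan found marked deleted */
	BlockNumber pages_free = 9;		/* of those, already safe to recycle */

	/*
	 * 12 - 9 = 3 deleted-but-not-yet-recyclable pages are recorded in the
	 * metapage for _bt_vacuum_needs_cleanup() to consult next time.
	 */
	return pages_deleted - pages_free;
}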
892 
893 /*
894  * btvacuumscan --- scan the index for VACUUMing purposes
895  *
896  * This combines the functions of looking for leaf tuples that are deletable
897  * according to the vacuum callback, looking for empty pages that can be
898  * deleted, and looking for old deleted pages that can be recycled. Both
899  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
900  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
901  *
902  * The caller is responsible for initially allocating/zeroing a stats struct
903  * and for obtaining a vacuum cycle ID if necessary.
904  */
905 static void
906 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
907  IndexBulkDeleteCallback callback, void *callback_state,
908  BTCycleId cycleid)
909 {
910  Relation rel = info->index;
911  BTVacState vstate;
912  BlockNumber num_pages;
913  BlockNumber scanblkno;
914  bool needLock;
915 
916  /*
917  * Reset fields that track information about the entire index now. This
918  * avoids double-counting in the case where a single VACUUM command
919  * requires multiple scans of the index.
920  *
921  * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
922  * since they track information about the VACUUM command, and so must last
923  * across each call to btvacuumscan().
924  *
925  * (Note that pages_free is treated as state about the whole index, not
926  * the current VACUUM. This is appropriate because RecordFreeIndexPage()
927  * calls are idempotent, and get repeated for the same deleted pages in
928  * some scenarios. The point for us is to track the number of recyclable
929  * pages in the index at the end of the VACUUM command.)
930  */
931  stats->num_pages = 0;
932  stats->num_index_tuples = 0;
933  stats->pages_deleted = 0;
934  stats->pages_free = 0;
935 
936  /* Set up info to pass down to btvacuumpage */
937  vstate.info = info;
938  vstate.stats = stats;
939  vstate.callback = callback;
940  vstate.callback_state = callback_state;
941  vstate.cycleid = cycleid;
942 
943  /* Create a temporary memory context to run _bt_pagedel in */
944  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
945  "_bt_pagedel",
946  ALLOCSET_DEFAULT_SIZES);
947 
948  /* Initialize vstate fields used by _bt_pendingfsm_finalize */
949  vstate.bufsize = 0;
950  vstate.maxbufsize = 0;
951  vstate.pendingpages = NULL;
952  vstate.npendingpages = 0;
953  /* Consider applying _bt_pendingfsm_finalize optimization */
954  _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
955 
956  /*
957  * The outer loop iterates over all index pages except the metapage, in
958  * physical order (we hope the kernel will cooperate in providing
959  * read-ahead for speed). It is critical that we visit all leaf pages,
960  * including ones added after we start the scan, else we might fail to
961  * delete some deletable tuples. Hence, we must repeatedly check the
962  * relation length. We must acquire the relation-extension lock while
963  * doing so to avoid a race condition: if someone else is extending the
964  * relation, there is a window where bufmgr/smgr have created a new
965  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
966  * we manage to scan such a page here, we'll improperly assume it can be
967  * recycled. Taking the lock synchronizes things enough to prevent a
968  * problem: either num_pages won't include the new page, or _bt_getbuf
969  * already has write lock on the buffer and it will be fully initialized
970  * before we can examine it. Also, we need not worry if a page is added
971  * immediately after we look; the page splitting code already has
972  * write-lock on the left page before it adds a right page, so we must
973  * already have processed any tuples due to be moved into such a page.
974  *
975  * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
976  * think the use of the extension lock is still required.
977  *
978  * We can skip locking for new or temp relations, however, since no one
979  * else could be accessing them.
980  */
981  needLock = !RELATION_IS_LOCAL(rel);
982 
983  scanblkno = BTREE_METAPAGE + 1;
984  for (;;)
985  {
986  /* Get the current relation length */
987  if (needLock)
988  LockRelationForExtension(rel, ExclusiveLock);
989  num_pages = RelationGetNumberOfBlocks(rel);
990  if (needLock)
991  UnlockRelationForExtension(rel, ExclusiveLock);
992 
993  if (info->report_progress)
994  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
995  num_pages);
996 
997  /* Quit if we've scanned the whole relation */
998  if (scanblkno >= num_pages)
999  break;
1000  /* Iterate over pages, then loop back to recheck length */
1001  for (; scanblkno < num_pages; scanblkno++)
1002  {
1003  btvacuumpage(&vstate, scanblkno);
1004  if (info->report_progress)
1005  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1006  scanblkno);
1007  }
1008  }
1009 
1010  /* Set statistics num_pages field to final size of index */
1011  stats->num_pages = num_pages;
1012 
1013  MemoryContextDelete(vstate.pagedelcontext);
1014 
1015  /*
1016  * If there were any calls to _bt_pagedel() during scan of the index then
1017  * see if any of the resulting pages can be placed in the FSM now. When
1018  * it's not safe we'll have to leave it up to a future VACUUM operation.
1019  *
1020  * Finally, if we placed any pages in the FSM (either just now or during
1021  * the scan), forcibly update the upper-level FSM pages to ensure that
1022  * searchers can find them.
1023  */
1024  _bt_pendingfsm_finalize(rel, &vstate);
1025  if (stats->pages_free > 0)
1026  IndexFreeSpaceMapVacuum(rel);
1027 }
1028 
1029 /*
1030  * btvacuumpage --- VACUUM one page
1031  *
1032  * This processes a single page for btvacuumscan(). In some cases we must
1033  * backtrack to re-examine and VACUUM pages that were the scanblkno during
1034  * a previous call here. This is how we handle page splits (that happened
1035  * after our cycleid was acquired) whose right half page happened to reuse
1036  * a block that we might have processed at some point before it was
1037  * recycled (i.e. before the page split).
1038  */
1039 static void
1040 btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1041 {
1042  IndexVacuumInfo *info = vstate->info;
1043  IndexBulkDeleteResult *stats = vstate->stats;
1044  IndexBulkDeleteCallback callback = vstate->callback;
1045  void *callback_state = vstate->callback_state;
1046  Relation rel = info->index;
1047  Relation heaprel = info->heaprel;
1048  bool attempt_pagedel;
1049  BlockNumber blkno,
1050  backtrack_to;
1051  Buffer buf;
1052  Page page;
1053  BTPageOpaque opaque;
1054 
1055  blkno = scanblkno;
1056 
1057 backtrack:
1058 
1059  attempt_pagedel = false;
1060  backtrack_to = P_NONE;
1061 
1062  /* call vacuum_delay_point while not holding any buffer lock */
1063  vacuum_delay_point();
1064 
1065  /*
1066  * We can't use _bt_getbuf() here because it always applies
1067  * _bt_checkpage(), which will barf on an all-zero page. We want to
1068  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1069  * buffer access strategy.
1070  */
1071  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1072  info->strategy);
1073  _bt_lockbuf(rel, buf, BT_READ);
1074  page = BufferGetPage(buf);
1075  opaque = NULL;
1076  if (!PageIsNew(page))
1077  {
1078  _bt_checkpage(rel, buf);
1079  opaque = BTPageGetOpaque(page);
1080  }
1081 
1082  Assert(blkno <= scanblkno);
1083  if (blkno != scanblkno)
1084  {
1085  /*
1086  * We're backtracking.
1087  *
1088  * We followed a right link to a sibling leaf page (a page that
1089  * happens to be from a block located before scanblkno). The only
1090  * case we want to do anything with is a live leaf page having the
1091  * current vacuum cycle ID.
1092  *
1093  * The page had better be in a state that's consistent with what we
1094  * expect. Check for conditions that imply corruption in passing. It
1095  * can't be half-dead because only an interrupted VACUUM process can
1096  * leave pages in that state, so we'd definitely have dealt with it
1097  * back when the page was the scanblkno page (half-dead pages are
1098  * always marked fully deleted by _bt_pagedel(), barring corruption).
1099  */
1100  if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1101  {
1102  Assert(false);
1103  ereport(LOG,
1104  (errcode(ERRCODE_INDEX_CORRUPTED),
1105  errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1106  blkno, scanblkno, RelationGetRelationName(rel))));
1107  _bt_relbuf(rel, buf);
1108  return;
1109  }
1110 
1111  /*
1112  * We may have already processed the page in an earlier call, when the
1113  * page was scanblkno. This happens when the leaf page split occurred
1114  * after the scan began, but before the right sibling page became the
1115  * scanblkno.
1116  *
1117  * Page may also have been deleted by current btvacuumpage() call,
1118  * since _bt_pagedel() sometimes deletes the right sibling page of
1119  * scanblkno in passing (it does so after we decided where to
1120  * backtrack to). We don't need to process this page as a deleted
1121  * page a second time now (in fact, it would be wrong to count it as a
1122  * deleted page in the bulk delete statistics a second time).
1123  */
1124  if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1125  {
1126  /* Done with current scanblkno (and all lower split pages) */
1127  _bt_relbuf(rel, buf);
1128  return;
1129  }
1130  }
1131 
1132  if (!opaque || BTPageIsRecyclable(page, heaprel))
1133  {
1134  /* Okay to recycle this page (which could be leaf or internal) */
1135  RecordFreeIndexPage(rel, blkno);
1136  stats->pages_deleted++;
1137  stats->pages_free++;
1138  }
1139  else if (P_ISDELETED(opaque))
1140  {
1141  /*
1142  * Already deleted page (which could be leaf or internal). Can't
1143  * recycle yet.
1144  */
1145  stats->pages_deleted++;
1146  }
1147  else if (P_ISHALFDEAD(opaque))
1148  {
1149  /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1150  attempt_pagedel = true;
1151 
1152  /*
1153  * _bt_pagedel() will increment both pages_newly_deleted and
1154  * pages_deleted stats in all cases (barring corruption)
1155  */
1156  }
1157  else if (P_ISLEAF(opaque))
1158  {
1159  OffsetNumber deletable[MaxIndexTuplesPerPage];
1160  int ndeletable;
1161  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1162  int nupdatable;
1163  OffsetNumber offnum,
1164  minoff,
1165  maxoff;
1166  int nhtidsdead,
1167  nhtidslive;
1168 
1169  /*
1170  * Trade in the initial read lock for a full cleanup lock on this
1171  * page. We must get such a lock on every leaf page over the course
1172  * of the vacuum scan, whether or not it actually contains any
1173  * deletable tuples --- see nbtree/README.
1174  */
1175  _bt_upgradelockbufcleanup(rel, buf);
1176 
1177  /*
1178  * Check whether we need to backtrack to earlier pages. What we are
1179  * concerned about is a page split that happened since we started the
1180  * vacuum scan. If the split moved tuples on the right half of the
1181  * split (i.e. the tuples that sort high) to a block that we already
1182  * passed over, then we might have missed the tuples. We need to
1183  * backtrack now. (Must do this before possibly clearing btpo_cycleid
1184  * or deleting scanblkno page below!)
1185  */
1186  if (vstate->cycleid != 0 &&
1187  opaque->btpo_cycleid == vstate->cycleid &&
1188  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1189  !P_RIGHTMOST(opaque) &&
1190  opaque->btpo_next < scanblkno)
1191  backtrack_to = opaque->btpo_next;
1192 
1193  ndeletable = 0;
1194  nupdatable = 0;
1195  minoff = P_FIRSTDATAKEY(opaque);
1196  maxoff = PageGetMaxOffsetNumber(page);
1197  nhtidsdead = 0;
1198  nhtidslive = 0;
1199  if (callback)
1200  {
1201  /* btbulkdelete callback tells us what to delete (or update) */
1202  for (offnum = minoff;
1203  offnum <= maxoff;
1204  offnum = OffsetNumberNext(offnum))
1205  {
1206  IndexTuple itup;
1207 
1208  itup = (IndexTuple) PageGetItem(page,
1209  PageGetItemId(page, offnum));
1210 
1211  Assert(!BTreeTupleIsPivot(itup));
1212  if (!BTreeTupleIsPosting(itup))
1213  {
1214  /* Regular tuple, standard table TID representation */
1215  if (callback(&itup->t_tid, callback_state))
1216  {
1217  deletable[ndeletable++] = offnum;
1218  nhtidsdead++;
1219  }
1220  else
1221  nhtidslive++;
1222  }
1223  else
1224  {
1225  BTVacuumPosting vacposting;
1226  int nremaining;
1227 
1228  /* Posting list tuple */
1229  vacposting = btreevacuumposting(vstate, itup, offnum,
1230  &nremaining);
1231  if (vacposting == NULL)
1232  {
1233  /*
1234  * All table TIDs from the posting tuple remain, so no
1235  * delete or update required
1236  */
1237  Assert(nremaining == BTreeTupleGetNPosting(itup));
1238  }
1239  else if (nremaining > 0)
1240  {
1241 
1242  /*
1243  * Store metadata about posting list tuple in
1244  * updatable array for entire page. Existing tuple
1245  * will be updated during the later call to
1246  * _bt_delitems_vacuum().
1247  */
1248  Assert(nremaining < BTreeTupleGetNPosting(itup));
1249  updatable[nupdatable++] = vacposting;
1250  nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1251  }
1252  else
1253  {
1254  /*
1255  * All table TIDs from the posting list must be
1256  * deleted. We'll delete the index tuple completely
1257  * (no update required).
1258  */
1259  Assert(nremaining == 0);
1260  deletable[ndeletable++] = offnum;
1261  nhtidsdead += BTreeTupleGetNPosting(itup);
1262  pfree(vacposting);
1263  }
1264 
1265  nhtidslive += nremaining;
1266  }
1267  }
1268  }
1269 
1270  /*
1271  * Apply any needed deletes or updates. We issue just one
1272  * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1273  */
1274  if (ndeletable > 0 || nupdatable > 0)
1275  {
1276  Assert(nhtidsdead >= ndeletable + nupdatable);
1277  _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1278  nupdatable);
1279 
1280  stats->tuples_removed += nhtidsdead;
1281  /* must recompute maxoff */
1282  maxoff = PageGetMaxOffsetNumber(page);
1283 
1284  /* can't leak memory here */
1285  for (int i = 0; i < nupdatable; i++)
1286  pfree(updatable[i]);
1287  }
1288  else
1289  {
1290  /*
1291  * If the leaf page has been split during this vacuum cycle, it
1292  * seems worth expending a write to clear btpo_cycleid even if we
1293  * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1294  * takes care of this.) This ensures we won't process the page
1295  * again.
1296  *
1297  * We treat this like a hint-bit update because there's no need to
1298  * WAL-log it.
1299  */
1300  Assert(nhtidsdead == 0);
1301  if (vstate->cycleid != 0 &&
1302  opaque->btpo_cycleid == vstate->cycleid)
1303  {
1304  opaque->btpo_cycleid = 0;
1305  MarkBufferDirtyHint(buf, true);
1306  }
1307  }
1308 
1309  /*
1310  * If the leaf page is now empty, try to delete it; else count the
1311  * live tuples (live table TIDs in posting lists are counted as
1312  * separate live tuples). We don't delete when backtracking, though,
1313  * since that would require teaching _bt_pagedel() about backtracking
1314  * (doesn't seem worth adding more complexity to deal with that).
1315  *
1316  * We don't count the number of live TIDs during cleanup-only calls to
1317  * btvacuumscan (i.e. when callback is not set). We count the number
1318  * of index tuples directly instead. This avoids the expense of
1319  * directly examining all of the tuples on each page. VACUUM will
1320  * treat num_index_tuples as an estimate in cleanup-only case, so it
1321  * doesn't matter that this underestimates num_index_tuples
1322  * significantly in some cases.
1323  */
1324  if (minoff > maxoff)
1325  attempt_pagedel = (blkno == scanblkno);
1326  else if (callback)
1327  stats->num_index_tuples += nhtidslive;
1328  else
1329  stats->num_index_tuples += maxoff - minoff + 1;
1330 
1331  Assert(!attempt_pagedel || nhtidslive == 0);
1332  }
1333 
1334  if (attempt_pagedel)
1335  {
1336  MemoryContext oldcontext;
1337 
1338  /* Run pagedel in a temp context to avoid memory leakage */
1339  MemoryContextReset(vstate->pagedelcontext);
1340  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1341 
1342  /*
1343  * _bt_pagedel maintains the bulk delete stats on our behalf;
1344  * pages_newly_deleted and pages_deleted are likely to be incremented
1345  * during call
1346  */
1347  Assert(blkno == scanblkno);
1348  _bt_pagedel(rel, buf, vstate);
1349 
1350  MemoryContextSwitchTo(oldcontext);
1351  /* pagedel released buffer, so we shouldn't */
1352  }
1353  else
1354  _bt_relbuf(rel, buf);
1355 
1356  if (backtrack_to != P_NONE)
1357  {
1358  blkno = backtrack_to;
1359  goto backtrack;
1360  }
1361 }
1362 
1363 /*
1364  * btreevacuumposting --- determine TIDs still needed in posting list
1365  *
1366  * Returns metadata describing how to build replacement tuple without the TIDs
1367  * that VACUUM needs to delete. Returned value is NULL in the common case
1368  * where no changes are needed to caller's posting list tuple (we avoid
1369  * allocating memory here as an optimization).
1370  *
1371  * The number of TIDs that should remain in the posting list tuple is set for
1372  * caller in *nremaining.
1373  */
1374 static BTVacuumPosting
1375 btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1376  OffsetNumber updatedoffset, int *nremaining)
1377 {
1378  int live = 0;
1379  int nitem = BTreeTupleGetNPosting(posting);
1380  ItemPointer items = BTreeTupleGetPosting(posting);
1381  BTVacuumPosting vacposting = NULL;
1382 
1383  for (int i = 0; i < nitem; i++)
1384  {
1385  if (!vstate->callback(items + i, vstate->callback_state))
1386  {
1387  /* Live table TID */
1388  live++;
1389  }
1390  else if (vacposting == NULL)
1391  {
1392  /*
1393  * First dead table TID encountered.
1394  *
1395  * It's now clear that we need to delete one or more dead table
1396  * TIDs, so start maintaining metadata describing how to update
1397  * existing posting list tuple.
1398  */
1399  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1400  nitem * sizeof(uint16));
1401 
1402  vacposting->itup = posting;
1403  vacposting->updatedoffset = updatedoffset;
1404  vacposting->ndeletedtids = 0;
1405  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1406  }
1407  else
1408  {
1409  /* Second or subsequent dead table TID */
1410  vacposting->deletetids[vacposting->ndeletedtids++] = i;
1411  }
1412  }
1413 
1414  *nremaining = live;
1415  return vacposting;
1416 }
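/*
 * Editorial example (not part of nbtree.c): what the metadata built above
 * looks like for a concrete posting list.  Given a posting tuple holding
 * heap TIDs {t0, t1, t2, t3} where the callback reports t1 and t3 dead,
 * btreevacuumposting() returns a BTVacuumPosting with ndeletedtids = 2 and
 * deletetids[] = {1, 3}, and sets *nremaining = 2; _bt_delitems_vacuum()
 * later rebuilds the tuple keeping only t0 and t2.  The helper below just
 * restates that relationship; its name is invented for illustration.
 */
static int
surviving_tids_sketch(BTVacuumPosting vacposting)
{
	int			nitem = BTreeTupleGetNPosting(vacposting->itup);

	/* TIDs listed in deletetids[] go away; everything else survives */
	return nitem - vacposting->ndeletedtids;
}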
1417 
1418 /*
1419  * btcanreturn() -- Check whether btree indexes support index-only scans.
1420  *
1421  * btrees always do, so this is trivial.
1422  */
1423 bool
1424 btcanreturn(Relation index, int attno)
1425 {
1426  return true;
1427 }
Definition: xloginsert.c:1225