PostgreSQL Source Code  git master
nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/nbtxlog.h"
23 #include "access/relscan.h"
24 #include "access/xlog.h"
25 #include "commands/progress.h"
26 #include "commands/vacuum.h"
27 #include "miscadmin.h"
28 #include "nodes/execnodes.h"
29 #include "pgstat.h"
30 #include "postmaster/autovacuum.h"
31 #include "storage/condition_variable.h"
32 #include "storage/indexfsm.h"
33 #include "storage/ipc.h"
34 #include "storage/lmgr.h"
35 #include "storage/smgr.h"
36 #include "utils/builtins.h"
37 #include "utils/index_selfuncs.h"
38 #include "utils/memutils.h"
39 
40 
41 /* Working state needed by btvacuumpage */
42 typedef struct
43 {
44  IndexVacuumInfo *info;
45  IndexBulkDeleteResult *stats;
46  IndexBulkDeleteCallback callback;
47  void *callback_state;
48  BTCycleId cycleid;
49  BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
50  BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
51  BlockNumber totFreePages; /* true total # of free pages */
52  TransactionId oldestBtpoXact;
53  MemoryContext pagedelcontext;
54 } BTVacState;
55 
56 /*
57  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
58  *
59  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
60  * a new page; others must wait.
61  *
62  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
63  * to a new page; some process can start doing that.
64  *
65  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
66  * We reach this state once for every distinct combination of array keys.
67  */
68 typedef enum
69 {
70  BTPARALLEL_NOT_INITIALIZED,
71  BTPARALLEL_ADVANCING,
72  BTPARALLEL_IDLE,
73  BTPARALLEL_DONE
74 } BTPS_State;
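/*
 * Added summary (not in nbtree.c) of the state transitions described above:
 *
 *   BTPARALLEL_NOT_INITIALIZED
 *     -> BTPARALLEL_ADVANCING   (a worker seizes the scan, _bt_parallel_seize)
 *     -> BTPARALLEL_IDLE        (it publishes the next page, _bt_parallel_release)
 *     -> BTPARALLEL_ADVANCING   (another worker seizes the scan)
 *     -> ...
 *     -> BTPARALLEL_DONE        (no pages left, _bt_parallel_done)
 *
 * With array keys, _bt_parallel_advance_array_keys() moves DONE back to
 * NOT_INITIALIZED for the next combination of keys.
 */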
75 
76 /*
77  * BTParallelScanDescData contains btree specific shared information required
78  * for parallel scan.
79  */
80 typedef struct BTParallelScanDescData
81 {
82  BlockNumber btps_scanPage; /* latest or next page to be scanned */
83  BTPS_State btps_pageStatus; /* indicates whether next page is
84  * available for scan. see above for
85  * possible states of parallel scan. */
86  int btps_arrayKeyCount; /* count indicating number of array scan
87  * keys processed by parallel scan */
88  slock_t btps_mutex; /* protects above variables */
89  ConditionVariable btps_cv; /* used to synchronize parallel scan */
90 } BTParallelScanDescData;
91 
92 typedef struct BTParallelScanDescData *BTParallelScanDesc;
93 
94 
95 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
96  IndexBulkDeleteCallback callback, void *callback_state,
97  BTCycleId cycleid, TransactionId *oldestBtpoXact);
98 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
99  BlockNumber orig_blkno);
100 
101 
102 /*
103  * Btree handler function: return IndexAmRoutine with access method parameters
104  * and callbacks.
105  */
106 Datum
107 bthandler(PG_FUNCTION_ARGS)
108 {
109  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
110 
111  amroutine->amstrategies = BTMaxStrategyNumber;
112  amroutine->amsupport = BTNProcs;
113  amroutine->amcanorder = true;
114  amroutine->amcanorderbyop = false;
115  amroutine->amcanbackward = true;
116  amroutine->amcanunique = true;
117  amroutine->amcanmulticol = true;
118  amroutine->amoptionalkey = true;
119  amroutine->amsearcharray = true;
120  amroutine->amsearchnulls = true;
121  amroutine->amstorage = false;
122  amroutine->amclusterable = true;
123  amroutine->ampredlocks = true;
124  amroutine->amcanparallel = true;
125  amroutine->amcaninclude = true;
126  amroutine->amkeytype = InvalidOid;
127 
128  amroutine->ambuild = btbuild;
129  amroutine->ambuildempty = btbuildempty;
130  amroutine->aminsert = btinsert;
131  amroutine->ambulkdelete = btbulkdelete;
132  amroutine->amvacuumcleanup = btvacuumcleanup;
133  amroutine->amcanreturn = btcanreturn;
134  amroutine->amcostestimate = btcostestimate;
135  amroutine->amoptions = btoptions;
136  amroutine->amproperty = btproperty;
137  amroutine->ambuildphasename = btbuildphasename;
138  amroutine->amvalidate = btvalidate;
139  amroutine->ambeginscan = btbeginscan;
140  amroutine->amrescan = btrescan;
141  amroutine->amgettuple = btgettuple;
142  amroutine->amgetbitmap = btgetbitmap;
143  amroutine->amendscan = btendscan;
144  amroutine->ammarkpos = btmarkpos;
145  amroutine->amrestrpos = btrestrpos;
146  amroutine->amestimateparallelscan = btestimateparallelscan;
147  amroutine->aminitparallelscan = btinitparallelscan;
148  amroutine->amparallelrescan = btparallelrescan;
149 
150  PG_RETURN_POINTER(amroutine);
151 }
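For reference, here is a minimal sketch (not part of nbtree.c) of how core code obtains the IndexAmRoutine that bthandler() builds, modeled on GetIndexAmRoutine() in src/backend/access/index/amapi.c; the helper name get_am_routine is made up for illustration.

#include "postgres.h"

#include "access/amapi.h"
#include "fmgr.h"
#include "nodes/nodes.h"

/* Invoke an access method handler (e.g. bthandler) and sanity-check the result. */
static IndexAmRoutine *
get_am_routine(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct",
			 amhandler);

	return routine;
}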
152 
153 /*
154  * btbuildempty() -- build an empty btree index in the initialization fork
155  */
156 void
157 btbuildempty(Relation index)
158 {
159  Page metapage;
160 
161  /* Construct metapage. */
162  metapage = (Page) palloc(BLCKSZ);
163  _bt_initmetapage(metapage, P_NONE, 0);
164 
165  /*
166  * Write the page and log it. It might seem that an immediate sync would
167  * be sufficient to guarantee that the file exists on disk, but recovery
168  * itself might remove it while replaying, for example, an
169  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
170  * this even when wal_level=minimal.
171  */
172  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
173  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
174  (char *) metapage, true);
175  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
176  BTREE_METAPAGE, metapage, true);
177 
178  /*
179  * An immediate sync is required even if we xlog'd the page, because the
180  * write did not go through shared_buffers and therefore a concurrent
181  * checkpoint may have moved the redo pointer past our xlog record.
182  */
183  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
184 }
185 
186 /*
187  * btinsert() -- insert an index tuple into a btree.
188  *
189  * Descend the tree recursively, find the appropriate location for our
190  * new tuple, and put it there.
191  */
192 bool
193 btinsert(Relation rel, Datum *values, bool *isnull,
194  ItemPointer ht_ctid, Relation heapRel,
195  IndexUniqueCheck checkUnique,
196  IndexInfo *indexInfo)
197 {
198  bool result;
199  IndexTuple itup;
200 
201  /* generate an index tuple */
202  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
203  itup->t_tid = *ht_ctid;
204 
205  result = _bt_doinsert(rel, itup, checkUnique, heapRel);
206 
207  pfree(itup);
208 
209  return result;
210 }
211 
212 /*
213  * btgettuple() -- Get the next tuple in the scan.
214  */
215 bool
216 btgettuple(IndexScanDesc scan, ScanDirection dir)
217 {
218  BTScanOpaque so = (BTScanOpaque) scan->opaque;
219  bool res;
220 
221  /* btree indexes are never lossy */
222  scan->xs_recheck = false;
223 
224  /*
225  * If we have any array keys, initialize them during first call for a
226  * scan. We can't do this in btrescan because we don't know the scan
227  * direction at that time.
228  */
229  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
230  {
231  /* punt if we have any unsatisfiable array keys */
232  if (so->numArrayKeys < 0)
233  return false;
234 
235  _bt_start_array_keys(scan, dir);
236  }
237 
238  /* This loop handles advancing to the next array elements, if any */
239  do
240  {
241  /*
242  * If we've already initialized this scan, we can just advance it in
243  * the appropriate direction. If we haven't done so yet, we call
244  * _bt_first() to get the first item in the scan.
245  */
246  if (!BTScanPosIsValid(so->currPos))
247  res = _bt_first(scan, dir);
248  else
249  {
250  /*
251  * Check to see if we should kill the previously-fetched tuple.
252  */
253  if (scan->kill_prior_tuple)
254  {
255  /*
256  * Yes, remember it for later. (We'll deal with all such
257  * tuples at once right before leaving the index page.) The
258  * test for numKilled overrun is not just paranoia: if the
259  * caller reverses direction in the indexscan then the same
260  * item might get entered multiple times. It's not worth
261  * trying to optimize that, so we don't detect it, but instead
262  * just forget any excess entries.
263  */
264  if (so->killedItems == NULL)
265  so->killedItems = (int *)
266  palloc(MaxIndexTuplesPerPage * sizeof(int));
267  if (so->numKilled < MaxIndexTuplesPerPage)
268  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269  }
270 
271  /*
272  * Now continue the scan.
273  */
274  res = _bt_next(scan, dir);
275  }
276 
277  /* If we have a tuple, return it ... */
278  if (res)
279  break;
280  /* ... otherwise see if we have more array keys to deal with */
281  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
282 
283  return res;
284 }
285 
286 /*
287  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288  */
289 int64
290 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291 {
292  BTScanOpaque so = (BTScanOpaque) scan->opaque;
293  int64 ntids = 0;
294  ItemPointer heapTid;
295 
296  /*
297  * If we have any array keys, initialize them.
298  */
299  if (so->numArrayKeys)
300  {
301  /* punt if we have any unsatisfiable array keys */
302  if (so->numArrayKeys < 0)
303  return ntids;
304 
305  _bt_start_array_keys(scan, ForwardScanDirection);
306  }
307 
308  /* This loop handles advancing to the next array elements, if any */
309  do
310  {
311  /* Fetch the first page & tuple */
312  if (_bt_first(scan, ForwardScanDirection))
313  {
314  /* Save tuple ID, and continue scanning */
315  heapTid = &scan->xs_heaptid;
316  tbm_add_tuples(tbm, heapTid, 1, false);
317  ntids++;
318 
319  for (;;)
320  {
321  /*
322  * Advance to next tuple within page. This is the same as the
323  * easy case in _bt_next().
324  */
325  if (++so->currPos.itemIndex > so->currPos.lastItem)
326  {
327  /* let _bt_next do the heavy lifting */
328  if (!_bt_next(scan, ForwardScanDirection))
329  break;
330  }
331 
332  /* Save tuple ID, and continue scanning */
333  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
334  tbm_add_tuples(tbm, heapTid, 1, false);
335  ntids++;
336  }
337  }
338  /* Now see if we have more array keys to deal with */
339  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
340 
341  return ntids;
342 }
343 
344 /*
345  * btbeginscan() -- start a scan on a btree index
346  */
347 IndexScanDesc
348 btbeginscan(Relation rel, int nkeys, int norderbys)
349 {
350  IndexScanDesc scan;
351  BTScanOpaque so;
352 
353  /* no order by operators allowed */
354  Assert(norderbys == 0);
355 
356  /* get the scan */
357  scan = RelationGetIndexScan(rel, nkeys, norderbys);
358 
359  /* allocate private workspace */
360  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
361  BTScanPosInvalidate(so->currPos);
362  BTScanPosInvalidate(so->markPos);
363  if (scan->numberOfKeys > 0)
364  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
365  else
366  so->keyData = NULL;
367 
368  so->arrayKeyData = NULL; /* assume no array keys for now */
369  so->numArrayKeys = 0;
370  so->arrayKeys = NULL;
371  so->arrayContext = NULL;
372 
373  so->killedItems = NULL; /* until needed */
374  so->numKilled = 0;
375 
376  /*
377  * We don't know yet whether the scan will be index-only, so we do not
378  * allocate the tuple workspace arrays until btrescan. However, we set up
379  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
380  */
381  so->currTuples = so->markTuples = NULL;
382 
383  scan->xs_itupdesc = RelationGetDescr(rel);
384 
385  scan->opaque = so;
386 
387  return scan;
388 }
389 
390 /*
391  * btrescan() -- rescan an index relation
392  */
393 void
394 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
395  ScanKey orderbys, int norderbys)
396 {
397  BTScanOpaque so = (BTScanOpaque) scan->opaque;
398 
399  /* we aren't holding any read locks, but gotta drop the pins */
400  if (BTScanPosIsValid(so->currPos))
401  {
402  /* Before leaving current page, deal with any killed items */
403  if (so->numKilled > 0)
404  _bt_killitems(scan);
405  BTScanPosUnpinIfPinned(so->currPos);
406  BTScanPosInvalidate(so->currPos);
407  }
408 
409  so->markItemIndex = -1;
410  so->arrayKeyCount = 0;
411  BTScanPosUnpinIfPinned(so->markPos);
412  BTScanPosInvalidate(so->markPos);
413 
414  /*
415  * Allocate tuple workspace arrays, if needed for an index-only scan and
416  * not already done in a previous rescan call. To save on palloc
417  * overhead, both workspaces are allocated as one palloc block; only this
418  * function and btendscan know that.
419  *
420  * NOTE: this data structure also makes it safe to return data from a
421  * "name" column, even though btree name_ops uses an underlying storage
422  * datatype of cstring. The risk there is that "name" is supposed to be
423  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
424  * However, since we only return data out of tuples sitting in the
425  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
426  * data out of the markTuples array --- running off the end of memory for
427  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
428  * adding special-case treatment for name_ops elsewhere.
429  */
430  if (scan->xs_want_itup && so->currTuples == NULL)
431  {
432  so->currTuples = (char *) palloc(BLCKSZ * 2);
433  so->markTuples = so->currTuples + BLCKSZ;
434  }
435 
436  /*
437  * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
438  * - vadim 05/05/97
439  */
440  if (scankey && scan->numberOfKeys > 0)
441  memmove(scan->keyData,
442  scankey,
443  scan->numberOfKeys * sizeof(ScanKeyData));
444  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
445 
446  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
447  _bt_preprocess_array_keys(scan);
448 }
449 
450 /*
451  * btendscan() -- close down a scan
452  */
453 void
454 btendscan(IndexScanDesc scan)
455 {
456  BTScanOpaque so = (BTScanOpaque) scan->opaque;
457 
458  /* we aren't holding any read locks, but gotta drop the pins */
459  if (BTScanPosIsValid(so->currPos))
460  {
461  /* Before leaving current page, deal with any killed items */
462  if (so->numKilled > 0)
463  _bt_killitems(scan);
464  BTScanPosUnpinIfPinned(so->currPos);
465  }
466 
467  so->markItemIndex = -1;
468  BTScanPosUnpinIfPinned(so->markPos);
469 
470  /* No need to invalidate positions, the RAM is about to be freed. */
471 
472  /* Release storage */
473  if (so->keyData != NULL)
474  pfree(so->keyData);
475  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
476  if (so->arrayContext != NULL)
477  MemoryContextDelete(so->arrayContext);
478  if (so->killedItems != NULL)
479  pfree(so->killedItems);
480  if (so->currTuples != NULL)
481  pfree(so->currTuples);
482  /* so->markTuples should not be pfree'd, see btrescan */
483  pfree(so);
484 }
485 
486 /*
487  * btmarkpos() -- save current scan position
488  */
489 void
490 btmarkpos(IndexScanDesc scan)
491 {
492  BTScanOpaque so = (BTScanOpaque) scan->opaque;
493 
494  /* There may be an old mark with a pin (but no lock). */
495  BTScanPosUnpinIfPinned(so->markPos);
496 
497  /*
498  * Just record the current itemIndex. If we later step to next page
499  * before releasing the marked position, _bt_steppage makes a full copy of
500  * the currPos struct in markPos. If (as often happens) the mark is moved
501  * before we leave the page, we don't have to do that work.
502  */
503  if (BTScanPosIsValid(so->currPos))
504  so->markItemIndex = so->currPos.itemIndex;
505  else
506  {
507  BTScanPosInvalidate(so->markPos);
508  so->markItemIndex = -1;
509  }
510 
511  /* Also record the current positions of any array keys */
512  if (so->numArrayKeys)
513  _bt_mark_array_keys(scan);
514 }
515 
516 /*
517  * btrestrpos() -- restore scan to last saved position
518  */
519 void
520 btrestrpos(IndexScanDesc scan)
521 {
522  BTScanOpaque so = (BTScanOpaque) scan->opaque;
523 
524  /* Restore the marked positions of any array keys */
525  if (so->numArrayKeys)
526  _bt_restore_array_keys(scan);
527 
528  if (so->markItemIndex >= 0)
529  {
530  /*
531  * The scan has never moved to a new page since the last mark. Just
532  * restore the itemIndex.
533  *
534  * NB: In this case we can't count on anything in so->markPos to be
535  * accurate.
536  */
537  so->currPos.itemIndex = so->markItemIndex;
538  }
539  else
540  {
541  /*
542  * The scan moved to a new page after last mark or restore, and we are
543  * now restoring to the marked page. We aren't holding any read
544  * locks, but if we're still holding the pin for the current position,
545  * we must drop it.
546  */
547  if (BTScanPosIsValid(so->currPos))
548  {
549  /* Before leaving current page, deal with any killed items */
550  if (so->numKilled > 0)
551  _bt_killitems(scan);
552  BTScanPosUnpinIfPinned(so->currPos);
553  }
554 
555  if (BTScanPosIsValid(so->markPos))
556  {
557  /* bump pin on mark buffer for assignment to current buffer */
558  if (BTScanPosIsPinned(so->markPos))
559  IncrBufferRefCount(so->markPos.buf);
560  memcpy(&so->currPos, &so->markPos,
561  offsetof(BTScanPosData, items[1]) +
562  so->markPos.lastItem * sizeof(BTScanPosItem));
563  if (so->currTuples)
564  memcpy(so->currTuples, so->markTuples,
565  so->markPos.nextTupleOffset);
566  }
567  else
568  BTScanPosInvalidate(so->currPos);
569  }
570 }
571 
572 /*
573  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
574  */
575 Size
576 btestimateparallelscan(void)
577 {
578  return sizeof(BTParallelScanDescData);
579 }
580 
581 /*
582  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
583  */
584 void
585 btinitparallelscan(void *target)
586 {
587  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
588 
589  SpinLockInit(&bt_target->btps_mutex);
590  bt_target->btps_scanPage = InvalidBlockNumber;
591  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
592  bt_target->btps_arrayKeyCount = 0;
593  ConditionVariableInit(&bt_target->btps_cv);
594 }
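/*
 * Added note (not in nbtree.c): btestimateparallelscan() and
 * btinitparallelscan() are reached through the amestimateparallelscan and
 * aminitparallelscan callbacks installed by bthandler(); the executor uses
 * them to size and initialize the btree-specific part of the shared
 * parallel-scan state, and btparallelrescan() below resets that state when
 * the scan is restarted.
 */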
595 
596 /*
597  * btparallelrescan() -- reset parallel scan
598  */
599 void
600 btparallelrescan(IndexScanDesc scan)
601 {
602  BTParallelScanDesc btscan;
603  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
604 
605  Assert(parallel_scan);
606 
607  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
608  parallel_scan->ps_offset);
609 
610  /*
611  * In theory, we don't need to acquire the spinlock here, because there
612  * shouldn't be any other workers running at this point, but we do so for
613  * consistency.
614  */
615  SpinLockAcquire(&btscan->btps_mutex);
616  btscan->btps_scanPage = InvalidBlockNumber;
617  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
618  btscan->btps_arrayKeyCount = 0;
619  SpinLockRelease(&btscan->btps_mutex);
620 }
621 
622 /*
623  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
624  * page. Other scans must wait until we call _bt_parallel_release()
625  * or _bt_parallel_done().
626  *
627  * The return value is true if we successfully seized the scan and false
628  * if we did not. The latter case occurs if no pages remain for the current
629  * set of scankeys.
630  *
631  * If the return value is true, *pageno returns the next or current page
632  * of the scan (depending on the scan direction). An invalid block number
633  * means the scan hasn't yet started, and P_NONE means we've reached the end.
634  * The first time a participating process reaches the last page, it will return
635  * true and set *pageno to P_NONE; after that, further attempts to seize the
636  * scan will return false.
637  *
638  * Callers should ignore the value of pageno if the return value is false.
639  */
640 bool
641 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
642 {
643  BTScanOpaque so = (BTScanOpaque) scan->opaque;
644  BTPS_State pageStatus;
645  bool exit_loop = false;
646  bool status = true;
647  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
648  BTParallelScanDesc btscan;
649 
650  *pageno = P_NONE;
651 
652  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
653  parallel_scan->ps_offset);
654 
655  while (1)
656  {
657  SpinLockAcquire(&btscan->btps_mutex);
658  pageStatus = btscan->btps_pageStatus;
659 
660  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
661  {
662  /* Parallel scan has already advanced to a new set of scankeys. */
663  status = false;
664  }
665  else if (pageStatus == BTPARALLEL_DONE)
666  {
667  /*
668  * We're done with this set of scankeys. This may be the end, or
669  * there could be more sets to try.
670  */
671  status = false;
672  }
673  else if (pageStatus != BTPARALLEL_ADVANCING)
674  {
675  /*
676  * We have successfully seized control of the scan for the purpose
677  * of advancing it to a new page!
678  */
679  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
680  *pageno = btscan->btps_scanPage;
681  exit_loop = true;
682  }
683  SpinLockRelease(&btscan->btps_mutex);
684  if (exit_loop || !status)
685  break;
686  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
687  }
688  ConditionVariableCancelSleep();
689 
690  return status;
691 }
692 
693 /*
694  * _bt_parallel_release() -- Complete the process of advancing the scan to a
695  * new page. We now have the new value btps_scanPage; some other backend
696  * can now begin advancing the scan.
697  */
698 void
699 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
700 {
701  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
702  BTParallelScanDesc btscan;
703 
704  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
705  parallel_scan->ps_offset);
706 
707  SpinLockAcquire(&btscan->btps_mutex);
708  btscan->btps_scanPage = scan_page;
709  btscan->btps_pageStatus = BTPARALLEL_IDLE;
710  SpinLockRelease(&btscan->btps_mutex);
711  ConditionVariableSignal(&btscan->btps_cv);
712 }
713 
714 /*
715  * _bt_parallel_done() -- Mark the parallel scan as complete.
716  *
717  * When there are no pages left to scan, this function should be called to
718  * notify other workers. Otherwise, they might wait forever for the scan to
719  * advance to the next page.
720  */
721 void
722 _bt_parallel_done(IndexScanDesc scan)
723 {
724  BTScanOpaque so = (BTScanOpaque) scan->opaque;
725  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
726  BTParallelScanDesc btscan;
727  bool status_changed = false;
728 
729  /* Do nothing, for non-parallel scans */
730  if (parallel_scan == NULL)
731  return;
732 
733  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
734  parallel_scan->ps_offset);
735 
736  /*
737  * Mark the parallel scan as done for this combination of scan keys,
738  * unless some other process already did so. See also
739  * _bt_advance_array_keys.
740  */
741  SpinLockAcquire(&btscan->btps_mutex);
742  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
743  btscan->btps_pageStatus != BTPARALLEL_DONE)
744  {
745  btscan->btps_pageStatus = BTPARALLEL_DONE;
746  status_changed = true;
747  }
748  SpinLockRelease(&btscan->btps_mutex);
749 
750  /* wake up all the workers associated with this parallel scan */
751  if (status_changed)
752  ConditionVariableBroadcast(&btscan->btps_cv);
753 }
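/*
 * Added sketch (not in nbtree.c) of how the parallel-scan protocol above is
 * typically driven; the actual callers live in nbtsearch.c, and "next_blkno"
 * is only a placeholder here:
 *
 *   if (!_bt_parallel_seize(scan, &blkno))
 *       ... nothing left for the current set of array keys ...
 *   else if (blkno == P_NONE)
 *       _bt_parallel_done(scan);                 -- scan finished, wake waiters
 *   else
 *   {
 *       ... read and process page blkno ...
 *       _bt_parallel_release(scan, next_blkno);  -- let another worker advance
 *   }
 */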
754 
755 /*
756  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
757  * keys.
758  *
759  * Updates the count of array keys processed for both local and parallel
760  * scans.
761  */
762 void
763 _bt_parallel_advance_array_keys(IndexScanDesc scan)
764 {
765  BTScanOpaque so = (BTScanOpaque) scan->opaque;
766  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
767  BTParallelScanDesc btscan;
768 
769  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
770  parallel_scan->ps_offset);
771 
772  so->arrayKeyCount++;
773  SpinLockAcquire(&btscan->btps_mutex);
774  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
775  {
776  btscan->btps_scanPage = InvalidBlockNumber;
777  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
778  btscan->btps_arrayKeyCount++;
779  }
780  SpinLockRelease(&btscan->btps_mutex);
781 }
782 
783 /*
784  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
785  * btbulkdelete() wasn't called.
786  */
787 static bool
788 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
789 {
790  Buffer metabuf;
791  Page metapg;
792  BTMetaPageData *metad;
793  bool result = false;
794 
795  metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
796  metapg = BufferGetPage(metabuf);
797  metad = BTPageGetMeta(metapg);
798 
799  if (metad->btm_version < BTREE_NOVAC_VERSION)
800  {
801  /*
802  * Do cleanup if metapage needs upgrade, because we don't have
803  * cleanup-related meta-information yet.
804  */
805  result = true;
806  }
807  else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
808  TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
809  RecentGlobalXmin))
810  {
811  /*
812  * If oldest btpo.xact in the deleted pages is older than
813  * RecentGlobalXmin, then at least one deleted page can be recycled.
814  */
815  result = true;
816  }
817  else
818  {
819  StdRdOptions *relopts;
820  float8 cleanup_scale_factor;
821  float8 prev_num_heap_tuples;
822 
823  /*
824  * If table receives enough insertions and no cleanup was performed,
825  * then the index would appear to have stale statistics. If scale factor is
826  * set, we avoid that by performing cleanup if the number of inserted
827  * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
828  * original tuples count.
829  */
830  relopts = (StdRdOptions *) info->index->rd_options;
831  cleanup_scale_factor = (relopts &&
832  relopts->vacuum_cleanup_index_scale_factor >= 0)
833  ? relopts->vacuum_cleanup_index_scale_factor
834  : vacuum_cleanup_index_scale_factor;
835  prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
836 
837  if (cleanup_scale_factor <= 0 ||
838  prev_num_heap_tuples <= 0 ||
839  (info->num_heap_tuples - prev_num_heap_tuples) /
840  prev_num_heap_tuples >= cleanup_scale_factor)
841  result = true;
842  }
843 
844  _bt_relbuf(info->index, metabuf);
845  return result;
846 }
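/*
 * Added worked example (illustrative numbers, not in nbtree.c): with
 * vacuum_cleanup_index_scale_factor = 0.1 and a previous
 * btm_last_cleanup_num_heap_tuples of 1,000,000, the final test above
 *
 *   (info->num_heap_tuples - prev_num_heap_tuples) / prev_num_heap_tuples
 *       >= cleanup_scale_factor
 *
 * first becomes true at num_heap_tuples = 1,100,000, i.e. once the table has
 * grown by 10% since the last cleanup, at which point a cleanup-only scan is
 * triggered to refresh the index statistics.
 */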
847 
848 /*
849  * Bulk deletion of all index entries pointing to a set of heap tuples.
850  * The set of target tuples is specified via a callback routine that tells
851  * whether any given heap tuple (identified by ItemPointer) is being deleted.
852  *
853  * Result: a palloc'd struct containing statistical info for VACUUM displays.
854  */
855 IndexBulkDeleteResult *
856 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
857  IndexBulkDeleteCallback callback, void *callback_state)
858 {
859  Relation rel = info->index;
860  BTCycleId cycleid;
861 
862  /* allocate stats if first time through, else re-use existing struct */
863  if (stats == NULL)
864  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
865 
866  /* Establish the vacuum cycle ID to use for this scan */
867  /* The ENSURE stuff ensures we clean up shared memory on failure */
868  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
869  {
870  TransactionId oldestBtpoXact;
871 
872  cycleid = _bt_start_vacuum(rel);
873 
874  btvacuumscan(info, stats, callback, callback_state, cycleid,
875  &oldestBtpoXact);
876 
877  /*
878  * Update cleanup-related information in metapage. This information is
879  * used only for cleanup but keeping them up to date can avoid
880  * unnecessary cleanup even after bulkdelete.
881  */
882  _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
883  info->num_heap_tuples);
884  }
885  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
886  _bt_end_vacuum(rel);
887 
888  return stats;
889 }
890 
891 /*
892  * Post-VACUUM cleanup.
893  *
894  * Result: a palloc'd struct containing statistical info for VACUUM displays.
895  */
896 IndexBulkDeleteResult *
897 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
898 {
899  /* No-op in ANALYZE ONLY mode */
900  if (info->analyze_only)
901  return stats;
902 
903  /*
904  * If btbulkdelete was called, we need not do anything, just return the
905  * stats from the latest btbulkdelete call. If it wasn't called, we might
906  * still need to do a pass over the index, to recycle any newly-recyclable
907  * pages or to obtain index statistics. _bt_vacuum_needs_cleanup
908  * determines if either are needed.
909  *
910  * Since we aren't going to actually delete any leaf items, there's no
911  * need to go through all the vacuum-cycle-ID pushups.
912  */
913  if (stats == NULL)
914  {
915  TransactionId oldestBtpoXact;
916 
917  /* Check if we need a cleanup */
918  if (!_bt_vacuum_needs_cleanup(info))
919  return NULL;
920 
922  btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
923 
924  /* Update cleanup-related information in the metapage */
925  _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
926  info->num_heap_tuples);
927  }
928 
929  /*
930  * It's quite possible for us to be fooled by concurrent page splits into
931  * double-counting some index tuples, so disbelieve any total that exceeds
932  * the underlying heap's count ... if we know that accurately. Otherwise
933  * this might just make matters worse.
934  */
935  if (!info->estimated_count)
936  {
937  if (stats->num_index_tuples > info->num_heap_tuples)
938  stats->num_index_tuples = info->num_heap_tuples;
939  }
940 
941  return stats;
942 }
943 
944 /*
945  * btvacuumscan --- scan the index for VACUUMing purposes
946  *
947  * This combines the functions of looking for leaf tuples that are deletable
948  * according to the vacuum callback, looking for empty pages that can be
949  * deleted, and looking for old deleted pages that can be recycled. Both
950  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
951  * btbulkdelete call occurred).
952  *
953  * The caller is responsible for initially allocating/zeroing a stats struct
954  * and for obtaining a vacuum cycle ID if necessary.
955  */
956 static void
957 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
958  IndexBulkDeleteCallback callback, void *callback_state,
959  BTCycleId cycleid, TransactionId *oldestBtpoXact)
960 {
961  Relation rel = info->index;
962  BTVacState vstate;
963  BlockNumber num_pages;
964  BlockNumber blkno;
965  bool needLock;
966 
967  /*
968  * Reset counts that will be incremented during the scan; needed in case
969  * of multiple scans during a single VACUUM command
970  */
971  stats->estimated_count = false;
972  stats->num_index_tuples = 0;
973  stats->pages_deleted = 0;
974 
975  /* Set up info to pass down to btvacuumpage */
976  vstate.info = info;
977  vstate.stats = stats;
978  vstate.callback = callback;
979  vstate.callback_state = callback_state;
980  vstate.cycleid = cycleid;
981  vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
982  vstate.lastBlockLocked = BTREE_METAPAGE;
983  vstate.totFreePages = 0;
984  vstate.oldestBtpoXact = InvalidTransactionId;
985 
986  /* Create a temporary memory context to run _bt_pagedel in */
988  "_bt_pagedel",
990 
991  /*
992  * The outer loop iterates over all index pages except the metapage, in
993  * physical order (we hope the kernel will cooperate in providing
994  * read-ahead for speed). It is critical that we visit all leaf pages,
995  * including ones added after we start the scan, else we might fail to
996  * delete some deletable tuples. Hence, we must repeatedly check the
997  * relation length. We must acquire the relation-extension lock while
998  * doing so to avoid a race condition: if someone else is extending the
999  * relation, there is a window where bufmgr/smgr have created a new
1000  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1001  * we manage to scan such a page here, we'll improperly assume it can be
1002  * recycled. Taking the lock synchronizes things enough to prevent a
1003  * problem: either num_pages won't include the new page, or _bt_getbuf
1004  * already has write lock on the buffer and it will be fully initialized
1005  * before we can examine it. (See also vacuumlazy.c, which has the same
1006  * issue.) Also, we need not worry if a page is added immediately after
1007  * we look; the page splitting code already has write-lock on the left
1008  * page before it adds a right page, so we must already have processed any
1009  * tuples due to be moved into such a page.
1010  *
1011  * We can skip locking for new or temp relations, however, since no one
1012  * else could be accessing them.
1013  */
1014  needLock = !RELATION_IS_LOCAL(rel);
1015 
1016  blkno = BTREE_METAPAGE + 1;
1017  for (;;)
1018  {
1019  /* Get the current relation length */
1020  if (needLock)
1021  LockRelationForExtension(rel, ExclusiveLock);
1022  num_pages = RelationGetNumberOfBlocks(rel);
1023  if (needLock)
1024  UnlockRelationForExtension(rel, ExclusiveLock);
1025 
1026  if (info->report_progress)
1027  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1028  num_pages);
1029 
1030  /* Quit if we've scanned the whole relation */
1031  if (blkno >= num_pages)
1032  break;
1033  /* Iterate over pages, then loop back to recheck length */
1034  for (; blkno < num_pages; blkno++)
1035  {
1036  btvacuumpage(&vstate, blkno, blkno);
1037  if (info->report_progress)
1038  pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1039  blkno);
1040  }
1041  }
1042 
1043  /*
1044  * Check to see if we need to issue one final WAL record for this index,
1045  * which may be needed for correctness on a hot standby node when non-MVCC
1046  * index scans could take place.
1047  *
1048  * If the WAL is replayed in hot standby, the replay process needs to get
1049  * cleanup locks on all index leaf pages, just as we've been doing here.
1050  * However, we won't issue any WAL records about pages that have no items
1051  * to be deleted. For pages between pages we've vacuumed, the replay code
1052  * will take locks under the direction of the lastBlockVacuumed fields in
1053  * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1054  * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1055  * against the last leaf page in the index, if that one wasn't vacuumed.
1056  */
1057  if (XLogStandbyInfoActive() &&
1058  vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1059  {
1060  Buffer buf;
1061 
1062  /*
1063  * The page should be valid, but we can't use _bt_getbuf() because we
1064  * want to use a nondefault buffer access strategy. Since we aren't
1065  * going to delete any items, getting cleanup lock again is probably
1066  * overkill, but for consistency do that anyway.
1067  */
1068  buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1069  RBM_NORMAL, info->strategy);
1070  LockBufferForCleanup(buf);
1071  _bt_checkpage(rel, buf);
1072  _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1073  _bt_relbuf(rel, buf);
1074  }
1075 
1076  MemoryContextDelete(vstate.pagedelcontext);
1077 
1078  /*
1079  * If we found any recyclable pages (and recorded them in the FSM), then
1080  * forcibly update the upper-level FSM pages to ensure that searchers can
1081  * find them. It's possible that the pages were also found during
1082  * previous scans and so this is a waste of time, but it's cheap enough
1083  * relative to scanning the index that it shouldn't matter much, and
1084  * making sure that free pages are available sooner not later seems
1085  * worthwhile.
1086  *
1087  * Note that if no recyclable pages exist, we don't bother vacuuming the
1088  * FSM at all.
1089  */
1090  if (vstate.totFreePages > 0)
1091  IndexFreeSpaceMapVacuum(rel);
1092 
1093  /* update statistics */
1094  stats->num_pages = num_pages;
1095  stats->pages_free = vstate.totFreePages;
1096 
1097  if (oldestBtpoXact)
1098  *oldestBtpoXact = vstate.oldestBtpoXact;
1099 }
1100 
1101 /*
1102  * btvacuumpage --- VACUUM one page
1103  *
1104  * This processes a single page for btvacuumscan(). In some cases we
1105  * must go back and re-examine previously-scanned pages; this routine
1106  * recurses when necessary to handle that case.
1107  *
1108  * blkno is the page to process. orig_blkno is the highest block number
1109  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1110  * are recursing to re-examine a previous page).
1111  */
1112 static void
1113 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1114 {
1115  IndexVacuumInfo *info = vstate->info;
1116  IndexBulkDeleteResult *stats = vstate->stats;
1117  IndexBulkDeleteCallback callback = vstate->callback;
1118  void *callback_state = vstate->callback_state;
1119  Relation rel = info->index;
1120  bool delete_now;
1121  BlockNumber recurse_to;
1122  Buffer buf;
1123  Page page;
1124  BTPageOpaque opaque = NULL;
1125 
1126 restart:
1127  delete_now = false;
1128  recurse_to = P_NONE;
1129 
1130  /* call vacuum_delay_point while not holding any buffer lock */
1131  vacuum_delay_point();
1132 
1133  /*
1134  * We can't use _bt_getbuf() here because it always applies
1135  * _bt_checkpage(), which will barf on an all-zero page. We want to
1136  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1137  * buffer access strategy.
1138  */
1139  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1140  info->strategy);
1141  LockBuffer(buf, BT_READ);
1142  page = BufferGetPage(buf);
1143  if (!PageIsNew(page))
1144  {
1145  _bt_checkpage(rel, buf);
1146  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1147  }
1148 
1149  /*
1150  * If we are recursing, the only case we want to do anything with is a
1151  * live leaf page having the current vacuum cycle ID. Any other state
1152  * implies we already saw the page (eg, deleted it as being empty).
1153  */
1154  if (blkno != orig_blkno)
1155  {
1156  if (_bt_page_recyclable(page) ||
1157  P_IGNORE(opaque) ||
1158  !P_ISLEAF(opaque) ||
1159  opaque->btpo_cycleid != vstate->cycleid)
1160  {
1161  _bt_relbuf(rel, buf);
1162  return;
1163  }
1164  }
1165 
1166  /* Page is valid, see what to do with it */
1167  if (_bt_page_recyclable(page))
1168  {
1169  /* Okay to recycle this page */
1170  RecordFreeIndexPage(rel, blkno);
1171  vstate->totFreePages++;
1172  stats->pages_deleted++;
1173  }
1174  else if (P_ISDELETED(opaque))
1175  {
1176  /* Already deleted, but can't recycle yet */
1177  stats->pages_deleted++;
1178 
1179  /* Update the oldest btpo.xact */
1180  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1181  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1182  vstate->oldestBtpoXact = opaque->btpo.xact;
1183  }
1184  else if (P_ISHALFDEAD(opaque))
1185  {
1186  /* Half-dead, try to delete */
1187  delete_now = true;
1188  }
1189  else if (P_ISLEAF(opaque))
1190  {
1191  OffsetNumber deletable[MaxOffsetNumber];
1192  int ndeletable;
1193  OffsetNumber offnum,
1194  minoff,
1195  maxoff;
1196 
1197  /*
1198  * Trade in the initial read lock for a super-exclusive write lock on
1199  * this page. We must get such a lock on every leaf page over the
1200  * course of the vacuum scan, whether or not it actually contains any
1201  * deletable tuples --- see nbtree/README.
1202  */
1203  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1204  LockBufferForCleanup(buf);
1205 
1206  /*
1207  * Remember highest leaf page number we've taken cleanup lock on; see
1208  * notes in btvacuumscan
1209  */
1210  if (blkno > vstate->lastBlockLocked)
1211  vstate->lastBlockLocked = blkno;
1212 
1213  /*
1214  * Check whether we need to recurse back to earlier pages. What we
1215  * are concerned about is a page split that happened since we started
1216  * the vacuum scan. If the split moved some tuples to a lower page
1217  * then we might have missed 'em. If so, set up for tail recursion.
1218  * (Must do this before possibly clearing btpo_cycleid below!)
1219  */
1220  if (vstate->cycleid != 0 &&
1221  opaque->btpo_cycleid == vstate->cycleid &&
1222  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1223  !P_RIGHTMOST(opaque) &&
1224  opaque->btpo_next < orig_blkno)
1225  recurse_to = opaque->btpo_next;
1226 
1227  /*
1228  * Scan over all items to see which ones need to be deleted according to the
1229  * callback function.
1230  */
1231  ndeletable = 0;
1232  minoff = P_FIRSTDATAKEY(opaque);
1233  maxoff = PageGetMaxOffsetNumber(page);
1234  if (callback)
1235  {
1236  for (offnum = minoff;
1237  offnum <= maxoff;
1238  offnum = OffsetNumberNext(offnum))
1239  {
1240  IndexTuple itup;
1241  ItemPointer htup;
1242 
1243  itup = (IndexTuple) PageGetItem(page,
1244  PageGetItemId(page, offnum));
1245  htup = &(itup->t_tid);
1246 
1247  /*
1248  * During Hot Standby we currently assume that
1249  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1250  * only true as long as the callback function depends only
1251  * upon whether the index tuple refers to heap tuples removed
1252  * in the initial heap scan. When vacuum starts it derives a
1253  * value of OldestXmin. Backends taking later snapshots could
1254  * have a RecentGlobalXmin with a later xid than the vacuum's
1255  * OldestXmin, so it is possible that row versions deleted
1256  * after OldestXmin could be marked as killed by other
1257  * backends. The callback function *could* look at the index
1258  * tuple state in isolation and decide to delete the index
1259  * tuple, though currently it does not. If it ever did, we
1260  * would need to reconsider whether XLOG_BTREE_VACUUM records
1261  * should cause conflicts. If they did cause conflicts they
1262  * would be fairly harsh conflicts, since we haven't yet
1263  * worked out a way to pass a useful value for
1264  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1265  * applies to *any* type of index that marks index tuples as
1266  * killed.
1267  */
1268  if (callback(htup, callback_state))
1269  deletable[ndeletable++] = offnum;
1270  }
1271  }
1272 
1273  /*
1274  * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1275  * call per page, so as to minimize WAL traffic.
1276  */
1277  if (ndeletable > 0)
1278  {
1279  /*
1280  * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1281  * all information to the replay code to allow it to get a cleanup
1282  * lock on all pages between the previous lastBlockVacuumed and
1283  * this page. This ensures that WAL replay locks all leaf pages at
1284  * some point, which is important should non-MVCC scans be
1285  * requested. This is currently unused on standby, but we record
1286  * it anyway, so that the WAL contains the required information.
1287  *
1288  * Since we can visit leaf pages out-of-order when recursing,
1289  * replay might end up locking such pages an extra time, but it
1290  * doesn't seem worth the amount of bookkeeping it'd take to avoid
1291  * that.
1292  */
1293  _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1294  vstate->lastBlockVacuumed);
1295 
1296  /*
1297  * Remember highest leaf page number we've issued a
1298  * XLOG_BTREE_VACUUM WAL record for.
1299  */
1300  if (blkno > vstate->lastBlockVacuumed)
1301  vstate->lastBlockVacuumed = blkno;
1302 
1303  stats->tuples_removed += ndeletable;
1304  /* must recompute maxoff */
1305  maxoff = PageGetMaxOffsetNumber(page);
1306  }
1307  else
1308  {
1309  /*
1310  * If the page has been split during this vacuum cycle, it seems
1311  * worth expending a write to clear btpo_cycleid even if we don't
1312  * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1313  * care of this.) This ensures we won't process the page again.
1314  *
1315  * We treat this like a hint-bit update because there's no need to
1316  * WAL-log it.
1317  */
1318  if (vstate->cycleid != 0 &&
1319  opaque->btpo_cycleid == vstate->cycleid)
1320  {
1321  opaque->btpo_cycleid = 0;
1322  MarkBufferDirtyHint(buf, true);
1323  }
1324  }
1325 
1326  /*
1327  * If it's now empty, try to delete; else count the live tuples. We
1328  * don't delete when recursing, though, to avoid putting entries into
1329  * freePages out-of-order (doesn't seem worth any extra code to handle
1330  * the case).
1331  */
1332  if (minoff > maxoff)
1333  delete_now = (blkno == orig_blkno);
1334  else
1335  stats->num_index_tuples += maxoff - minoff + 1;
1336  }
1337 
1338  if (delete_now)
1339  {
1340  MemoryContext oldcontext;
1341  int ndel;
1342 
1343  /* Run pagedel in a temp context to avoid memory leakage */
1344  MemoryContextReset(vstate->pagedelcontext);
1345  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1346 
1347  ndel = _bt_pagedel(rel, buf);
1348 
1349  /* count only this page, else may double-count parent */
1350  if (ndel)
1351  {
1352  stats->pages_deleted++;
1353  if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
1354  TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
1355  vstate->oldestBtpoXact = opaque->btpo.xact;
1356  }
1357 
1358  MemoryContextSwitchTo(oldcontext);
1359  /* pagedel released buffer, so we shouldn't */
1360  }
1361  else
1362  _bt_relbuf(rel, buf);
1363 
1364  /*
1365  * This is really tail recursion, but if the compiler is too stupid to
1366  * optimize it as such, we'd eat an uncomfortably large amount of stack
1367  * space per recursion level (due to the deletable[] array). A failure is
1368  * improbable since the number of levels isn't likely to be large ... but
1369  * just in case, let's hand-optimize into a loop.
1370  */
1371  if (recurse_to != P_NONE)
1372  {
1373  blkno = recurse_to;
1374  goto restart;
1375  }
1376 }
1377 
1378 /*
1379  * btcanreturn() -- Check whether btree indexes support index-only scans.
1380  *
1381  * btrees always do, so this is trivial.
1382  */
1383 bool
1384 btcanreturn(Relation index, int attno)
1385 {
1386  return true;
1387 }
Definition: nbtree.h:189