nbtree.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtree.c
4  * Implementation of Lehman and Yao's btree management algorithm for
5  * Postgres.
6  *
7  * NOTES
8  * This file contains only the public interface routines.
9  *
10  *
11  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  * IDENTIFICATION
15  * src/backend/access/nbtree/nbtree.c
16  *
17  *-------------------------------------------------------------------------
18  */
19 #include "postgres.h"
20 
21 #include "access/nbtree.h"
22 #include "access/relscan.h"
23 #include "access/xlog.h"
24 #include "catalog/index.h"
25 #include "commands/vacuum.h"
26 #include "pgstat.h"
 27 #include "storage/condition_variable.h"
28 #include "storage/indexfsm.h"
29 #include "storage/ipc.h"
30 #include "storage/lmgr.h"
31 #include "storage/smgr.h"
32 #include "tcop/tcopprot.h" /* pgrminclude ignore */
33 #include "utils/builtins.h"
34 #include "utils/index_selfuncs.h"
35 #include "utils/memutils.h"
36 
37 
38 /* Working state for btbuild and its callback */
39 typedef struct
40 {
41  bool isUnique;
42  bool haveDead;
 43  Relation heapRel;
 44  BTSpool *spool;
45 
46  /*
47  * spool2 is needed only when the index is a unique index. Dead tuples are
48  * put into spool2 instead of spool in order to avoid uniqueness check.
49  */
 50  BTSpool *spool2;
51  double indtuples;
52 } BTBuildState;
53 
54 /* Working state needed by btvacuumpage */
55 typedef struct
56 {
 57  IndexVacuumInfo *info;
 58  IndexBulkDeleteResult *stats;
 59  IndexBulkDeleteCallback callback;
 60  void *callback_state;
 61  BTCycleId cycleid;
62  BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
63  BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
64  BlockNumber totFreePages; /* true total # of free pages */
 65  MemoryContext pagedelcontext;
66 } BTVacState;
67 
68 /*
69  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
70  *
71  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
72  * a new page; others must wait.
73  *
74  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
75  * to a new page; some process can start doing that.
76  *
77  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
78  * We reach this state once for every distinct combination of array keys.
79  */
80 typedef enum
81 {
 82  BTPARALLEL_NOT_INITIALIZED,
 83  BTPARALLEL_ADVANCING,
 84  BTPARALLEL_IDLE,
 85  BTPARALLEL_DONE
86 } BTPS_State;
87 
88 /*
89  * BTParallelScanDescData contains btree specific shared information required
90  * for parallel scan.
91  */
92 typedef struct BTParallelScanDescData
93 {
94  BlockNumber btps_scanPage; /* latest or next page to be scanned */
95  BTPS_State btps_pageStatus; /* indicates whether next page is
96  * available for scan. see above for
97  * possible states of parallel scan. */
98  int btps_arrayKeyCount; /* count indicating number of array scan
99  * keys processed by parallel scan */
100  slock_t btps_mutex; /* protects above variables */
101  ConditionVariable btps_cv; /* used to synchronize parallel scan */
 102 } BTParallelScanDescData;
103 
 104 typedef struct BTParallelScanDescData *BTParallelScanDesc;
105 
106 
107 static void btbuildCallback(Relation index,
108  HeapTuple htup,
109  Datum *values,
110  bool *isnull,
111  bool tupleIsAlive,
112  void *state);
113 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
114  IndexBulkDeleteCallback callback, void *callback_state,
115  BTCycleId cycleid);
116 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
117  BlockNumber orig_blkno);
118 
119 
120 /*
121  * Btree handler function: return IndexAmRoutine with access method parameters
122  * and callbacks.
123  */
124 Datum
 125 bthandler(PG_FUNCTION_ARGS)
126 {
127  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
128 
129  amroutine->amstrategies = BTMaxStrategyNumber;
130  amroutine->amsupport = BTNProcs;
131  amroutine->amcanorder = true;
132  amroutine->amcanorderbyop = false;
133  amroutine->amcanbackward = true;
134  amroutine->amcanunique = true;
135  amroutine->amcanmulticol = true;
136  amroutine->amoptionalkey = true;
137  amroutine->amsearcharray = true;
138  amroutine->amsearchnulls = true;
139  amroutine->amstorage = false;
140  amroutine->amclusterable = true;
141  amroutine->ampredlocks = true;
142  amroutine->amcanparallel = true;
143  amroutine->amkeytype = InvalidOid;
144 
145  amroutine->ambuild = btbuild;
146  amroutine->ambuildempty = btbuildempty;
147  amroutine->aminsert = btinsert;
148  amroutine->ambulkdelete = btbulkdelete;
149  amroutine->amvacuumcleanup = btvacuumcleanup;
150  amroutine->amcanreturn = btcanreturn;
151  amroutine->amcostestimate = btcostestimate;
152  amroutine->amoptions = btoptions;
153  amroutine->amproperty = btproperty;
154  amroutine->amvalidate = btvalidate;
155  amroutine->ambeginscan = btbeginscan;
156  amroutine->amrescan = btrescan;
157  amroutine->amgettuple = btgettuple;
158  amroutine->amgetbitmap = btgetbitmap;
159  amroutine->amendscan = btendscan;
160  amroutine->ammarkpos = btmarkpos;
161  amroutine->amrestrpos = btrestrpos;
 162  amroutine->amestimateparallelscan = btestimateparallelscan;
 163  amroutine->aminitparallelscan = btinitparallelscan;
164  amroutine->amparallelrescan = btparallelrescan;
165 
166  PG_RETURN_POINTER(amroutine);
167 }
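A minimal caller-side sketch (not part of nbtree.c; it assumes "access/amapi.h" is included) of how the IndexAmRoutine returned by bthandler() is typically consumed. Core code reaches the handler through GetIndexAmRoutine() on pg_am.amhandler rather than by calling bthandler() directly.

/* Sketch only, under the assumptions stated above. */
static bool
index_am_supports_index_only_scans(Oid amhandler)
{
	/* GetIndexAmRoutine() invokes the handler (bthandler for btree) for us */
	IndexAmRoutine *amroutine = GetIndexAmRoutine(amhandler);

	/* an AM that can never return index tuples leaves amcanreturn NULL */
	return amroutine->amcanreturn != NULL;
}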
168 
169 /*
170  * btbuild() -- build a new btree index.
171  */
 172 IndexBuildResult *
 173 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
174 {
 175  IndexBuildResult *result;
176  double reltuples;
177  BTBuildState buildstate;
178 
179  buildstate.isUnique = indexInfo->ii_Unique;
180  buildstate.haveDead = false;
181  buildstate.heapRel = heap;
182  buildstate.spool = NULL;
183  buildstate.spool2 = NULL;
184  buildstate.indtuples = 0;
185 
186 #ifdef BTREE_BUILD_STATS
 187  if (log_btree_build_stats)
188  ResetUsage();
189 #endif /* BTREE_BUILD_STATS */
190 
191  /*
192  * We expect to be called exactly once for any index relation. If that's
193  * not the case, big trouble's what we have.
194  */
195  if (RelationGetNumberOfBlocks(index) != 0)
196  elog(ERROR, "index \"%s\" already contains data",
197  RelationGetRelationName(index));
198 
199  buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
200 
201  /*
202  * If building a unique index, put dead tuples in a second spool to keep
203  * them out of the uniqueness check.
204  */
205  if (indexInfo->ii_Unique)
206  buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
207 
208  /* do the heap scan */
209  reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
210  btbuildCallback, (void *) &buildstate);
211 
212  /* okay, all heap tuples are indexed */
213  if (buildstate.spool2 && !buildstate.haveDead)
214  {
215  /* spool2 turns out to be unnecessary */
216  _bt_spooldestroy(buildstate.spool2);
217  buildstate.spool2 = NULL;
218  }
219 
220  /*
221  * Finish the build by (1) completing the sort of the spool file, (2)
222  * inserting the sorted tuples into btree pages and (3) building the upper
223  * levels.
224  */
225  _bt_leafbuild(buildstate.spool, buildstate.spool2);
226  _bt_spooldestroy(buildstate.spool);
227  if (buildstate.spool2)
228  _bt_spooldestroy(buildstate.spool2);
229 
230 #ifdef BTREE_BUILD_STATS
 231  if (log_btree_build_stats)
232  {
233  ShowUsage("BTREE BUILD STATS");
234  ResetUsage();
235  }
236 #endif /* BTREE_BUILD_STATS */
237 
238  /*
239  * Return statistics
240  */
241  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
242 
243  result->heap_tuples = reltuples;
244  result->index_tuples = buildstate.indtuples;
245 
246  return result;
247 }
248 
249 /*
250  * Per-tuple callback from IndexBuildHeapScan
251  */
252 static void
 253 btbuildCallback(Relation index,
254  HeapTuple htup,
255  Datum *values,
256  bool *isnull,
257  bool tupleIsAlive,
258  void *state)
259 {
260  BTBuildState *buildstate = (BTBuildState *) state;
261 
262  /*
263  * insert the index tuple into the appropriate spool file for subsequent
264  * processing
265  */
266  if (tupleIsAlive || buildstate->spool2 == NULL)
267  _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
268  else
269  {
270  /* dead tuples are put into spool2 */
271  buildstate->haveDead = true;
272  _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
273  }
274 
275  buildstate->indtuples += 1;
276 }
277 
278 /*
279  * btbuildempty() -- build an empty btree index in the initialization fork
280  */
281 void
 282 btbuildempty(Relation index)
283 {
284  Page metapage;
285 
286  /* Construct metapage. */
287  metapage = (Page) palloc(BLCKSZ);
288  _bt_initmetapage(metapage, P_NONE, 0);
289 
290  /*
291  * Write the page and log it. It might seem that an immediate sync would
292  * be sufficient to guarantee that the file exists on disk, but recovery
293  * itself might remove it while replaying, for example, an
294  * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need
295  * this even when wal_level=minimal.
296  */
 297  PageSetChecksumInplace(metapage, BTREE_METAPAGE);
 298  smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
299  (char *) metapage, true);
 300  log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
301  BTREE_METAPAGE, metapage, false);
302 
303  /*
304  * An immediate sync is required even if we xlog'd the page, because the
305  * write did not go through shared_buffers and therefore a concurrent
306  * checkpoint may have moved the redo pointer past our xlog record.
307  */
 308  smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
309 }
310 
311 /*
312  * btinsert() -- insert an index tuple into a btree.
313  *
314  * Descend the tree recursively, find the appropriate location for our
315  * new tuple, and put it there.
316  */
317 bool
318 btinsert(Relation rel, Datum *values, bool *isnull,
319  ItemPointer ht_ctid, Relation heapRel,
320  IndexUniqueCheck checkUnique,
321  IndexInfo *indexInfo)
322 {
323  bool result;
324  IndexTuple itup;
325 
326  /* generate an index tuple */
327  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
328  itup->t_tid = *ht_ctid;
329 
330  result = _bt_doinsert(rel, itup, checkUnique, heapRel);
331 
332  pfree(itup);
333 
334  return result;
335 }
336 
337 /*
338  * btgettuple() -- Get the next tuple in the scan.
339  */
340 bool
 341 btgettuple(IndexScanDesc scan, ScanDirection dir)
342 {
343  BTScanOpaque so = (BTScanOpaque) scan->opaque;
344  bool res;
345 
346  /* btree indexes are never lossy */
347  scan->xs_recheck = false;
348 
349  /*
350  * If we have any array keys, initialize them during first call for a
351  * scan. We can't do this in btrescan because we don't know the scan
352  * direction at that time.
353  */
354  if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
355  {
356  /* punt if we have any unsatisfiable array keys */
357  if (so->numArrayKeys < 0)
358  return false;
359 
360  _bt_start_array_keys(scan, dir);
361  }
362 
363  /* This loop handles advancing to the next array elements, if any */
364  do
365  {
366  /*
367  * If we've already initialized this scan, we can just advance it in
368  * the appropriate direction. If we haven't done so yet, we call
369  * _bt_first() to get the first item in the scan.
370  */
371  if (!BTScanPosIsValid(so->currPos))
372  res = _bt_first(scan, dir);
373  else
374  {
375  /*
376  * Check to see if we should kill the previously-fetched tuple.
377  */
378  if (scan->kill_prior_tuple)
379  {
380  /*
381  * Yes, remember it for later. (We'll deal with all such
382  * tuples at once right before leaving the index page.) The
383  * test for numKilled overrun is not just paranoia: if the
384  * caller reverses direction in the indexscan then the same
385  * item might get entered multiple times. It's not worth
386  * trying to optimize that, so we don't detect it, but instead
387  * just forget any excess entries.
388  */
389  if (so->killedItems == NULL)
390  so->killedItems = (int *)
391  palloc(MaxIndexTuplesPerPage * sizeof(int));
 392  if (so->numKilled < MaxIndexTuplesPerPage)
393  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
394  }
395 
396  /*
397  * Now continue the scan.
398  */
399  res = _bt_next(scan, dir);
400  }
401 
402  /* If we have a tuple, return it ... */
403  if (res)
404  break;
405  /* ... otherwise see if we have more array keys to deal with */
406  } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
407 
408  return res;
409 }
410 
411 /*
412  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
413  */
414 int64
 415 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
416 {
417  BTScanOpaque so = (BTScanOpaque) scan->opaque;
418  int64 ntids = 0;
419  ItemPointer heapTid;
420 
421  /*
422  * If we have any array keys, initialize them.
423  */
424  if (so->numArrayKeys)
425  {
426  /* punt if we have any unsatisfiable array keys */
427  if (so->numArrayKeys < 0)
428  return ntids;
429 
 430  _bt_start_array_keys(scan, ForwardScanDirection);
431  }
432 
433  /* This loop handles advancing to the next array elements, if any */
434  do
435  {
436  /* Fetch the first page & tuple */
437  if (_bt_first(scan, ForwardScanDirection))
438  {
439  /* Save tuple ID, and continue scanning */
440  heapTid = &scan->xs_ctup.t_self;
441  tbm_add_tuples(tbm, heapTid, 1, false);
442  ntids++;
443 
444  for (;;)
445  {
446  /*
447  * Advance to next tuple within page. This is the same as the
448  * easy case in _bt_next().
449  */
450  if (++so->currPos.itemIndex > so->currPos.lastItem)
451  {
452  /* let _bt_next do the heavy lifting */
453  if (!_bt_next(scan, ForwardScanDirection))
454  break;
455  }
456 
457  /* Save tuple ID, and continue scanning */
458  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
459  tbm_add_tuples(tbm, heapTid, 1, false);
460  ntids++;
461  }
462  }
463  /* Now see if we have more array keys to deal with */
 464  } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
465 
466  return ntids;
467 }
468 
469 /*
470  * btbeginscan() -- start a scan on a btree index
471  */
 472 IndexScanDesc
473 btbeginscan(Relation rel, int nkeys, int norderbys)
474 {
475  IndexScanDesc scan;
476  BTScanOpaque so;
477 
478  /* no order by operators allowed */
479  Assert(norderbys == 0);
480 
481  /* get the scan */
482  scan = RelationGetIndexScan(rel, nkeys, norderbys);
483 
484  /* allocate private workspace */
485  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
 486  BTScanPosInvalidate(so->currPos);
 487  BTScanPosInvalidate(so->markPos);
488  if (scan->numberOfKeys > 0)
489  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
490  else
491  so->keyData = NULL;
492 
493  so->arrayKeyData = NULL; /* assume no array keys for now */
494  so->numArrayKeys = 0;
495  so->arrayKeys = NULL;
496  so->arrayContext = NULL;
497 
498  so->killedItems = NULL; /* until needed */
499  so->numKilled = 0;
500 
501  /*
502  * We don't know yet whether the scan will be index-only, so we do not
503  * allocate the tuple workspace arrays until btrescan. However, we set up
504  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
505  */
506  so->currTuples = so->markTuples = NULL;
507 
508  scan->xs_itupdesc = RelationGetDescr(rel);
509 
510  scan->opaque = so;
511 
512  return scan;
513 }
514 
515 /*
516  * btrescan() -- rescan an index relation
517  */
518 void
519 btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
520  ScanKey orderbys, int norderbys)
521 {
522  BTScanOpaque so = (BTScanOpaque) scan->opaque;
523 
524  /* we aren't holding any read locks, but gotta drop the pins */
525  if (BTScanPosIsValid(so->currPos))
526  {
527  /* Before leaving current page, deal with any killed items */
528  if (so->numKilled > 0)
529  _bt_killitems(scan);
 530  BTScanPosUnpinIfPinned(so->currPos);
 531  BTScanPosInvalidate(so->currPos);
532  }
533 
534  so->markItemIndex = -1;
535  so->arrayKeyCount = 0;
 536  BTScanPosUnpinIfPinned(so->markPos);
 537  BTScanPosInvalidate(so->markPos);
538 
539  /*
540  * Allocate tuple workspace arrays, if needed for an index-only scan and
541  * not already done in a previous rescan call. To save on palloc
542  * overhead, both workspaces are allocated as one palloc block; only this
543  * function and btendscan know that.
544  *
545  * NOTE: this data structure also makes it safe to return data from a
546  * "name" column, even though btree name_ops uses an underlying storage
547  * datatype of cstring. The risk there is that "name" is supposed to be
548  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
549  * However, since we only return data out of tuples sitting in the
550  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
551  * data out of the markTuples array --- running off the end of memory for
552  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
553  * adding special-case treatment for name_ops elsewhere.
554  */
555  if (scan->xs_want_itup && so->currTuples == NULL)
556  {
557  so->currTuples = (char *) palloc(BLCKSZ * 2);
558  so->markTuples = so->currTuples + BLCKSZ;
559  }
560 
561  /*
562  * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
563  * - vadim 05/05/97
564  */
565  if (scankey && scan->numberOfKeys > 0)
566  memmove(scan->keyData,
567  scankey,
568  scan->numberOfKeys * sizeof(ScanKeyData));
569  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
570 
571  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
 572  _bt_preprocess_array_keys(scan);
573 }
574 
575 /*
576  * btendscan() -- close down a scan
577  */
578 void
 579 btendscan(IndexScanDesc scan)
580 {
581  BTScanOpaque so = (BTScanOpaque) scan->opaque;
582 
583  /* we aren't holding any read locks, but gotta drop the pins */
584  if (BTScanPosIsValid(so->currPos))
585  {
586  /* Before leaving current page, deal with any killed items */
587  if (so->numKilled > 0)
588  _bt_killitems(scan);
 589  BTScanPosUnpinIfPinned(so->currPos);
590  }
591 
592  so->markItemIndex = -1;
 593  BTScanPosUnpinIfPinned(so->markPos);
594 
595  /* No need to invalidate positions, the RAM is about to be freed. */
596 
597  /* Release storage */
598  if (so->keyData != NULL)
599  pfree(so->keyData);
600  /* so->arrayKeyData and so->arrayKeys are in arrayContext */
601  if (so->arrayContext != NULL)
 602  MemoryContextDelete(so->arrayContext);
603  if (so->killedItems != NULL)
604  pfree(so->killedItems);
605  if (so->currTuples != NULL)
606  pfree(so->currTuples);
607  /* so->markTuples should not be pfree'd, see btrescan */
608  pfree(so);
609 }
610 
611 /*
612  * btmarkpos() -- save current scan position
613  */
614 void
 615 btmarkpos(IndexScanDesc scan)
616 {
617  BTScanOpaque so = (BTScanOpaque) scan->opaque;
618 
619  /* There may be an old mark with a pin (but no lock). */
 620  BTScanPosUnpinIfPinned(so->markPos);
621 
622  /*
623  * Just record the current itemIndex. If we later step to next page
624  * before releasing the marked position, _bt_steppage makes a full copy of
625  * the currPos struct in markPos. If (as often happens) the mark is moved
626  * before we leave the page, we don't have to do that work.
627  */
628  if (BTScanPosIsValid(so->currPos))
629  so->markItemIndex = so->currPos.itemIndex;
630  else
631  {
 632  BTScanPosInvalidate(so->markPos);
633  so->markItemIndex = -1;
634  }
635 
636  /* Also record the current positions of any array keys */
637  if (so->numArrayKeys)
638  _bt_mark_array_keys(scan);
639 }
640 
641 /*
642  * btrestrpos() -- restore scan to last saved position
643  */
644 void
 645 btrestrpos(IndexScanDesc scan)
646 {
647  BTScanOpaque so = (BTScanOpaque) scan->opaque;
648 
649  /* Restore the marked positions of any array keys */
650  if (so->numArrayKeys)
 651  _bt_restore_array_keys(scan);
652 
653  if (so->markItemIndex >= 0)
654  {
655  /*
656  * The scan has never moved to a new page since the last mark. Just
657  * restore the itemIndex.
658  *
659  * NB: In this case we can't count on anything in so->markPos to be
660  * accurate.
661  */
662  so->currPos.itemIndex = so->markItemIndex;
663  }
664  else
665  {
666  /*
667  * The scan moved to a new page after last mark or restore, and we are
668  * now restoring to the marked page. We aren't holding any read
669  * locks, but if we're still holding the pin for the current position,
670  * we must drop it.
671  */
672  if (BTScanPosIsValid(so->currPos))
673  {
674  /* Before leaving current page, deal with any killed items */
675  if (so->numKilled > 0)
676  _bt_killitems(scan);
 677  BTScanPosUnpinIfPinned(so->currPos);
678  }
679 
680  if (BTScanPosIsValid(so->markPos))
681  {
682  /* bump pin on mark buffer for assignment to current buffer */
683  if (BTScanPosIsPinned(so->markPos))
 684  IncrBufferRefCount(so->markPos.buf);
685  memcpy(&so->currPos, &so->markPos,
686  offsetof(BTScanPosData, items[1]) +
687  so->markPos.lastItem * sizeof(BTScanPosItem));
688  if (so->currTuples)
689  memcpy(so->currTuples, so->markTuples,
 690  so->markPos.nextTupleOffset);
691  }
692  else
 693  BTScanPosInvalidate(so->currPos);
694  }
695 }
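For orientation, a rough sketch (hypothetical caller, not part of nbtree.c; assumes "access/genam.h") of how the executor reaches btmarkpos()/btrestrpos() through the indexam.c wrappers during a merge-join-style rescan:

/* Sketch only, under the assumptions stated above. */
static void
mark_and_restore_example(IndexScanDesc scan)
{
	HeapTuple	tup;

	/* remember the current position; this ends up in btmarkpos() */
	index_markpos(scan);

	/* read ahead one tuple past the remembered position */
	tup = index_getnext(scan, ForwardScanDirection);
	(void) tup;

	/* rewind to the remembered position; this ends up in btrestrpos() */
	index_restrpos(scan);
}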
696 
697 /*
698  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
699  */
700 Size
 701 btestimateparallelscan(void)
702 {
703  return sizeof(BTParallelScanDescData);
704 }
705 
706 /*
707  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
708  */
709 void
710 btinitparallelscan(void *target)
711 {
712  BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
713 
714  SpinLockInit(&bt_target->btps_mutex);
715  bt_target->btps_scanPage = InvalidBlockNumber;
 716  bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
717  bt_target->btps_arrayKeyCount = 0;
718  ConditionVariableInit(&bt_target->btps_cv);
719 }
720 
721 /*
722  * btparallelrescan() -- reset parallel scan
723  */
724 void
 725 btparallelrescan(IndexScanDesc scan)
726 {
727  BTParallelScanDesc btscan;
728  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
729 
730  Assert(parallel_scan);
731 
732  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
733  parallel_scan->ps_offset);
734 
735  /*
736  * In theory, we don't need to acquire the spinlock here, because there
737  * shouldn't be any other workers running at this point, but we do so for
738  * consistency.
739  */
740  SpinLockAcquire(&btscan->btps_mutex);
 741  btscan->btps_scanPage = InvalidBlockNumber;
 742  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
743  btscan->btps_arrayKeyCount = 0;
744  SpinLockRelease(&btscan->btps_mutex);
745 }
746 
747 /*
748  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
749  * page. Other scans must wait until we call bt_parallel_release() or
750  * bt_parallel_done().
751  *
752  * The return value is true if we successfully seized the scan and false
753  * if we did not. The latter case occurs if no pages remain for the current
754  * set of scankeys.
755  *
756  * If the return value is true, *pageno returns the next or current page
757  * of the scan (depending on the scan direction). An invalid block number
758  * means the scan hasn't yet started, and P_NONE means we've reached the end.
759  * The first time a participating process reaches the last page, it will return
760  * true and set *pageno to P_NONE; after that, further attempts to seize the
761  * scan will return false.
762  *
763  * Callers should ignore the value of pageno if the return value is false.
764  */
765 bool
 766 _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
767 {
768  BTScanOpaque so = (BTScanOpaque) scan->opaque;
769  BTPS_State pageStatus;
770  bool exit_loop = false;
771  bool status = true;
772  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
773  BTParallelScanDesc btscan;
774 
775  *pageno = P_NONE;
776 
777  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
778  parallel_scan->ps_offset);
779 
780  while (1)
781  {
782  SpinLockAcquire(&btscan->btps_mutex);
783  pageStatus = btscan->btps_pageStatus;
784 
785  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
786  {
787  /* Parallel scan has already advanced to a new set of scankeys. */
788  status = false;
789  }
790  else if (pageStatus == BTPARALLEL_DONE)
791  {
792  /*
793  * We're done with this set of scankeys. This may be the end, or
794  * there could be more sets to try.
795  */
796  status = false;
797  }
798  else if (pageStatus != BTPARALLEL_ADVANCING)
799  {
800  /*
801  * We have successfully seized control of the scan for the purpose
802  * of advancing it to a new page!
803  */
804  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
805  *pageno = btscan->btps_scanPage;
806  exit_loop = true;
807  }
808  SpinLockRelease(&btscan->btps_mutex);
809  if (exit_loop || !status)
810  break;
 811   ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
812  }
 813  ConditionVariableCancelSleep();
814 
815  return status;
816 }
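A condensed sketch of the seize/release/done protocol from a participant's point of view (the real callers live in nbtsearch.c; read_page_and_find_next() below is a hypothetical helper standing in for that logic):

/* Sketch only, not part of nbtree.c. */
static bool
advance_parallel_scan(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next_blkno;

	/* block until no other worker is advancing the shared scan */
	if (!_bt_parallel_seize(scan, &blkno))
		return false;			/* nothing left for this set of array keys */

	if (blkno == P_NONE)
	{
		/* some worker already reached the end; make sure everyone stops */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * blkno == InvalidBlockNumber means the scan has not started yet and
	 * this worker must locate the starting leaf page itself.  Either way,
	 * the hypothetical helper reads the page and reports where the scan
	 * should continue.
	 */
	next_blkno = read_page_and_find_next(scan, blkno);

	/* publish the next page so another worker can advance the scan */
	_bt_parallel_release(scan, next_blkno);
	return true;
}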
817 
818 /*
819  * _bt_parallel_release() -- Complete the process of advancing the scan to a
820  * new page. We now have the new value btps_scanPage; some other backend
821  * can now begin advancing the scan.
822  */
823 void
 824 _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
825 {
826  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
827  BTParallelScanDesc btscan;
828 
829  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
830  parallel_scan->ps_offset);
831 
832  SpinLockAcquire(&btscan->btps_mutex);
833  btscan->btps_scanPage = scan_page;
 834  btscan->btps_pageStatus = BTPARALLEL_IDLE;
835  SpinLockRelease(&btscan->btps_mutex);
 836  ConditionVariableSignal(&btscan->btps_cv);
837 }
838 
839 /*
840  * _bt_parallel_done() -- Mark the parallel scan as complete.
841  *
842  * When there are no pages left to scan, this function should be called to
843  * notify other workers. Otherwise, they might wait forever for the scan to
844  * advance to the next page.
845  */
846 void
 847 _bt_parallel_done(IndexScanDesc scan)
848 {
849  BTScanOpaque so = (BTScanOpaque) scan->opaque;
850  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
851  BTParallelScanDesc btscan;
852  bool status_changed = false;
853 
854  /* Do nothing, for non-parallel scans */
855  if (parallel_scan == NULL)
856  return;
857 
858  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
859  parallel_scan->ps_offset);
860 
861  /*
862  * Mark the parallel scan as done for this combination of scan keys,
863  * unless some other process already did so. See also
864  * _bt_advance_array_keys.
865  */
866  SpinLockAcquire(&btscan->btps_mutex);
867  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
868  btscan->btps_pageStatus != BTPARALLEL_DONE)
869  {
870  btscan->btps_pageStatus = BTPARALLEL_DONE;
871  status_changed = true;
872  }
873  SpinLockRelease(&btscan->btps_mutex);
874 
875  /* wake up all the workers associated with this parallel scan */
876  if (status_changed)
877  ConditionVariableBroadcast(&btscan->btps_cv);
878 }
879 
880 /*
881  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
882  * keys.
883  *
884  * Updates the count of array keys processed for both local and parallel
885  * scans.
886  */
887 void
 888 _bt_parallel_advance_array_keys(IndexScanDesc scan)
889 {
890  BTScanOpaque so = (BTScanOpaque) scan->opaque;
891  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
892  BTParallelScanDesc btscan;
893 
894  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
895  parallel_scan->ps_offset);
896 
897  so->arrayKeyCount++;
898  SpinLockAcquire(&btscan->btps_mutex);
899  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
900  {
901  btscan->btps_scanPage = InvalidBlockNumber;
902  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
903  btscan->btps_arrayKeyCount++;
904  }
905  SpinLockRelease(&btscan->btps_mutex);
906 }
907 
908 /*
909  * Bulk deletion of all index entries pointing to a set of heap tuples.
910  * The set of target tuples is specified via a callback routine that tells
911  * whether any given heap tuple (identified by ItemPointer) is being deleted.
912  *
913  * Result: a palloc'd struct containing statistical info for VACUUM displays.
914  */
 915 IndexBulkDeleteResult *
 916 btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
917  IndexBulkDeleteCallback callback, void *callback_state)
918 {
919  Relation rel = info->index;
920  BTCycleId cycleid;
921 
922  /* allocate stats if first time through, else re-use existing struct */
923  if (stats == NULL)
 924  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
925 
926  /* Establish the vacuum cycle ID to use for this scan */
927  /* The ENSURE stuff ensures we clean up shared memory on failure */
 928  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
929  {
930  cycleid = _bt_start_vacuum(rel);
931 
932  btvacuumscan(info, stats, callback, callback_state, cycleid);
933  }
 934  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
935  _bt_end_vacuum(rel);
936 
937  return stats;
938 }
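For reference, a minimal sketch of the kind of callback btbulkdelete() expects (VACUUM's real callback is lazy_tid_reaped() in vacuumlazy.c; the state type below is hypothetical and a linear search stands in for its bsearch):

/* Sketch only, not part of nbtree.c. */
typedef struct DeadTidListState
{
	ItemPointer dead_tids;		/* heap TIDs found dead by the heap scan */
	int			num_dead_tids;
} DeadTidListState;

static bool
dead_tid_list_callback(ItemPointer itemptr, void *state)
{
	DeadTidListState *dt = (DeadTidListState *) state;
	int			i;

	/* returning true tells btvacuumpage() to delete this index entry */
	for (i = 0; i < dt->num_dead_tids; i++)
	{
		if (ItemPointerEquals(itemptr, &dt->dead_tids[i]))
			return true;
	}
	return false;
}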
939 
940 /*
941  * Post-VACUUM cleanup.
942  *
943  * Result: a palloc'd struct containing statistical info for VACUUM displays.
944  */
 945 IndexBulkDeleteResult *
 946 btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
947 {
948  /* No-op in ANALYZE ONLY mode */
949  if (info->analyze_only)
950  return stats;
951 
952  /*
953  * If btbulkdelete was called, we need not do anything, just return the
954  * stats from the latest btbulkdelete call. If it wasn't called, we must
955  * still do a pass over the index, to recycle any newly-recyclable pages
956  * and to obtain index statistics.
957  *
958  * Since we aren't going to actually delete any leaf items, there's no
959  * need to go through all the vacuum-cycle-ID pushups.
960  */
961  if (stats == NULL)
962  {
 963  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
964  btvacuumscan(info, stats, NULL, NULL, 0);
965  }
966 
967  /* Finally, vacuum the FSM */
 968  IndexFreeSpaceMapVacuum(info->index);
969 
970  /*
971  * It's quite possible for us to be fooled by concurrent page splits into
972  * double-counting some index tuples, so disbelieve any total that exceeds
973  * the underlying heap's count ... if we know that accurately. Otherwise
974  * this might just make matters worse.
975  */
976  if (!info->estimated_count)
977  {
978  if (stats->num_index_tuples > info->num_heap_tuples)
979  stats->num_index_tuples = info->num_heap_tuples;
980  }
981 
982  return stats;
983 }
984 
985 /*
986  * btvacuumscan --- scan the index for VACUUMing purposes
987  *
988  * This combines the functions of looking for leaf tuples that are deletable
989  * according to the vacuum callback, looking for empty pages that can be
990  * deleted, and looking for old deleted pages that can be recycled. Both
991  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
992  * btbulkdelete call occurred).
993  *
994  * The caller is responsible for initially allocating/zeroing a stats struct
995  * and for obtaining a vacuum cycle ID if necessary.
996  */
997 static void
 998 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
999  IndexBulkDeleteCallback callback, void *callback_state,
1000  BTCycleId cycleid)
1001 {
1002  Relation rel = info->index;
1003  BTVacState vstate;
1004  BlockNumber num_pages;
1005  BlockNumber blkno;
1006  bool needLock;
1007 
1008  /*
1009  * Reset counts that will be incremented during the scan; needed in case
1010  * of multiple scans during a single VACUUM command
1011  */
1012  stats->estimated_count = false;
1013  stats->num_index_tuples = 0;
1014  stats->pages_deleted = 0;
1015 
1016  /* Set up info to pass down to btvacuumpage */
1017  vstate.info = info;
1018  vstate.stats = stats;
1019  vstate.callback = callback;
1020  vstate.callback_state = callback_state;
1021  vstate.cycleid = cycleid;
1022  vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
 1023  vstate.lastBlockLocked = BTREE_METAPAGE;
1024  vstate.totFreePages = 0;
1025 
1026  /* Create a temporary memory context to run _bt_pagedel in */
 1027  vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1028  "_bt_pagedel",
 1029  ALLOCSET_DEFAULT_SIZES);
1030 
1031  /*
1032  * The outer loop iterates over all index pages except the metapage, in
1033  * physical order (we hope the kernel will cooperate in providing
1034  * read-ahead for speed). It is critical that we visit all leaf pages,
1035  * including ones added after we start the scan, else we might fail to
1036  * delete some deletable tuples. Hence, we must repeatedly check the
1037  * relation length. We must acquire the relation-extension lock while
1038  * doing so to avoid a race condition: if someone else is extending the
1039  * relation, there is a window where bufmgr/smgr have created a new
1040  * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1041  * we manage to scan such a page here, we'll improperly assume it can be
1042  * recycled. Taking the lock synchronizes things enough to prevent a
1043  * problem: either num_pages won't include the new page, or _bt_getbuf
1044  * already has write lock on the buffer and it will be fully initialized
1045  * before we can examine it. (See also vacuumlazy.c, which has the same
1046  * issue.) Also, we need not worry if a page is added immediately after
1047  * we look; the page splitting code already has write-lock on the left
1048  * page before it adds a right page, so we must already have processed any
1049  * tuples due to be moved into such a page.
1050  *
1051  * We can skip locking for new or temp relations, however, since no one
1052  * else could be accessing them.
1053  */
1054  needLock = !RELATION_IS_LOCAL(rel);
1055 
1056  blkno = BTREE_METAPAGE + 1;
1057  for (;;)
1058  {
1059  /* Get the current relation length */
1060  if (needLock)
 1061  LockRelationForExtension(rel, ExclusiveLock);
1062  num_pages = RelationGetNumberOfBlocks(rel);
1063  if (needLock)
 1064  UnlockRelationForExtension(rel, ExclusiveLock);
1065 
1066  /* Quit if we've scanned the whole relation */
1067  if (blkno >= num_pages)
1068  break;
1069  /* Iterate over pages, then loop back to recheck length */
1070  for (; blkno < num_pages; blkno++)
1071  {
1072  btvacuumpage(&vstate, blkno, blkno);
1073  }
1074  }
1075 
1076  /*
1077  * Check to see if we need to issue one final WAL record for this index,
1078  * which may be needed for correctness on a hot standby node when non-MVCC
1079  * index scans could take place.
1080  *
1081  * If the WAL is replayed in hot standby, the replay process needs to get
1082  * cleanup locks on all index leaf pages, just as we've been doing here.
1083  * However, we won't issue any WAL records about pages that have no items
1084  * to be deleted. For pages between pages we've vacuumed, the replay code
1085  * will take locks under the direction of the lastBlockVacuumed fields in
1086  * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
1087  * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
1088  * against the last leaf page in the index, if that one wasn't vacuumed.
1089  */
1090  if (XLogStandbyInfoActive() &&
1091  vstate.lastBlockVacuumed < vstate.lastBlockLocked)
1092  {
1093  Buffer buf;
1094 
1095  /*
1096  * The page should be valid, but we can't use _bt_getbuf() because we
1097  * want to use a nondefault buffer access strategy. Since we aren't
1098  * going to delete any items, getting cleanup lock again is probably
1099  * overkill, but for consistency do that anyway.
1100  */
1101  buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
1102  RBM_NORMAL, info->strategy);
1103  LockBufferForCleanup(buf);
1104  _bt_checkpage(rel, buf);
1105  _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
1106  _bt_relbuf(rel, buf);
1107  }
1108 
 1109  MemoryContextDelete(vstate.pagedelcontext);
1110 
1111  /* update statistics */
1112  stats->num_pages = num_pages;
1113  stats->pages_free = vstate.totFreePages;
1114 }
1115 
1116 /*
1117  * btvacuumpage --- VACUUM one page
1118  *
1119  * This processes a single page for btvacuumscan(). In some cases we
1120  * must go back and re-examine previously-scanned pages; this routine
1121  * recurses when necessary to handle that case.
1122  *
1123  * blkno is the page to process. orig_blkno is the highest block number
1124  * reached by the outer btvacuumscan loop (the same as blkno, unless we
1125  * are recursing to re-examine a previous page).
1126  */
1127 static void
1128 btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
1129 {
1130  IndexVacuumInfo *info = vstate->info;
1131  IndexBulkDeleteResult *stats = vstate->stats;
 1132  IndexBulkDeleteCallback callback = vstate->callback;
1133  void *callback_state = vstate->callback_state;
1134  Relation rel = info->index;
1135  bool delete_now;
1136  BlockNumber recurse_to;
1137  Buffer buf;
1138  Page page;
1139  BTPageOpaque opaque = NULL;
1140 
1141 restart:
1142  delete_now = false;
1143  recurse_to = P_NONE;
1144 
1145  /* call vacuum_delay_point while not holding any buffer lock */
 1146  vacuum_delay_point();
1147 
1148  /*
1149  * We can't use _bt_getbuf() here because it always applies
1150  * _bt_checkpage(), which will barf on an all-zero page. We want to
1151  * recycle all-zero pages, not fail. Also, we want to use a nondefault
1152  * buffer access strategy.
1153  */
1154  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1155  info->strategy);
1156  LockBuffer(buf, BT_READ);
1157  page = BufferGetPage(buf);
1158  if (!PageIsNew(page))
1159  {
1160  _bt_checkpage(rel, buf);
1161  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1162  }
1163 
1164  /*
1165  * If we are recursing, the only case we want to do anything with is a
1166  * live leaf page having the current vacuum cycle ID. Any other state
1167  * implies we already saw the page (eg, deleted it as being empty).
1168  */
1169  if (blkno != orig_blkno)
1170  {
1171  if (_bt_page_recyclable(page) ||
1172  P_IGNORE(opaque) ||
1173  !P_ISLEAF(opaque) ||
1174  opaque->btpo_cycleid != vstate->cycleid)
1175  {
1176  _bt_relbuf(rel, buf);
1177  return;
1178  }
1179  }
1180 
1181  /* Page is valid, see what to do with it */
1182  if (_bt_page_recyclable(page))
1183  {
1184  /* Okay to recycle this page */
1185  RecordFreeIndexPage(rel, blkno);
1186  vstate->totFreePages++;
1187  stats->pages_deleted++;
1188  }
1189  else if (P_ISDELETED(opaque))
1190  {
1191  /* Already deleted, but can't recycle yet */
1192  stats->pages_deleted++;
1193  }
1194  else if (P_ISHALFDEAD(opaque))
1195  {
1196  /* Half-dead, try to delete */
1197  delete_now = true;
1198  }
1199  else if (P_ISLEAF(opaque))
1200  {
1201  OffsetNumber deletable[MaxOffsetNumber];
1202  int ndeletable;
1203  OffsetNumber offnum,
1204  minoff,
1205  maxoff;
1206 
1207  /*
1208  * Trade in the initial read lock for a super-exclusive write lock on
1209  * this page. We must get such a lock on every leaf page over the
1210  * course of the vacuum scan, whether or not it actually contains any
1211  * deletable tuples --- see nbtree/README.
1212  */
 1213  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1214  LockBufferForCleanup(buf);
1215 
1216  /*
1217  * Remember highest leaf page number we've taken cleanup lock on; see
1218  * notes in btvacuumscan
1219  */
1220  if (blkno > vstate->lastBlockLocked)
1221  vstate->lastBlockLocked = blkno;
1222 
1223  /*
1224  * Check whether we need to recurse back to earlier pages. What we
1225  * are concerned about is a page split that happened since we started
1226  * the vacuum scan. If the split moved some tuples to a lower page
1227  * then we might have missed 'em. If so, set up for tail recursion.
1228  * (Must do this before possibly clearing btpo_cycleid below!)
1229  */
1230  if (vstate->cycleid != 0 &&
1231  opaque->btpo_cycleid == vstate->cycleid &&
1232  !(opaque->btpo_flags & BTP_SPLIT_END) &&
1233  !P_RIGHTMOST(opaque) &&
1234  opaque->btpo_next < orig_blkno)
1235  recurse_to = opaque->btpo_next;
1236 
1237  /*
1238  * Scan over all items to see which ones need deleted according to the
1239  * callback function.
1240  */
1241  ndeletable = 0;
1242  minoff = P_FIRSTDATAKEY(opaque);
1243  maxoff = PageGetMaxOffsetNumber(page);
1244  if (callback)
1245  {
1246  for (offnum = minoff;
1247  offnum <= maxoff;
1248  offnum = OffsetNumberNext(offnum))
1249  {
1250  IndexTuple itup;
1251  ItemPointer htup;
1252 
1253  itup = (IndexTuple) PageGetItem(page,
1254  PageGetItemId(page, offnum));
1255  htup = &(itup->t_tid);
1256 
1257  /*
1258  * During Hot Standby we currently assume that
1259  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
1260  * only true as long as the callback function depends only
1261  * upon whether the index tuple refers to heap tuples removed
1262  * in the initial heap scan. When vacuum starts it derives a
1263  * value of OldestXmin. Backends taking later snapshots could
1264  * have a RecentGlobalXmin with a later xid than the vacuum's
1265  * OldestXmin, so it is possible that row versions deleted
1266  * after OldestXmin could be marked as killed by other
1267  * backends. The callback function *could* look at the index
1268  * tuple state in isolation and decide to delete the index
1269  * tuple, though currently it does not. If it ever did, we
1270  * would need to reconsider whether XLOG_BTREE_VACUUM records
1271  * should cause conflicts. If they did cause conflicts they
1272  * would be fairly harsh conflicts, since we haven't yet
1273  * worked out a way to pass a useful value for
1274  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
1275  * applies to *any* type of index that marks index tuples as
1276  * killed.
1277  */
1278  if (callback(htup, callback_state))
1279  deletable[ndeletable++] = offnum;
1280  }
1281  }
1282 
1283  /*
1284  * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
1285  * call per page, so as to minimize WAL traffic.
1286  */
1287  if (ndeletable > 0)
1288  {
1289  /*
1290  * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
1291  * all information to the replay code to allow it to get a cleanup
1292  * lock on all pages between the previous lastBlockVacuumed and
1293  * this page. This ensures that WAL replay locks all leaf pages at
1294  * some point, which is important should non-MVCC scans be
1295  * requested. This is currently unused on standby, but we record
1296  * it anyway, so that the WAL contains the required information.
1297  *
1298  * Since we can visit leaf pages out-of-order when recursing,
1299  * replay might end up locking such pages an extra time, but it
1300  * doesn't seem worth the amount of bookkeeping it'd take to avoid
1301  * that.
1302  */
1303  _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
1304  vstate->lastBlockVacuumed);
1305 
1306  /*
1307  * Remember highest leaf page number we've issued a
1308  * XLOG_BTREE_VACUUM WAL record for.
1309  */
1310  if (blkno > vstate->lastBlockVacuumed)
1311  vstate->lastBlockVacuumed = blkno;
1312 
1313  stats->tuples_removed += ndeletable;
1314  /* must recompute maxoff */
1315  maxoff = PageGetMaxOffsetNumber(page);
1316  }
1317  else
1318  {
1319  /*
1320  * If the page has been split during this vacuum cycle, it seems
1321  * worth expending a write to clear btpo_cycleid even if we don't
1322  * have any deletions to do. (If we do, _bt_delitems_vacuum takes
1323  * care of this.) This ensures we won't process the page again.
1324  *
1325  * We treat this like a hint-bit update because there's no need to
1326  * WAL-log it.
1327  */
1328  if (vstate->cycleid != 0 &&
1329  opaque->btpo_cycleid == vstate->cycleid)
1330  {
1331  opaque->btpo_cycleid = 0;
1332  MarkBufferDirtyHint(buf, true);
1333  }
1334  }
1335 
1336  /*
1337  * If it's now empty, try to delete; else count the live tuples. We
1338  * don't delete when recursing, though, to avoid putting entries into
1339  * freePages out-of-order (doesn't seem worth any extra code to handle
1340  * the case).
1341  */
1342  if (minoff > maxoff)
1343  delete_now = (blkno == orig_blkno);
1344  else
1345  stats->num_index_tuples += maxoff - minoff + 1;
1346  }
1347 
1348  if (delete_now)
1349  {
1350  MemoryContext oldcontext;
1351  int ndel;
1352 
1353  /* Run pagedel in a temp context to avoid memory leakage */
 1354  MemoryContextReset(vstate->pagedelcontext);
1355  oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1356 
1357  ndel = _bt_pagedel(rel, buf);
1358 
1359  /* count only this page, else may double-count parent */
1360  if (ndel)
1361  stats->pages_deleted++;
1362 
1363  MemoryContextSwitchTo(oldcontext);
1364  /* pagedel released buffer, so we shouldn't */
1365  }
1366  else
1367  _bt_relbuf(rel, buf);
1368 
1369  /*
1370  * This is really tail recursion, but if the compiler is too stupid to
1371  * optimize it as such, we'd eat an uncomfortably large amount of stack
1372  * space per recursion level (due to the deletable[] array). A failure is
1373  * improbable since the number of levels isn't likely to be large ... but
1374  * just in case, let's hand-optimize into a loop.
1375  */
1376  if (recurse_to != P_NONE)
1377  {
1378  blkno = recurse_to;
1379  goto restart;
1380  }
1381 }
1382 
1383 /*
1384  * btcanreturn() -- Check whether btree indexes support index-only scans.
1385  *
1386  * btrees always do, so this is trivial.
1387  */
1388 bool
 1389 btcanreturn(Relation index, int attno)
1390 {
1391  return true;
1392 }