PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtree.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "storage/lwlock.h"
34#include "storage/read_stream.h"
35#include "utils/datum.h"
36#include "utils/fmgrprotos.h"
38#include "utils/memutils.h"
39#include "utils/wait_event.h"
40
41
42/*
43 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44 *
45 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
46 * scan to advance it via another call to _bt_first.
47 *
48 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
49 * a new page; others must wait.
50 *
51 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
52 * to a new page; some process can start doing that.
53 *
54 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
55 */
64
65/*
66 * BTParallelScanDescData contains btree specific shared information required
67 * for parallel scan.
68 */
/*
 * NOTE(review): HTML extraction stripped original lines 69, 85, 93 and 95
 * here — presumably the "typedef struct BTParallelScanDescData" header, the
 * "int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];" member described by the
 * comment below, and the closing "} BTParallelScanDescData;" — restore from
 * upstream before compiling.
 */
70{
71 BlockNumber btps_nextScanPage; /* next page to be scanned */
72 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
 73 * btps_nextScanPage */
74 BTPS_State btps_pageStatus; /* indicates whether next page is
 75 * available for scan. see above for
 76 * possible states of parallel scan. */
77 LWLock btps_lock; /* protects shared parallel state */
78 ConditionVariable btps_cv; /* used to synchronize parallel scan */
79
80 /*
 81 * btps_arrElems is used when scans need to schedule another primitive
 82 * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
 83 * offsets for each = scan key associated with a ScalarArrayOp array.
 84 */
86
 87 /*
 88 * Additional space (at the end of the struct) is used when scans need to
 89 * schedule another primitive index scan with one or more skip arrays.
 90 * Holds a flattened datum representation for each = scan key associated
 91 * with a skip array.
 92 */
94
96
97
98static bool _bt_start_prim_scan(IndexScanDesc scan);
103static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
104 IndexBulkDeleteCallback callback, void *callback_state,
105 BTCycleId cycleid);
109 OffsetNumber updatedoffset,
110 int *nremaining);
111
112
113/*
114 * Btree handler function: return IndexAmRoutine with access method parameters
115 * and callbacks.
116 */
117Datum
/*
 * NOTE(review): the function-name line (original line 118, presumably
 * "bthandler(PG_FUNCTION_ARGS)") was stripped by extraction, as were lines
 * 121, 145 and 176 (likely ".amname", the VACUUM_OPTION_* flags for
 * .amparallelvacuumoptions, and the PG_RETURN_POINTER(&amroutine) return).
 * The visible body fills in a static IndexAmRoutine with nbtree's
 * capability flags and callback table.
 */
119{
120 static const IndexAmRoutine amroutine = {
 122 .amstrategies = BTMaxStrategyNumber,
 123 .amsupport = BTNProcs,
 124 .amoptsprocnum = BTOPTIONS_PROC,
 125 .amcanorder = true,
 126 .amcanorderbyop = false,
 127 .amcanhash = false,
 128 .amconsistentequality = true,
 129 .amconsistentordering = true,
 130 .amcanbackward = true,
 131 .amcanunique = true,
 132 .amcanmulticol = true,
 133 .amoptionalkey = true,
 134 .amsearcharray = true,
 135 .amsearchnulls = true,
 136 .amstorage = false,
 137 .amclusterable = true,
 138 .ampredlocks = true,
 139 .amcanparallel = true,
 140 .amcanbuildparallel = true,
 141 .amcaninclude = true,
 142 .amusemaintenanceworkmem = false,
 143 .amsummarizing = false,
 144 .amparallelvacuumoptions =
 146 .amkeytype = InvalidOid,
 147
 148 .ambuild = btbuild,
 149 .ambuildempty = btbuildempty,
 150 .aminsert = btinsert,
 151 .aminsertcleanup = NULL,
 152 .ambulkdelete = btbulkdelete,
 153 .amvacuumcleanup = btvacuumcleanup,
 154 .amcanreturn = btcanreturn,
 155 .amcostestimate = btcostestimate,
 156 .amgettreeheight = btgettreeheight,
 157 .amoptions = btoptions,
 158 .amproperty = btproperty,
 159 .ambuildphasename = btbuildphasename,
 160 .amvalidate = btvalidate,
 161 .amadjustmembers = btadjustmembers,
 162 .ambeginscan = btbeginscan,
 163 .amrescan = btrescan,
 164 .amgettuple = btgettuple,
 165 .amgetbitmap = btgetbitmap,
 166 .amendscan = btendscan,
 167 .ammarkpos = btmarkpos,
 168 .amrestrpos = btrestrpos,
 169 .amestimateparallelscan = btestimateparallelscan,
 170 .aminitparallelscan = btinitparallelscan,
 171 .amparallelrescan = btparallelrescan,
 172 .amtranslatestrategy = bttranslatestrategy,
 173 .amtranslatecmptype = bttranslatecmptype,
 174 };
 175
177}
178
179/*
180 * btbuildempty() -- build an empty btree index in the initialization fork
181 */
182void
/*
 * NOTE(review): extraction stripped original lines 183 and 187-190 — the
 * signature (presumably "btbuildempty(Relation index)"), the metabuf
 * declaration, and the smgr_bulk_start_rel() call that initializes
 * bulkstate. The visible body writes a single btree metapage into the
 * init fork via the bulk-write facility.
 */
 184{
 185 bool allequalimage = _bt_allequalimage(index, false);
 186 BulkWriteState *bulkstate;
 188
 190
 191 /* Construct metapage. */
 192 metabuf = smgr_bulk_get_buf(bulkstate);
 193 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
 194 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
 195
 196 smgr_bulk_finish(bulkstate);
 197}
198
199/*
200 * btinsert() -- insert an index tuple into a btree.
201 *
202 * Descend the tree recursively, find the appropriate location for our
203 * new tuple, and put it there.
204 */
205bool
206btinsert(Relation rel, Datum *values, bool *isnull,
/*
 * NOTE(review): original lines 207-208 are missing — they declare the
 * ht_ctid, heapRel and checkUnique parameters referenced in the body
 * below; confirm against upstream.
 */
 209 bool indexUnchanged,
 210 IndexInfo *indexInfo)
211{
212 bool result;
213 IndexTuple itup;
214
215 /* generate an index tuple */
216 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
217 itup->t_tid = *ht_ctid;
218
 /* descend the tree and insert; result reflects uniqueness checking */
219 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
220
221 pfree(itup);
222
223 return result;
224}
225
226/*
227 * btgettuple() -- Get the next tuple in the scan.
228 */
229bool
/*
 * NOTE(review): original lines 230 and 232 are stripped — the signature
 * (presumably "btgettuple(IndexScanDesc scan, ScanDirection dir)") and the
 * BTScanOpaque "so" declaration used throughout the body.
 */
 231{
 233 bool res;
 234
 235 Assert(scan->heapRelation != NULL);
 236
 237 /* btree indexes are never lossy */
 238 scan->xs_recheck = false;
 239
 240 /* Each loop iteration performs another primitive index scan */
 241 do
 242 {
 243 /*
 244 * If we've already initialized this scan, we can just advance it in
 245 * the appropriate direction. If we haven't done so yet, we call
 246 * _bt_first() to get the first item in the scan.
 247 */
 248 if (!BTScanPosIsValid(so->currPos))
 249 res = _bt_first(scan, dir);
 250 else
 251 {
 252 /*
 253 * Check to see if we should kill the previously-fetched tuple.
 254 */
 255 if (scan->kill_prior_tuple)
 256 {
 257 /*
 258 * Yes, remember it for later. (We'll deal with all such
 259 * tuples at once right before leaving the index page.) The
 260 * test for numKilled overrun is not just paranoia: if the
 261 * caller reverses direction in the indexscan then the same
 262 * item might get entered multiple times. It's not worth
 263 * trying to optimize that, so we don't detect it, but instead
 264 * just forget any excess entries.
 265 */
 266 if (so->killedItems == NULL)
 267 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage)
 268 if (so->numKilled < MaxTIDsPerBTreePage)
 269 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
 270 }
 271
 272 /*
 273 * Now continue the scan.
 274 */
 275 res = _bt_next(scan, dir);
 276 }
 277
 278 /* If we have a tuple, return it ... */
 279 if (res)
 280 break;
 281 /* ... otherwise see if we need another primitive index scan */
 282 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
 283
 284 return res;
 285}
286
287/*
288 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
289 */
290int64
/*
 * NOTE(review): original lines 291, 293 and 303 are stripped — the
 * signature (presumably taking IndexScanDesc scan and TIDBitmap *tbm),
 * the BTScanOpaque "so" declaration, and the "if (_bt_first(...))" guard
 * that the bare "{" on original line 304 belongs to.
 */
 292{
 294 int64 ntids = 0;
 295 ItemPointer heapTid;
 296
 297 Assert(scan->heapRelation == NULL);
 298
 299 /* Each loop iteration performs another primitive index scan */
 300 do
 301 {
 302 /* Fetch the first page & tuple */
 304 {
 305 /* Save tuple ID, and continue scanning */
 306 heapTid = &scan->xs_heaptid;
 307 tbm_add_tuples(tbm, heapTid, 1, false);
 308 ntids++;
 309
 310 for (;;)
 311 {
 312 /*
 313 * Advance to next tuple within page. This is the same as the
 314 * easy case in _bt_next().
 315 */
 316 if (++so->currPos.itemIndex > so->currPos.lastItem)
 317 {
 318 /* let _bt_next do the heavy lifting */
 319 if (!_bt_next(scan, ForwardScanDirection))
 320 break;
 321 }
 322
 323 /* Save tuple ID, and continue scanning */
 324 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
 325 tbm_add_tuples(tbm, heapTid, 1, false);
 326 ntids++;
 327 }
 328 }
 329 /* Now see if we need another primitive index scan */
 330 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
 331
 332 return ntids;
 333}
334
335/*
336 * btbeginscan() -- start a scan on a btree index
337 */
/*
 * NOTE(review): the return-type line (original line 338, presumably
 * "IndexScanDesc") and lines 342/351 (the BTScanOpaque "so" declaration
 * and its palloc) were stripped by extraction.
 */
339btbeginscan(Relation rel, int nkeys, int norderbys)
340{
341 IndexScanDesc scan;
 343
 344 /* no order by operators allowed */
 345 Assert(norderbys == 0);
 346
 347 /* get the scan */
 348 scan = RelationGetIndexScan(rel, nkeys, norderbys);
 349
 350 /* allocate private workspace */
 352 BTScanPosInvalidate(so->currPos);
 353 BTScanPosInvalidate(so->markPos);
 354 if (scan->numberOfKeys > 0)
 355 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
 356 else
 357 so->keyData = NULL;
 358
 359 so->skipScan = false;
 360 so->needPrimScan = false;
 361 so->scanBehind = false;
 362 so->oppositeDirCheck = false;
 363 so->arrayKeys = NULL;
 364 so->orderProcs = NULL;
 365 so->arrayContext = NULL;
 366
 367 so->killedItems = NULL; /* until needed */
 368 so->numKilled = 0;
 369
 370 /*
 371 * We don't know yet whether the scan will be index-only, so we do not
 372 * allocate the tuple workspace arrays until btrescan. However, we set up
 373 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
 374 */
 375 so->currTuples = so->markTuples = NULL;
 376
 377 scan->xs_itupdesc = RelationGetDescr(rel);
 378
 379 scan->opaque = so;
 380
 381 return scan;
 382}
383
384/*
385 * btrescan() -- rescan an index relation
386 */
387void
/*
 * NOTE(review): original lines 388, 391 and 420 are stripped — the first
 * signature line (presumably "btrescan(IndexScanDesc scan, ScanKey scankey,
 * int nscankeys,"), the BTScanOpaque "so" declaration, and the middle term
 * of the so->dropPin expression (likely the IsMVCCSnapshot test the comment
 * above it describes).
 */
 389 ScanKey orderbys, int norderbys)
390{
 392
 393 /* we aren't holding any read locks, but gotta drop the pins */
 394 if (BTScanPosIsValid(so->currPos))
 395 {
 396 /* Before leaving current page, deal with any killed items */
 397 if (so->numKilled > 0)
 398 _bt_killitems(scan);
 399 BTScanPosUnpinIfPinned(so->currPos);
 400 BTScanPosInvalidate(so->currPos);
 401 }
 402
 403 /*
 404 * We prefer to eagerly drop leaf page pins before btgettuple returns.
 405 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
 406 *
 407 * We cannot safely drop leaf page pins during index-only scans due to a
 408 * race condition involving VACUUM setting pages all-visible in the VM.
 409 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
 410 *
 411 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
 412 * Pins cannot be held for more than an instant during bitmap scans either
 413 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
 414 *
 415 * See nbtree/README section on making concurrent TID recycling safe.
 416 *
 417 * Note: so->dropPin should never change across rescans.
 418 */
 419 so->dropPin = (!scan->xs_want_itup &&
 421 scan->heapRelation != NULL);
 422
 423 so->markItemIndex = -1;
 424 so->needPrimScan = false;
 425 so->scanBehind = false;
 426 so->oppositeDirCheck = false;
 427 BTScanPosUnpinIfPinned(so->markPos);
 428 BTScanPosInvalidate(so->markPos);
 429
 430 /*
 431 * Allocate tuple workspace arrays, if needed for an index-only scan and
 432 * not already done in a previous rescan call. To save on palloc
 433 * overhead, both workspaces are allocated as one palloc block; only this
 434 * function and btendscan know that.
 435 */
 436 if (scan->xs_want_itup && so->currTuples == NULL)
 437 {
 438 so->currTuples = (char *) palloc(BLCKSZ * 2);
 439 so->markTuples = so->currTuples + BLCKSZ;
 440 }
 441
 442 /*
 443 * Reset the scan keys
 444 */
 445 if (scankey && scan->numberOfKeys > 0)
 446 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
 447 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
 448 so->numArrayKeys = 0; /* ditto */
 449}
450
451/*
452 * btendscan() -- close down a scan
453 */
454void
/*
 * NOTE(review): original lines 455 and 457 are stripped — the signature
 * (presumably "btendscan(IndexScanDesc scan)") and the BTScanOpaque "so"
 * declaration. Frees all scan-private storage after handling killed items.
 */
 456{
 458
 459 /* we aren't holding any read locks, but gotta drop the pins */
 460 if (BTScanPosIsValid(so->currPos))
 461 {
 462 /* Before leaving current page, deal with any killed items */
 463 if (so->numKilled > 0)
 464 _bt_killitems(scan);
 465 BTScanPosUnpinIfPinned(so->currPos);
 466 }
 467
 468 so->markItemIndex = -1;
 469 BTScanPosUnpinIfPinned(so->markPos);
 470
 471 /* No need to invalidate positions, the RAM is about to be freed. */
 472
 473 /* Release storage */
 474 if (so->keyData != NULL)
 475 pfree(so->keyData);
 476 /* so->arrayKeys and so->orderProcs are in arrayContext */
 477 if (so->arrayContext != NULL)
 478 MemoryContextDelete(so->arrayContext);
 479 if (so->killedItems != NULL)
 480 pfree(so->killedItems);
 481 if (so->currTuples != NULL)
 482 pfree(so->currTuples);
 483 /* so->markTuples should not be pfree'd, see btrescan */
 484 pfree(so);
 485}
486
487/*
488 * btmarkpos() -- save current scan position
489 */
490void
/*
 * NOTE(review): original lines 491 and 493 are stripped — the signature
 * (presumably "btmarkpos(IndexScanDesc scan)") and the BTScanOpaque "so"
 * declaration.
 */
 492{
 494
 495 /* There may be an old mark with a pin (but no lock). */
 496 BTScanPosUnpinIfPinned(so->markPos);
 497
 498 /*
 499 * Just record the current itemIndex. If we later step to next page
 500 * before releasing the marked position, _bt_steppage makes a full copy of
 501 * the currPos struct in markPos. If (as often happens) the mark is moved
 502 * before we leave the page, we don't have to do that work.
 503 */
 504 if (BTScanPosIsValid(so->currPos))
 505 so->markItemIndex = so->currPos.itemIndex;
 506 else
 507 {
 508 BTScanPosInvalidate(so->markPos);
 509 so->markItemIndex = -1;
 510 }
 511}
512
513/*
514 * btrestrpos() -- restore scan to last saved position
515 */
516void
/*
 * NOTE(review): original lines 517, 519 and 554 are stripped — the
 * signature (presumably "btrestrpos(IndexScanDesc scan)"), the
 * BTScanOpaque "so" declaration, and the first size term of the memcpy
 * (likely offsetof(BTScanPosData, items[1]) +) whose continuation is
 * visible on original line 555.
 */
 518{
 520
 521 if (so->markItemIndex >= 0)
 522 {
 523 /*
 524 * The scan has never moved to a new page since the last mark. Just
 525 * restore the itemIndex.
 526 *
 527 * NB: In this case we can't count on anything in so->markPos to be
 528 * accurate.
 529 */
 530 so->currPos.itemIndex = so->markItemIndex;
 531 }
 532 else
 533 {
 534 /*
 535 * The scan moved to a new page after last mark or restore, and we are
 536 * now restoring to the marked page. We aren't holding any read
 537 * locks, but if we're still holding the pin for the current position,
 538 * we must drop it.
 539 */
 540 if (BTScanPosIsValid(so->currPos))
 541 {
 542 /* Before leaving current page, deal with any killed items */
 543 if (so->numKilled > 0)
 544 _bt_killitems(scan);
 545 BTScanPosUnpinIfPinned(so->currPos);
 546 }
 547
 548 if (BTScanPosIsValid(so->markPos))
 549 {
 550 /* bump pin on mark buffer for assignment to current buffer */
 551 if (BTScanPosIsPinned(so->markPos))
 552 IncrBufferRefCount(so->markPos.buf);
 553 memcpy(&so->currPos, &so->markPos,
 555 so->markPos.lastItem * sizeof(BTScanPosItem));
 556 if (so->currTuples)
 557 memcpy(so->currTuples, so->markTuples,
 558 so->markPos.nextTupleOffset);
 559 /* Reset the scan's array keys (see _bt_steppage for why) */
 560 if (so->numArrayKeys)
 561 {
 562 _bt_start_array_keys(scan, so->currPos.dir);
 563 so->needPrimScan = false;
 564 }
 565 }
 566 else
 567 BTScanPosInvalidate(so->currPos);
 568 }
 569}
570
571/*
572 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
573 */
574Size
575btestimateparallelscan(Relation rel, int nkeys, int norderbys)
/*
 * NOTE(review): original lines 576-579, 585, 618, 621 and 638-639 are
 * stripped — the local declarations (estnbtreeshared, genericattrspace,
 * nkeyatts), the first term of the estnbtreeshared initializer whose
 * "sizeof(int) * nkeys" continuation is visible below, the by-value
 * datumEstimateSpace/add_size calls in the attbyval branch, and the final
 * add_size for pass-by-reference attributes.
 */
 576{
 580
 581 /*
 582 * Pessimistically assume that every input scan key will be output with
 583 * its own SAOP array
 584 */
 586 sizeof(int) * nkeys;
 587
 588 /* Single column indexes cannot possibly use a skip array */
 589 if (nkeyatts == 1)
 590 return estnbtreeshared;
 591
 592 /*
 593 * Pessimistically assume that all attributes prior to the least
 594 * significant attribute require a skip array (and an associated key)
 595 */
 596 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
 597 sizeof(Datum));
 598 for (int attnum = 1; attnum < nkeyatts; attnum++)
 599 {
 600 CompactAttribute *attr;
 601
 602 /*
 603 * We make the conservative assumption that every index column will
 604 * also require a skip array.
 605 *
 606 * Every skip array must have space to store its scan key's sk_flags.
 607 * We also need space for each skip array's unused btps_arrElems slot
 608 * (we need to be able to subscript btps_arrElems using a simple
 609 * so->arrayKeys[]-wise offset for any subsequent SAOP arrays).
 610 */
 611 estnbtreeshared = add_size(estnbtreeshared, sizeof(int) * 2);
 612
 613 /* Consider space required to store a datum of opclass input type */
 614 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
 615 if (attr->attbyval)
 616 {
 617 /* This index attribute stores pass-by-value datums */
 619 true, attr->attlen);
 620
 622 continue;
 623 }
 624
 625 /*
 626 * This index attribute stores pass-by-reference datums.
 627 *
 628 * Assume that serializing this array will use just as much space as a
 629 * pass-by-value datum, in addition to space for the largest possible
 630 * whole index tuple (this is not just a per-datum portion of the
 631 * largest possible tuple because that'd be almost as large anyway).
 632 *
 633 * This is quite conservative, but it's not clear how we could do much
 634 * better. The executor requires an up-front storage request size
 635 * that reliably covers the scan's high watermark memory usage. We
 636 * can't be sure of the real high watermark until the scan is over.
 637 */
 640 }
 641
 642 return estnbtreeshared;
 643}
644
645/*
646 * _bt_start_prim_scan() -- start scheduled primitive index scan?
647 *
648 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
649 * as the last one ended. Otherwise returns false, indicating that the array
650 * keys are now fully exhausted.
651 *
652 * Only call here during scans with one or more equality type array scan keys,
653 * after _bt_first or _bt_next return false.
654 */
655static bool
/*
 * NOTE(review): original lines 656 and 658 are stripped — the signature
 * (presumably "_bt_start_prim_scan(IndexScanDesc scan)") and the
 * BTScanOpaque "so" declaration. Returns true iff _bt_checkkeys scheduled
 * another primitive index scan via so->needPrimScan; otherwise marks a
 * parallel scan done and returns false.
 */
 657{
 659
 660 Assert(so->numArrayKeys);
 661
 662 so->scanBehind = so->oppositeDirCheck = false; /* reset */
 663
 664 /*
 665 * Array keys are advanced within _bt_checkkeys when the scan reaches the
 666 * leaf level (more precisely, they're advanced when the scan reaches the
 667 * end of each distinct set of array elements). This process avoids
 668 * repeat access to leaf pages (across multiple primitive index scans) by
 669 * advancing the scan's array keys when it allows the primitive index scan
 670 * to find nearby matching tuples (or when it eliminates ranges of array
 671 * key space that can't possibly be satisfied by any index tuple).
 672 *
 673 * _bt_checkkeys sets a simple flag variable to schedule another primitive
 674 * index scan. The flag tells us what to do.
 675 *
 676 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
 677 * various cases where that won't happen. For example, if the index is
 678 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
 679 * We also don't expect a call to _bt_checkkeys during searches for a
 680 * non-existent value that happens to be lower/higher than any existing
 681 * value in the index.
 682 *
 683 * We don't require special handling for these cases -- we don't need to
 684 * be explicitly instructed to _not_ perform another primitive index scan.
 685 * It's up to code under the control of _bt_first to always set the flag
 686 * when another primitive index scan will be required.
 687 *
 688 * This works correctly, even with the tricky cases listed above, which
 689 * all involve access to leaf pages "near the boundaries of the key space"
 690 * (whether it's from a leftmost/rightmost page, or an imaginary empty
 691 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
 692 * index scan for one set of array keys, then it also won't be reached for
 693 * any later set ("later" in terms of the direction that we scan the index
 694 * and advance the arrays). The array keys won't have advanced in these
 695 * cases, but that's the correct behavior (even _bt_advance_array_keys
 696 * won't always advance the arrays at the point they become "exhausted").
 697 */
 698 if (so->needPrimScan)
 699 {
 700 /*
 701 * Flag was set -- must call _bt_first again, which will reset the
 702 * scan's needPrimScan flag
 703 */
 704 return true;
 705 }
 706
 707 /* The top-level index scan ran out of tuples in this scan direction */
 708 if (scan->parallel_scan != NULL)
 709 _bt_parallel_done(scan);
 710
 711 return false;
 712}
713
714/*
715 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
716 *
717 * Caller must have exclusively locked btscan->btps_lock when called.
718 */
719static void
/*
 * NOTE(review): original lines 720-721 are stripped — the signature,
 * which (per the body) takes at least the BTScanOpaque "so" and the
 * BTParallelScanDesc "btscan". Writes each SAOP array's cur_elem into
 * btps_arrElems[] and serializes each skip array's sk_flags + sk_argument
 * into the space that follows the array.
 */
 722{
 723 char *datumshared;
 724
 725 /* Space for serialized datums begins immediately after btps_arrElems[] */
 726 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
 727 for (int i = 0; i < so->numArrayKeys; i++)
 728 {
 729 BTArrayKeyInfo *array = &so->arrayKeys[i];
 730 ScanKey skey = &so->keyData[array->scan_key];
 731
 732 if (array->num_elems != -1)
 733 {
 734 /* Save SAOP array's cur_elem (no need to copy key/datum) */
 735 Assert(!(skey->sk_flags & SK_BT_SKIP));
 736 btscan->btps_arrElems[i] = array->cur_elem;
 737 continue;
 738 }
 739
 740 /* Save all mutable state associated with skip array's key */
 741 Assert(skey->sk_flags & SK_BT_SKIP);
 742 memcpy(datumshared, &skey->sk_flags, sizeof(int));
 743 datumshared += sizeof(int);
 744
 745 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
 746 {
 747 /* No sk_argument datum to serialize */
 748 Assert(skey->sk_argument == 0);
 749 continue;
 750 }
 751
 752 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
 753 array->attbyval, array->attlen, &datumshared);
 754 }
 755}
756
757/*
758 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
759 *
760 * Caller must have exclusively locked btscan->btps_lock when called.
761 */
762static void
/*
 * NOTE(review): original lines 763-764 are stripped — the signature,
 * which (per the body) takes at least the BTScanOpaque "so" and the
 * BTParallelScanDesc "btscan". Inverse of _bt_parallel_serialize_arrays:
 * restores SAOP cur_elem values and deserializes skip-array key state.
 */
 765{
 766 char *datumshared;
 767
 768 /* Space for serialized datums begins immediately after btps_arrElems[] */
 769 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
 770 for (int i = 0; i < so->numArrayKeys; i++)
 771 {
 772 BTArrayKeyInfo *array = &so->arrayKeys[i];
 773 ScanKey skey = &so->keyData[array->scan_key];
 774 bool isnull;
 775
 776 if (array->num_elems != -1)
 777 {
 778 /* Restore SAOP array using its saved cur_elem */
 779 Assert(!(skey->sk_flags & SK_BT_SKIP));
 780 array->cur_elem = btscan->btps_arrElems[i];
 781 skey->sk_argument = array->elem_values[array->cur_elem];
 782 continue;
 783 }
 784
 785 /* Restore skip array by restoring its key directly */
 786 if (!array->attbyval && skey->sk_argument)
 787 pfree(DatumGetPointer(skey->sk_argument));
 788 skey->sk_argument = (Datum) 0;
 789 memcpy(&skey->sk_flags, datumshared, sizeof(int));
 790 datumshared += sizeof(int);
 791
 792 Assert(skey->sk_flags & SK_BT_SKIP);
 793
 794 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
 795 {
 796 /* No sk_argument datum to restore */
 797 continue;
 798 }
 799
 800 skey->sk_argument = datumRestore(&datumshared, &isnull);
 801 if (isnull)
 802 {
 803 Assert(skey->sk_argument == 0);
 804 Assert(skey->sk_flags & SK_SEARCHNULL);
 805 Assert(skey->sk_flags & SK_ISNULL);
 806 }
 807 }
 808}
809
810/*
811 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
812 */
813void
/*
 * NOTE(review): original lines 814, 816, 819 and 823 are stripped — the
 * signature (presumably "btinitparallelscan(void *target)"), the
 * BTParallelScanDesc "bt_target" declaration/cast, the LWLock tranche
 * argument continuation, and the ConditionVariableInit(&bt_target->btps_cv)
 * call.
 */
 815{
 817
 818 LWLockInitialize(&bt_target->btps_lock,
 820 bt_target->btps_nextScanPage = InvalidBlockNumber;
 821 bt_target->btps_lastCurrPage = InvalidBlockNumber;
 822 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 824}
825
826/*
827 * btparallelrescan() -- reset parallel scan
828 */
829void
/*
 * NOTE(review): original lines 830 and 832 are stripped — the signature
 * (presumably "btparallelrescan(IndexScanDesc scan)") and the
 * BTParallelScanDesc "btscan" declaration. Resets shared parallel-scan
 * state back to BTPARALLEL_NOT_INITIALIZED under the btps_lock.
 */
 831{
 833 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 834
 835 Assert(parallel_scan);
 836
 837 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 838 parallel_scan->ps_offset_am);
 839
 840 /*
 841 * In theory, we don't need to acquire the LWLock here, because there
 842 * shouldn't be any other workers running at this point, but we do so for
 843 * consistency.
 844 */
 845 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 846 btscan->btps_nextScanPage = InvalidBlockNumber;
 847 btscan->btps_lastCurrPage = InvalidBlockNumber;
 848 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
 849 LWLockRelease(&btscan->btps_lock);
 850}
851
852/*
853 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
854 * page. Other scans must wait until we call _bt_parallel_release()
855 * or _bt_parallel_done().
856 *
857 * The return value is true if we successfully seized the scan and false
858 * if we did not. The latter case occurs when no pages remain, or when
859 * another primitive index scan is scheduled that caller's backend cannot
860 * start just yet (only backends that call from _bt_first are capable of
861 * starting primitive index scans, which they indicate by passing first=true).
862 *
863 * If the return value is true, *next_scan_page returns the next page of the
864 * scan, and *last_curr_page returns the page that *next_scan_page came from.
865 * An invalid *next_scan_page means the scan hasn't yet started, or that
866 * caller needs to start the next primitive index scan (if it's the latter
867 * case we'll set so.needPrimScan).
868 *
869 * Callers should ignore the value of *next_scan_page and *last_curr_page if
870 * the return value is false.
871 */
872bool
/*
 * NOTE(review): several original lines are stripped here — 873 (the first
 * signature line, presumably "_bt_parallel_seize(IndexScanDesc scan,
 * BlockNumber *next_scan_page,"), 877 (BTScanOpaque "so"), 882 (the
 * BTParallelScanDesc "btscan" declaration), 884-885 (likely asserts on
 * parallel_scan), 945 (the _bt_parallel_restore_arrays call the comment
 * above it announces), and 981/983 (likely the ConditionVariableSleep and
 * ConditionVariableCancelSleep calls of the wait loop).
 */
 874 BlockNumber *last_curr_page, bool first)
 875{
 876 Relation rel = scan->indexRelation;
 878 bool exit_loop = false,
 879 status = true,
 880 endscan = false;
 881 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 883
 886
 887 /*
 888 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
 889 * call to _bt_readnextpage treats this backend similarly to a serial
 890 * backend that steps from *last_curr_page to *next_scan_page (unless this
 891 * backend's so->currPos is initialized by _bt_readfirstpage before then).
 892 */
 893 BTScanPosInvalidate(so->currPos);
 894 so->currPos.moreLeft = so->currPos.moreRight = true;
 895
 896 if (first)
 897 {
 898 /*
 899 * Initialize array related state when called from _bt_first, assuming
 900 * that this will be the first primitive index scan for the scan
 901 */
 902 so->needPrimScan = false;
 903 so->scanBehind = false;
 904 so->oppositeDirCheck = false;
 905 }
 906 else
 907 {
 908 /*
 909 * Don't attempt to seize the scan when it requires another primitive
 910 * index scan, since caller's backend cannot start it right now
 911 */
 912 if (so->needPrimScan)
 913 return false;
 914 }
 915
 916 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 917 parallel_scan->ps_offset_am);
 918
 919 while (1)
 920 {
 921 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 922
 923 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
 924 {
 925 /* We're done with this parallel index scan */
 926 status = false;
 927 }
 928 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
 929 btscan->btps_nextScanPage == P_NONE)
 930 {
 931 /* End this parallel index scan */
 932 status = false;
 933 endscan = true;
 934 }
 935 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
 936 {
 937 Assert(so->numArrayKeys);
 938
 939 if (first)
 940 {
 941 /* Can start scheduled primitive scan right away, so do so */
 942 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 943
 944 /* Restore scan's array keys from serialized values */
 946 exit_loop = true;
 947 }
 948 else
 949 {
 950 /*
 951 * Don't attempt to seize the scan when it requires another
 952 * primitive index scan, since caller's backend cannot start
 953 * it right now
 954 */
 955 status = false;
 956 }
 957
 958 /*
 959 * Either way, update backend local state to indicate that a
 960 * pending primitive scan is required
 961 */
 962 so->needPrimScan = true;
 963 so->scanBehind = false;
 964 so->oppositeDirCheck = false;
 965 }
 966 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
 967 {
 968 /*
 969 * We have successfully seized control of the scan for the purpose
 970 * of advancing it to a new page!
 971 */
 972 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 973 Assert(btscan->btps_nextScanPage != P_NONE);
 974 *next_scan_page = btscan->btps_nextScanPage;
 975 *last_curr_page = btscan->btps_lastCurrPage;
 976 exit_loop = true;
 977 }
 978 LWLockRelease(&btscan->btps_lock);
 979 if (exit_loop || !status)
 980 break;
 982 }
 984
 985 /* When the scan has reached the rightmost (or leftmost) page, end it */
 986 if (endscan)
 987 _bt_parallel_done(scan);
 988
 989 return status;
 990}
991
992/*
993 * _bt_parallel_release() -- Complete the process of advancing the scan to a
994 * new page. We now have the new value btps_nextScanPage; another backend
995 * can now begin advancing the scan.
996 *
997 * Callers whose scan uses array keys must save their curr_page argument so
998 * that it can be passed to _bt_parallel_primscan_schedule, should caller
999 * determine that another primitive index scan is required.
1000 *
1001 * If caller's next_scan_page is P_NONE, the scan has reached the index's
1002 * rightmost/leftmost page. This is treated as reaching the end of the scan
1003 * within _bt_parallel_seize.
1004 *
1005 * Note: unlike the serial case, parallel scans don't need to remember both
1006 * sibling links. next_scan_page is whichever link is next given the scan's
1007 * direction. That's all we'll ever need, since the direction of a parallel
1008 * scan can never change.
1009 */
1010void
/*
 * NOTE(review): original lines 1011-1012, 1015, 1017 and 1027 are stripped
 * — the signature (which per the body takes scan, next_scan_page and
 * curr_page), the BTParallelScanDesc "btscan" declaration, a likely assert
 * on next_scan_page, and the ConditionVariableSignal(&btscan->btps_cv)
 * wake-up that lets another backend seize the scan.
 */
 1013{
 1014 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 1016
 1018
 1019 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 1020 parallel_scan->ps_offset_am);
 1021
 1022 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 1023 btscan->btps_nextScanPage = next_scan_page;
 1024 btscan->btps_lastCurrPage = curr_page;
 1025 btscan->btps_pageStatus = BTPARALLEL_IDLE;
 1026 LWLockRelease(&btscan->btps_lock);
 1028}
1029
1030/*
1031 * _bt_parallel_done() -- Mark the parallel scan as complete.
1032 *
1033 * When there are no pages left to scan, this function should be called to
1034 * notify other workers. Otherwise, they might wait forever for the scan to
1035 * advance to the next page.
1036 */
1037void
/*
 * NOTE(review): original lines 1038, 1040, 1042 and 1076 are stripped —
 * the signature (presumably "_bt_parallel_done(IndexScanDesc scan)"), the
 * BTScanOpaque "so" and BTParallelScanDesc "btscan" declarations, and the
 * ConditionVariableBroadcast(&btscan->btps_cv) implied by the final
 * "wake up all the workers" comment.
 */
 1039{
 1041 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 1043 bool status_changed = false;
 1044
 1045 Assert(!BTScanPosIsValid(so->currPos));
 1046
 1047 /* Do nothing, for non-parallel scans */
 1048 if (parallel_scan == NULL)
 1049 return;
 1050
 1051 /*
 1052 * Should not mark parallel scan done when there's still a pending
 1053 * primitive index scan
 1054 */
 1055 if (so->needPrimScan)
 1056 return;
 1057
 1058 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 1059 parallel_scan->ps_offset_am);
 1060
 1061 /*
 1062 * Mark the parallel scan as done, unless some other process did so
 1063 * already
 1064 */
 1065 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 1066 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
 1067 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
 1068 {
 1069 btscan->btps_pageStatus = BTPARALLEL_DONE;
 1070 status_changed = true;
 1071 }
 1072 LWLockRelease(&btscan->btps_lock);
 1073
 1074 /* wake up all the workers associated with this parallel scan */
 1075 if (status_changed)
 1077}
1078
1079/*
1080 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1081 *
1082 * Caller passes the curr_page most recently passed to _bt_parallel_release
1083 * by its backend. Caller successfully schedules the next primitive index scan
1084 * if the shared parallel state hasn't been seized since caller's backend last
1085 * advanced the scan.
1086 */
1087void
/*
 * NOTE(review): original lines 1088, 1091, 1093 and 1109 are stripped —
 * the signature (which per the body takes scan and curr_page), the
 * BTScanOpaque "so" and BTParallelScanDesc "btscan" declarations, and the
 * _bt_parallel_serialize_arrays call announced by the "Serialize scan's
 * current array keys" comment.
 */
 1089{
 1090 Relation rel = scan->indexRelation;
 1092 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 1094
 1095 Assert(so->numArrayKeys);
 1096
 1097 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 1098 parallel_scan->ps_offset_am);
 1099
 1100 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
 1101 if (btscan->btps_lastCurrPage == curr_page &&
 1102 btscan->btps_pageStatus == BTPARALLEL_IDLE)
 1103 {
 1104 btscan->btps_nextScanPage = InvalidBlockNumber;
 1105 btscan->btps_lastCurrPage = InvalidBlockNumber;
 1106 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
 1107
 1108 /* Serialize scan's current array keys */
 1110 }
 1111 LWLockRelease(&btscan->btps_lock);
 1112}
1113
1114/*
1115 * Bulk deletion of all index entries pointing to a set of heap tuples.
1116 * The set of target tuples is specified via a callback routine that tells
1117 * whether any given heap tuple (identified by ItemPointer) is being deleted.
1118 *
1119 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1120 */
/*
 * NOTE(review): extraction dropped the return type / name lines (per the
 * symbol index: IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
 * IndexBulkDeleteResult *stats, ...)), the stats allocation under
 * "if (stats == NULL)", and the PG_ENSURE_ERROR_CLEANUP /
 * PG_END_ENSURE_ERROR_CLEANUP wrappers referenced by the "ENSURE stuff"
 * comment.  Confirm against upstream nbtree.c.
 */
1123 IndexBulkDeleteCallback callback, void *callback_state)
1124{
1125 Relation rel = info->index;
1126 BTCycleId cycleid;
1127
1128 /* allocate stats if first time through, else re-use existing struct */
1129 if (stats == NULL)
1131
1132 /* Establish the vacuum cycle ID to use for this scan */
1133 /* The ENSURE stuff ensures we clean up shared memory on failure */
1135 {
1136 cycleid = _bt_start_vacuum(rel);
1137
1138 btvacuumscan(info, stats, callback, callback_state, cycleid);
1139 }
1141 _bt_end_vacuum(rel);
1142
1143 return stats;
1144}
1145
1146/*
1147 * Post-VACUUM cleanup.
1148 *
1149 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1150 */
1153{
/*
 * NOTE(review): extraction dropped the signature lines (per the symbol
 * index: IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
 * IndexBulkDeleteResult *stats)), the declaration of num_delpages
 * (assigned below), the stats allocation before the cleanup-only
 * btvacuumscan() call, and the _bt_set_cleanup_info() call referenced by
 * the surrounding comments.  Confirm against upstream nbtree.c.
 */
1155
1156 /* No-op in ANALYZE ONLY mode */
1157 if (info->analyze_only)
1158 return stats;
1159
1160 /*
1161 * If btbulkdelete was called, we need not do anything (we just maintain
1162 * the information used within _bt_vacuum_needs_cleanup() by calling
1163 * _bt_set_cleanup_info() below).
1164 *
1165 * If btbulkdelete was _not_ called, then we have a choice to make: we
1166 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1167 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1168 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1169 * now.
1170 */
1171 if (stats == NULL)
1172 {
1173 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1174 if (!_bt_vacuum_needs_cleanup(info->index))
1175 return NULL;
1176
1177 /*
1178 * Since we aren't going to actually delete any leaf items, there's no
1179 * need to go through all the vacuum-cycle-ID pushups here.
1180 *
1181 * Posting list tuples are a source of inaccuracy for cleanup-only
1182 * scans. btvacuumscan() will assume that the number of index tuples
1183 * from each page can be used as num_index_tuples, even though
1184 * num_index_tuples is supposed to represent the number of TIDs in the
1185 * index. This naive approach can underestimate the number of tuples
1186 * in the index significantly.
1187 *
1188 * We handle the problem by making num_index_tuples an estimate in
1189 * cleanup-only case.
1190 */
1192 btvacuumscan(info, stats, NULL, NULL, 0);
1193 stats->estimated_count = true;
1194 }
1195
1196 /*
1197 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1198 *
1199 * num_delpages is the number of deleted pages now in the index that were
1200 * not safe to place in the FSM to be recycled just yet. num_delpages is
1201 * greater than 0 only when _bt_pagedel() actually deleted pages during
1202 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1203 * have failed to place any newly deleted pages in the FSM just moments
1204 * ago. (Actually, there are edge cases where recycling of the current
1205 * VACUUM's newly deleted pages does not even become safe by the time the
1206 * next VACUUM comes around. See nbtree/README.)
1207 */
1208 Assert(stats->pages_deleted >= stats->pages_free);
1209 num_delpages = stats->pages_deleted - stats->pages_free;
1211
1212 /*
1213 * It's quite possible for us to be fooled by concurrent page splits into
1214 * double-counting some index tuples, so disbelieve any total that exceeds
1215 * the underlying heap's count ... if we know that accurately. Otherwise
1216 * this might just make matters worse.
1217 */
1218 if (!info->estimated_count)
1219 {
1220 if (stats->num_index_tuples > info->num_heap_tuples)
1221 stats->num_index_tuples = info->num_heap_tuples;
1222 }
1223
1224 return stats;
1225}
1226
1227/*
1228 * btvacuumscan --- scan the index for VACUUMing purposes
1229 *
1230 * This combines the functions of looking for leaf tuples that are deletable
1231 * according to the vacuum callback, looking for empty pages that can be
1232 * deleted, and looking for old deleted pages that can be recycled. Both
1233 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1234 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1235 *
1236 * The caller is responsible for initially allocating/zeroing a stats struct
1237 * and for obtaining a vacuum cycle ID if necessary.
1238 */
1239static void
/*
 * NOTE(review): extraction dropped many lines from this function: the name
 * line (per the symbol index: btvacuumscan(IndexVacuumInfo *info,
 * IndexBulkDeleteResult *stats, ...)), the declarations of "vstate" and the
 * read-stream private state "p" (both used below), the
 * AllocSetContextCreate() arguments around the "_bt_pagedel" context name,
 * the _bt_pendingfsm_init() call, the needLock assignment, the
 * read_stream_begin_* setup call whose trailing arguments remain visible,
 * the relation-extension lock/unlock pair, and the progress-report calls.
 * Confirm against upstream nbtree.c before relying on this text.
 */
1241 IndexBulkDeleteCallback callback, void *callback_state,
1242 BTCycleId cycleid)
1243{
1244 Relation rel = info->index;
1246 BlockNumber num_pages;
1247 bool needLock;
1249 ReadStream *stream = NULL;
1250
1251 /*
1252 * Reset fields that track information about the entire index now. This
1253 * avoids double-counting in the case where a single VACUUM command
1254 * requires multiple scans of the index.
1255 *
1256 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1257 * since they track information about the VACUUM command, and so must last
1258 * across each call to btvacuumscan().
1259 *
1260 * (Note that pages_free is treated as state about the whole index, not
1261 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1262 * calls are idempotent, and get repeated for the same deleted pages in
1263 * some scenarios. The point for us is to track the number of recyclable
1264 * pages in the index at the end of the VACUUM command.)
1265 */
1266 stats->num_pages = 0;
1267 stats->num_index_tuples = 0;
1268 stats->pages_deleted = 0;
1269 stats->pages_free = 0;
1270
1271 /* Set up info to pass down to btvacuumpage */
1272 vstate.info = info;
1273 vstate.stats = stats;
1274 vstate.callback = callback;
1275 vstate.callback_state = callback_state;
1276 vstate.cycleid = cycleid;
1277
1278 /* Create a temporary memory context to run _bt_pagedel in */
1280 "_bt_pagedel",
1282
1283 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1284 vstate.bufsize = 0;
1285 vstate.maxbufsize = 0;
1286 vstate.pendingpages = NULL;
1287 vstate.npendingpages = 0;
1288 /* Consider applying _bt_pendingfsm_finalize optimization */
1290
1291 /*
1292 * The outer loop iterates over all index pages except the metapage, in
1293 * physical order (we hope the kernel will cooperate in providing
1294 * read-ahead for speed). It is critical that we visit all leaf pages,
1295 * including ones added after we start the scan, else we might fail to
1296 * delete some deletable tuples. Hence, we must repeatedly check the
1297 * relation length. We must acquire the relation-extension lock while
1298 * doing so to avoid a race condition: if someone else is extending the
1299 * relation, there is a window where bufmgr/smgr have created a new
1300 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1301 * we manage to scan such a page here, we'll improperly assume it can be
1302 * recycled. Taking the lock synchronizes things enough to prevent a
1303 * problem: either num_pages won't include the new page, or _bt_getbuf
1304 * already has write lock on the buffer and it will be fully initialized
1305 * before we can examine it. Also, we need not worry if a page is added
1306 * immediately after we look; the page splitting code already has
1307 * write-lock on the left page before it adds a right page, so we must
1308 * already have processed any tuples due to be moved into such a page.
1309 *
1310 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1311 * think the use of the extension lock is still required.
1312 *
1313 * We can skip locking for new or temp relations, however, since no one
1314 * else could be accessing them.
1315 */
1317
1319
1320 /*
1321 * It is safe to use batchmode as block_range_read_stream_cb takes no
1322 * locks.
1323 */
1327 info->strategy,
1328 rel,
1331 &p,
1332 0);
1333 for (;;)
1334 {
1335 /* Get the current relation length */
1336 if (needLock)
1338 num_pages = RelationGetNumberOfBlocks(rel);
1339 if (needLock)
1341
1342 if (info->report_progress)
1344 num_pages);
1345
1346 /* Quit if we've scanned the whole relation */
1347 if (p.current_blocknum >= num_pages)
1348 break;
1349
1350 p.last_exclusive = num_pages;
1351
1352 /* Iterate over pages, then loop back to recheck relation length */
1353 while (true)
1354 {
1355 BlockNumber current_block;
1356 Buffer buf;
1357
1358 /* call vacuum_delay_point while not holding any buffer lock */
1359 vacuum_delay_point(false);
1360
1361 buf = read_stream_next_buffer(stream, NULL);
1362
1363 if (!BufferIsValid(buf))
1364 break;
1365
1366 current_block = btvacuumpage(&vstate, buf);
1367
1368 if (info->report_progress)
1370 current_block);
1371 }
1372
1373 /*
1374 * We have to reset the read stream to use it again. After returning
1375 * InvalidBuffer, the read stream API won't invoke our callback again
1376 * until the stream has been reset.
1377 */
1378 read_stream_reset(stream);
1379 }
1380
1381 read_stream_end(stream);
1382
1383 /* Set statistics num_pages field to final size of index */
1384 stats->num_pages = num_pages;
1385
1386 MemoryContextDelete(vstate.pagedelcontext);
1387
1388 /*
1389 * If there were any calls to _bt_pagedel() during scan of the index then
1390 * see if any of the resulting pages can be placed in the FSM now. When
1391 * it's not safe we'll have to leave it up to a future VACUUM operation.
1392 *
1393 * Finally, if we placed any pages in the FSM (either just now or during
1394 * the scan), forcibly update the upper-level FSM pages to ensure that
1395 * searchers can find them.
1396 */
1398 if (stats->pages_free > 0)
/* NOTE(review): the _bt_pendingfsm_finalize() and IndexFreeSpaceMapVacuum()
 * style calls implied by the comment above were elided by extraction. */
1400}
1401
1402/*
1403 * btvacuumpage --- VACUUM one page
1404 *
1405 * This processes a single page for btvacuumscan(). In some cases we must
1406 * backtrack to re-examine and VACUUM pages that were on buf's page during
1407 * a previous call here. This is how we handle page splits (that happened
1408 * after our cycleid was acquired) whose right half page happened to reuse
1409 * a block that we might have processed at some point before it was
1410 * recycled (i.e. before the page split).
1411 *
1412 * Returns BlockNumber of a scanned page (not backtracked).
1413 */
1414static BlockNumber
/*
 * NOTE(review): extraction dropped several lines from this function: the
 * name line (per the symbol index: btvacuumpage(BTVacState *vstate,
 * Buffer buf)), the declaration of "callback", the declarations/initializers
 * of "scanblkno" and "backtrack_to", the errcode() line inside the ereport,
 * the cleanup-lock upgrade call described by the "Trade in the initial read
 * lock" comment (presumably _bt_upgradelockbufcleanup), the
 * deletable/updatable array declarations, several posting-list bookkeeping
 * lines, and the ReadBufferExtended() call in the backtrack path.  Confirm
 * against upstream nbtree.c before relying on this text.
 */
1416{
1417 IndexVacuumInfo *info = vstate->info;
1418 IndexBulkDeleteResult *stats = vstate->stats;
1420 void *callback_state = vstate->callback_state;
1421 Relation rel = info->index;
1422 Relation heaprel = info->heaprel;
1423 bool attempt_pagedel;
1424 BlockNumber blkno,
1427 Page page;
1428 BTPageOpaque opaque;
1429
1430 blkno = scanblkno;
1431
1432backtrack:
1433
1434 attempt_pagedel = false;
1436
1437 _bt_lockbuf(rel, buf, BT_READ);
1438 page = BufferGetPage(buf);
1439 opaque = NULL;
1440 if (!PageIsNew(page))
1441 {
1442 _bt_checkpage(rel, buf);
1443 opaque = BTPageGetOpaque(page);
1444 }
1445
1446 Assert(blkno <= scanblkno);
1447 if (blkno != scanblkno)
1448 {
1449 /*
1450 * We're backtracking.
1451 *
1452 * We followed a right link to a sibling leaf page (a page that
1453 * happens to be from a block located before scanblkno). The only
1454 * case we want to do anything with is a live leaf page having the
1455 * current vacuum cycle ID.
1456 *
1457 * The page had better be in a state that's consistent with what we
1458 * expect. Check for conditions that imply corruption in passing. It
1459 * can't be half-dead because only an interrupted VACUUM process can
1460 * leave pages in that state, so we'd definitely have dealt with it
1461 * back when the page was the scanblkno page (half-dead pages are
1462 * always marked fully deleted by _bt_pagedel(), barring corruption).
1463 */
1464 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1465 {
1466 Assert(false);
1467 ereport(LOG,
1469 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1470 blkno, scanblkno, RelationGetRelationName(rel))));
1471 _bt_relbuf(rel, buf);
1472 return scanblkno;
1473 }
1474
1475 /*
1476 * We may have already processed the page in an earlier call, when the
1477 * page was scanblkno. This happens when the leaf page split occurred
1478 * after the scan began, but before the right sibling page became the
1479 * scanblkno.
1480 *
1481 * Page may also have been deleted by current btvacuumpage() call,
1482 * since _bt_pagedel() sometimes deletes the right sibling page of
1483 * scanblkno in passing (it does so after we decided where to
1484 * backtrack to). We don't need to process this page as a deleted
1485 * page a second time now (in fact, it would be wrong to count it as a
1486 * deleted page in the bulk delete statistics a second time).
1487 */
1488 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1489 {
1490 /* Done with current scanblkno (and all lower split pages) */
1491 _bt_relbuf(rel, buf);
1492 return scanblkno;
1493 }
1494 }
1495
1496 if (!opaque || BTPageIsRecyclable(page, heaprel))
1497 {
1498 /* Okay to recycle this page (which could be leaf or internal) */
1499 RecordFreeIndexPage(rel, blkno);
1500 stats->pages_deleted++;
1501 stats->pages_free++;
1502 }
1503 else if (P_ISDELETED(opaque))
1504 {
1505 /*
1506 * Already deleted page (which could be leaf or internal). Can't
1507 * recycle yet.
1508 */
1509 stats->pages_deleted++;
1510 }
1511 else if (P_ISHALFDEAD(opaque))
1512 {
1513 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1514 attempt_pagedel = true;
1515
1516 /*
1517 * _bt_pagedel() will increment both pages_newly_deleted and
1518 * pages_deleted stats in all cases (barring corruption)
1519 */
1520 }
1521 else if (P_ISLEAF(opaque))
1522 {
1524 int ndeletable;
1526 int nupdatable;
1527 OffsetNumber offnum,
1528 minoff,
1529 maxoff;
1530 int nhtidsdead,
1531 nhtidslive;
1532
1533 /*
1534 * Trade in the initial read lock for a full cleanup lock on this
1535 * page. We must get such a lock on every leaf page over the course
1536 * of the vacuum scan, whether or not it actually contains any
1537 * deletable tuples --- see nbtree/README.
1538 */
1540
1541 /*
1542 * Check whether we need to backtrack to earlier pages. What we are
1543 * concerned about is a page split that happened since we started the
1544 * vacuum scan. If the split moved tuples on the right half of the
1545 * split (i.e. the tuples that sort high) to a block that we already
1546 * passed over, then we might have missed the tuples. We need to
1547 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1548 * or deleting scanblkno page below!)
1549 */
1550 if (vstate->cycleid != 0 &&
1551 opaque->btpo_cycleid == vstate->cycleid &&
1552 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1553 !P_RIGHTMOST(opaque) &&
1554 opaque->btpo_next < scanblkno)
1555 backtrack_to = opaque->btpo_next;
1556
1557 ndeletable = 0;
1558 nupdatable = 0;
1559 minoff = P_FIRSTDATAKEY(opaque);
1560 maxoff = PageGetMaxOffsetNumber(page);
1561 nhtidsdead = 0;
1562 nhtidslive = 0;
1563 if (callback)
1564 {
1565 /* btbulkdelete callback tells us what to delete (or update) */
1566 for (offnum = minoff;
1567 offnum <= maxoff;
1568 offnum = OffsetNumberNext(offnum))
1569 {
1570 IndexTuple itup;
1571
1572 itup = (IndexTuple) PageGetItem(page,
1573 PageGetItemId(page, offnum));
1574
1575 Assert(!BTreeTupleIsPivot(itup));
1576 if (!BTreeTupleIsPosting(itup))
1577 {
1578 /* Regular tuple, standard table TID representation */
1579 if (callback(&itup->t_tid, callback_state))
1580 {
1581 deletable[ndeletable++] = offnum;
1582 nhtidsdead++;
1583 }
1584 else
1585 nhtidslive++;
1586 }
1587 else
1588 {
1590 int nremaining;
1591
1592 /* Posting list tuple */
1593 vacposting = btreevacuumposting(vstate, itup, offnum,
1594 &nremaining);
1595 if (vacposting == NULL)
1596 {
1597 /*
1598 * All table TIDs from the posting tuple remain, so no
1599 * delete or update required
1600 */
1602 }
1603 else if (nremaining > 0)
1604 {
1605
1606 /*
1607 * Store metadata about posting list tuple in
1608 * updatable array for entire page. Existing tuple
1609 * will be updated during the later call to
1610 * _bt_delitems_vacuum().
1611 */
1613 updatable[nupdatable++] = vacposting;
1615 }
1616 else
1617 {
1618 /*
1619 * All table TIDs from the posting list must be
1620 * deleted. We'll delete the index tuple completely
1621 * (no update required).
1622 */
1623 Assert(nremaining == 0);
1624 deletable[ndeletable++] = offnum;
1627 }
1628
1630 }
1631 }
1632 }
1633
1634 /*
1635 * Apply any needed deletes or updates. We issue just one
1636 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1637 */
1638 if (ndeletable > 0 || nupdatable > 0)
1639 {
1641 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1642 nupdatable);
1643
1644 stats->tuples_removed += nhtidsdead;
1645 /* must recompute maxoff */
1646 maxoff = PageGetMaxOffsetNumber(page);
1647
1648 /* can't leak memory here */
1649 for (int i = 0; i < nupdatable; i++)
1650 pfree(updatable[i]);
1651 }
1652 else
1653 {
1654 /*
1655 * If the leaf page has been split during this vacuum cycle, it
1656 * seems worth expending a write to clear btpo_cycleid even if we
1657 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1658 * takes care of this.) This ensures we won't process the page
1659 * again.
1660 *
1661 * We treat this like a hint-bit update because there's no need to
1662 * WAL-log it.
1663 */
1664 Assert(nhtidsdead == 0);
1665 if (vstate->cycleid != 0 &&
1666 opaque->btpo_cycleid == vstate->cycleid)
1667 {
1668 opaque->btpo_cycleid = 0;
1669 MarkBufferDirtyHint(buf, true);
1670 }
1671 }
1672
1673 /*
1674 * If the leaf page is now empty, try to delete it; else count the
1675 * live tuples (live table TIDs in posting lists are counted as
1676 * separate live tuples). We don't delete when backtracking, though,
1677 * since that would require teaching _bt_pagedel() about backtracking
1678 * (doesn't seem worth adding more complexity to deal with that).
1679 *
1680 * We don't count the number of live TIDs during cleanup-only calls to
1681 * btvacuumscan (i.e. when callback is not set). We count the number
1682 * of index tuples directly instead. This avoids the expense of
1683 * directly examining all of the tuples on each page. VACUUM will
1684 * treat num_index_tuples as an estimate in cleanup-only case, so it
1685 * doesn't matter that this underestimates num_index_tuples
1686 * significantly in some cases.
1687 */
1688 if (minoff > maxoff)
1689 attempt_pagedel = (blkno == scanblkno);
1690 else if (callback)
1691 stats->num_index_tuples += nhtidslive;
1692 else
1693 stats->num_index_tuples += maxoff - minoff + 1;
1694
1696 }
1697
1698 if (attempt_pagedel)
1699 {
1700 MemoryContext oldcontext;
1701
1702 /* Run pagedel in a temp context to avoid memory leakage */
1703 MemoryContextReset(vstate->pagedelcontext);
1704 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1705
1706 /*
1707 * _bt_pagedel maintains the bulk delete stats on our behalf;
1708 * pages_newly_deleted and pages_deleted are likely to be incremented
1709 * during call
1710 */
1711 Assert(blkno == scanblkno);
1712 _bt_pagedel(rel, buf, vstate);
1713
1714 MemoryContextSwitchTo(oldcontext);
1715 /* pagedel released buffer, so we shouldn't */
1716 }
1717 else
1718 _bt_relbuf(rel, buf);
1719
1720 if (backtrack_to != P_NONE)
1721 {
1722 blkno = backtrack_to;
1723
1724 /* check for vacuum delay while not holding any buffer lock */
1725 vacuum_delay_point(false);
1726
1727 /*
1728 * We can't use _bt_getbuf() here because it always applies
1729 * _bt_checkpage(), which will barf on an all-zero page. We want to
1730 * recycle all-zero pages, not fail. Also, we want to use a
1731 * nondefault buffer access strategy.
1732 */
1734 info->strategy);
1735 goto backtrack;
1736 }
1737
1738 return scanblkno;
1739}
1740
1741/*
1742 * btreevacuumposting --- determine TIDs still needed in posting list
1743 *
1744 * Returns metadata describing how to build replacement tuple without the TIDs
1745 * that VACUUM needs to delete. Returned value is NULL in the common case
1746 * where no changes are needed to caller's posting list tuple (we avoid
1747 * allocating memory here as an optimization).
1748 *
1749 * The number of TIDs that should remain in the posting list tuple is set for
1750 * caller in *nremaining.
1751 */
1752static BTVacuumPosting
/*
 * NOTE(review): extraction dropped the name line (per the symbol index:
 * btreevacuumposting(BTVacState *vstate, IndexTuple posting,
 * OffsetNumber updatedoffset, int *nremaining)), the declarations of
 * "items" (presumably BTreeTupleGetPosting(posting)) and "vacposting"
 * (initialized NULL, per the "else if (vacposting == NULL)" test), and the
 * palloc() whose trailing size argument remains visible below.  Confirm
 * against upstream nbtree.c.
 */
1754 OffsetNumber updatedoffset, int *nremaining)
1755{
1756 int live = 0;
1757 int nitem = BTreeTupleGetNPosting(posting);
1760
1761 for (int i = 0; i < nitem; i++)
1762 {
1763 if (!vstate->callback(items + i, vstate->callback_state))
1764 {
1765 /* Live table TID */
1766 live++;
1767 }
1768 else if (vacposting == NULL)
1769 {
1770 /*
1771 * First dead table TID encountered.
1772 *
1773 * It's now clear that we need to delete one or more dead table
1774 * TIDs, so start maintaining metadata describing how to update
1775 * existing posting list tuple.
1776 */
1778 nitem * sizeof(uint16));
1779
1780 vacposting->itup = posting;
1781 vacposting->updatedoffset = updatedoffset;
1782 vacposting->ndeletedtids = 0;
1783 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1784 }
1785 else
1786 {
1787 /* Second or subsequent dead table TID */
1788 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1789 }
1790 }
1791
1792 *nremaining = live;
1793 return vacposting;
1794}
1795
1796/*
1797 * btcanreturn() -- Check whether btree indexes support index-only scans.
1798 *
1799 * btrees always do, so this is trivial.
1800 */
1801bool
1803{
1804 return true;
1805}
1806
1807/*
1808 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1809 */
1810int
1812{
1813 return _bt_getrootheight(rel);
1814}
1815
1818{
1819 switch (strategy)
1820 {
1822 return COMPARE_LT;
1824 return COMPARE_LE;
1826 return COMPARE_EQ;
1828 return COMPARE_GE;
1830 return COMPARE_GT;
1831 default:
1832 return COMPARE_INVALID;
1833 }
1834}
1835
1838{
1839 switch (cmptype)
1840 {
1841 case COMPARE_LT:
1842 return BTLessStrategyNumber;
1843 case COMPARE_LE:
1845 case COMPARE_EQ:
1846 return BTEqualStrategyNumber;
1847 case COMPARE_GE:
1849 case COMPARE_GT:
1851 default:
1852 return InvalidStrategy;
1853 }
1854}
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static Datum values[MAXATTR]
Definition bootstrap.c:190
int Buffer
Definition buf.h:23
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5670
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4446
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5821
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:309
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
@ RBM_NORMAL
Definition bufmgr.h:46
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:268
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:378
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:396
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
#define OffsetToPointer(base, offset)
Definition c.h:855
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
#define FLEXIBLE_ARRAY_MEMBER
Definition c.h:558
int16_t int16
Definition c.h:619
uint16_t uint16
Definition c.h:623
size_t Size
Definition c.h:689
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
CompareType
Definition cmptype.h:32
@ COMPARE_LE
Definition cmptype.h:35
@ COMPARE_INVALID
Definition cmptype.h:33
@ COMPARE_GT
Definition cmptype.h:38
@ COMPARE_EQ
Definition cmptype.h:36
@ COMPARE_GE
Definition cmptype.h:37
@ COMPARE_LT
Definition cmptype.h:34
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumRestore(char **start_address, bool *isnull)
Definition datum.c:559
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition datum.c:497
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition datum.c:450
int errcode(int sqlerrcode)
Definition elog.c:875
#define LOG
Definition elog.h:32
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define ereport(elevel,...)
Definition elog.h:152
#define palloc_object(type)
Definition fe_memutils.h:89
#define palloc_array(type, count)
Definition fe_memutils.h:91
#define palloc0_object(type)
Definition fe_memutils.h:90
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition genam.h:95
IndexUniqueCheck
Definition genam.h:124
void IndexFreeSpaceMapVacuum(Relation rel)
Definition indexfsm.c:71
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition indexfsm.c:52
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition indextuple.c:44
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:52
int i
Definition isn.c:77
IndexTupleData * IndexTuple
Definition itup.h:53
#define MaxIndexTuplesPerPage
Definition itup.h:181
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670
@ LW_EXCLUSIVE
Definition lwlock.h:104
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:406
Size add_size(Size s1, Size s2)
Definition mcxt.c:1733
void pfree(void *pointer)
Definition mcxt.c:1619
void * palloc(Size size)
Definition mcxt.c:1390
MemoryContext CurrentMemoryContext
Definition mcxt.c:161
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:475
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:105
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1044
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:680
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition nbtpage.c:1832
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition nbtpage.c:1182
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:802
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition nbtpage.c:233
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition nbtpage.c:1137
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition nbtpage.c:180
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition nbtpage.c:3033
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1067
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition nbtpage.c:2991
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition nbtree.c:1088
bool btcanreturn(Relation index, int attno)
Definition nbtree.c:1802
BTPS_State
Definition nbtree.c:57
@ BTPARALLEL_ADVANCING
Definition nbtree.c:60
@ BTPARALLEL_NEED_PRIMSCAN
Definition nbtree.c:59
@ BTPARALLEL_NOT_INITIALIZED
Definition nbtree.c:58
@ BTPARALLEL_IDLE
Definition nbtree.c:61
@ BTPARALLEL_DONE
Definition nbtree.c:62
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:873
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition nbtree.c:1837
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:339
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition nbtree.c:656
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition nbtree.c:1415
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:575
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1038
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition nbtree.c:1152
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition nbtree.c:1753
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition nbtree.c:1817
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition nbtree.c:230
void btparallelrescan(IndexScanDesc scan)
Definition nbtree.c:830
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition nbtree.c:206
void btbuildempty(Relation index)
Definition nbtree.c:183
int btgettreeheight(Relation rel)
Definition nbtree.c:1811
void btinitparallelscan(void *target)
Definition nbtree.c:814
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition nbtree.c:1122
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition nbtree.c:1240
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:720
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition nbtree.c:291
void btmarkpos(IndexScanDesc scan)
Definition nbtree.c:491
void btendscan(IndexScanDesc scan)
Definition nbtree.c:455
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition nbtree.c:388
struct BTParallelScanDescData * BTParallelScanDesc
Definition nbtree.c:95
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1011
Datum bthandler(PG_FUNCTION_ARGS)
Definition nbtree.c:118
void btrestrpos(IndexScanDesc scan)
Definition nbtree.c:517
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:763
#define P_ISHALFDEAD(opaque)
Definition nbtree.h:225
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define SK_BT_SKIP
Definition nbtree.h:1106
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_ISDELETED(opaque)
Definition nbtree.h:223
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538
#define BTNProcs
Definition nbtree.h:723
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
uint16 BTCycleId
Definition nbtree.h:30
#define P_NONE
Definition nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define BTREE_METAPAGE
Definition nbtree.h:149
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
#define BTMaxItemSize
Definition nbtree.h:165
#define BTP_SPLIT_END
Definition nbtree.h:82
#define BTOPTIONS_PROC
Definition nbtree.h:721
#define SK_BT_MINVAL
Definition nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:883
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition nbtsort.c:299
void _bt_end_vacuum(Relation rel)
Definition nbtutils.c:530
void _bt_end_vacuum_callback(int code, Datum arg)
Definition nbtutils.c:558
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:191
char * btbuildphasename(int64 phasenum)
Definition nbtutils.c:644
bytea * btoptions(Datum reloptions, bool validate)
Definition nbtutils.c:598
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition nbtutils.c:621
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1175
BTCycleId _bt_start_vacuum(Relation rel)
Definition nbtutils.c:473
bool btvalidate(Oid opclassoid)
Definition nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:138
int16 attnum
static char buf[DEFAULT_XLOG_SEG_SIZE]
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define PointerGetDatum(X)
Definition postgres.h:354
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:151
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:150
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition rel.h:659
#define RelationGetDescr(relation)
Definition rel.h:542
#define RelationGetRelationName(relation)
Definition rel.h:550
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:535
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition selfuncs.c:7703
#define SK_SEARCHNULL
Definition skey.h:121
ScanKeyData * ScanKey
Definition skey.h:75
#define SK_ISNULL
Definition skey.h:115
#define IsMVCCLikeSnapshot(snapshot)
Definition snapmgr.h:74
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTMaxStrategyNumber
Definition stratnum.h:35
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
Datum * elem_values
Definition nbtree.h:1041
BlockNumber btpo_next
Definition nbtree.h:66
uint16 btpo_flags
Definition nbtree.h:68
BTCycleId btpo_cycleid
Definition nbtree.h:69
BTPS_State btps_pageStatus
Definition nbtree.c:74
BlockNumber btps_lastCurrPage
Definition nbtree.c:72
ConditionVariable btps_cv
Definition nbtree.c:78
BlockNumber btps_nextScanPage
Definition nbtree.c:71
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition nbtree.c:85
NodeTag type
Definition amapi.h:235
BlockNumber pages_deleted
Definition genam.h:90
BlockNumber pages_free
Definition genam.h:91
BlockNumber num_pages
Definition genam.h:85
double tuples_removed
Definition genam.h:88
double num_index_tuples
Definition genam.h:87
struct ScanKeyData * keyData
Definition relscan.h:154
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:204
bool kill_prior_tuple
Definition relscan.h:160
struct TupleDescData * xs_itupdesc
Definition relscan.h:181
Relation indexRelation
Definition relscan.h:150
ItemPointerData xs_heaptid
Definition relscan.h:185
struct SnapshotData * xs_snapshot
Definition relscan.h:151
Relation heapRelation
Definition relscan.h:149
ItemPointerData t_tid
Definition itup.h:37
Relation index
Definition genam.h:54
double num_heap_tuples
Definition genam.h:60
bool analyze_only
Definition genam.h:56
BufferAccessStrategy strategy
Definition genam.h:61
Relation heaprel
Definition genam.h:55
bool report_progress
Definition genam.h:57
bool estimated_count
Definition genam.h:58
TupleDesc rd_att
Definition rel.h:112
Definition type.h:97
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
static ItemArray items
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition tidbitmap.c:367
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:195
void vacuum_delay_point(bool is_analyze)
Definition vacuum.c:2438
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition vacuum.h:47
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition vacuum.h:54