PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtree.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "storage/read_stream.h"
34#include "utils/datum.h"
35#include "utils/fmgrprotos.h"
37#include "utils/memutils.h"
38#include "utils/wait_event.h"
39
40
/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_NEED_PRIMSCAN,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE,
} BTPS_State;
64/*
65 * BTParallelScanDescData contains btree specific shared information required
66 * for parallel scan.
67 */
69{
70 BlockNumber btps_nextScanPage; /* next page to be scanned */
71 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
72 * btps_nextScanPage */
73 BTPS_State btps_pageStatus; /* indicates whether next page is
74 * available for scan. see above for
75 * possible states of parallel scan. */
76 LWLock btps_lock; /* protects shared parallel state */
77 ConditionVariable btps_cv; /* used to synchronize parallel scan */
78
79 /*
80 * btps_arrElems is used when scans need to schedule another primitive
81 * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
82 * offsets for each = scan key associated with a ScalarArrayOp array.
83 */
85
86 /*
87 * Additional space (at the end of the struct) is used when scans need to
88 * schedule another primitive index scan with one or more skip arrays.
89 * Holds a flattened datum representation for each = scan key associated
90 * with a skip array.
91 */
93
95
96
97static bool _bt_start_prim_scan(IndexScanDesc scan);
102static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
103 IndexBulkDeleteCallback callback, void *callback_state,
104 BTCycleId cycleid);
108 OffsetNumber updatedoffset,
109 int *nremaining);
110
111
112/*
113 * Btree handler function: return IndexAmRoutine with access method parameters
114 * and callbacks.
115 */
116Datum
118{
119 static const IndexAmRoutine amroutine = {
121 .amstrategies = BTMaxStrategyNumber,
122 .amsupport = BTNProcs,
123 .amoptsprocnum = BTOPTIONS_PROC,
124 .amcanorder = true,
125 .amcanorderbyop = false,
126 .amcanhash = false,
127 .amconsistentequality = true,
128 .amconsistentordering = true,
129 .amcanbackward = true,
130 .amcanunique = true,
131 .amcanmulticol = true,
132 .amoptionalkey = true,
133 .amsearcharray = true,
134 .amsearchnulls = true,
135 .amstorage = false,
136 .amclusterable = true,
137 .ampredlocks = true,
138 .amcanparallel = true,
139 .amcanbuildparallel = true,
140 .amcaninclude = true,
141 .amusemaintenanceworkmem = false,
142 .amsummarizing = false,
143 .amparallelvacuumoptions =
145 .amkeytype = InvalidOid,
146
147 .ambuild = btbuild,
148 .ambuildempty = btbuildempty,
149 .aminsert = btinsert,
150 .aminsertcleanup = NULL,
151 .ambulkdelete = btbulkdelete,
152 .amvacuumcleanup = btvacuumcleanup,
153 .amcanreturn = btcanreturn,
154 .amcostestimate = btcostestimate,
155 .amgettreeheight = btgettreeheight,
156 .amoptions = btoptions,
157 .amproperty = btproperty,
158 .ambuildphasename = btbuildphasename,
159 .amvalidate = btvalidate,
160 .amadjustmembers = btadjustmembers,
161 .ambeginscan = btbeginscan,
162 .amrescan = btrescan,
163 .amgettuple = btgettuple,
164 .amgetbitmap = btgetbitmap,
165 .amendscan = btendscan,
166 .ammarkpos = btmarkpos,
167 .amrestrpos = btrestrpos,
168 .amestimateparallelscan = btestimateparallelscan,
169 .aminitparallelscan = btinitparallelscan,
170 .amparallelrescan = btparallelrescan,
171 .amtranslatestrategy = bttranslatestrategy,
172 .amtranslatecmptype = bttranslatecmptype,
173 };
174
176}
177
178/*
179 * btbuildempty() -- build an empty btree index in the initialization fork
180 */
181void
183{
184 bool allequalimage = _bt_allequalimage(index, false);
185 BulkWriteState *bulkstate;
187
189
190 /* Construct metapage. */
191 metabuf = smgr_bulk_get_buf(bulkstate);
192 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
193 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
194
195 smgr_bulk_finish(bulkstate);
196}
197
198/*
199 * btinsert() -- insert an index tuple into a btree.
200 *
201 * Descend the tree recursively, find the appropriate location for our
202 * new tuple, and put it there.
203 */
204bool
205btinsert(Relation rel, Datum *values, bool *isnull,
208 bool indexUnchanged,
209 IndexInfo *indexInfo)
210{
211 bool result;
212 IndexTuple itup;
213
214 /* generate an index tuple */
215 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
216 itup->t_tid = *ht_ctid;
217
218 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
219
220 pfree(itup);
221
222 return result;
223}
224
225/*
226 * btgettuple() -- Get the next tuple in the scan.
227 */
228bool
230{
232 bool res;
233
234 Assert(scan->heapRelation != NULL);
235
236 /* btree indexes are never lossy */
237 scan->xs_recheck = false;
238
239 /* Each loop iteration performs another primitive index scan */
240 do
241 {
242 /*
243 * If we've already initialized this scan, we can just advance it in
244 * the appropriate direction. If we haven't done so yet, we call
245 * _bt_first() to get the first item in the scan.
246 */
247 if (!BTScanPosIsValid(so->currPos))
248 res = _bt_first(scan, dir);
249 else
250 {
251 /*
252 * Check to see if we should kill the previously-fetched tuple.
253 */
254 if (scan->kill_prior_tuple)
255 {
256 /*
257 * Yes, remember it for later. (We'll deal with all such
258 * tuples at once right before leaving the index page.) The
259 * test for numKilled overrun is not just paranoia: if the
260 * caller reverses direction in the indexscan then the same
261 * item might get entered multiple times. It's not worth
262 * trying to optimize that, so we don't detect it, but instead
263 * just forget any excess entries.
264 */
265 if (so->killedItems == NULL)
266 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
267 if (so->numKilled < MaxTIDsPerBTreePage)
268 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269 }
270
271 /*
272 * Now continue the scan.
273 */
274 res = _bt_next(scan, dir);
275 }
276
277 /* If we have a tuple, return it ... */
278 if (res)
279 break;
280 /* ... otherwise see if we need another primitive index scan */
281 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
282
283 return res;
284}
285
286/*
287 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288 */
289int64
291{
293 int64 ntids = 0;
294 ItemPointer heapTid;
295
296 Assert(scan->heapRelation == NULL);
297
298 /* Each loop iteration performs another primitive index scan */
299 do
300 {
301 /* Fetch the first page & tuple */
303 {
304 /* Save tuple ID, and continue scanning */
305 heapTid = &scan->xs_heaptid;
306 tbm_add_tuples(tbm, heapTid, 1, false);
307 ntids++;
308
309 for (;;)
310 {
311 /*
312 * Advance to next tuple within page. This is the same as the
313 * easy case in _bt_next().
314 */
315 if (++so->currPos.itemIndex > so->currPos.lastItem)
316 {
317 /* let _bt_next do the heavy lifting */
318 if (!_bt_next(scan, ForwardScanDirection))
319 break;
320 }
321
322 /* Save tuple ID, and continue scanning */
323 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
324 tbm_add_tuples(tbm, heapTid, 1, false);
325 ntids++;
326 }
327 }
328 /* Now see if we need another primitive index scan */
329 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
330
331 return ntids;
332}
333
334/*
335 * btbeginscan() -- start a scan on a btree index
336 */
338btbeginscan(Relation rel, int nkeys, int norderbys)
339{
340 IndexScanDesc scan;
342
343 /* no order by operators allowed */
344 Assert(norderbys == 0);
345
346 /* get the scan */
347 scan = RelationGetIndexScan(rel, nkeys, norderbys);
348
349 /* allocate private workspace */
351 BTScanPosInvalidate(so->currPos);
352 BTScanPosInvalidate(so->markPos);
353 if (scan->numberOfKeys > 0)
354 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
355 else
356 so->keyData = NULL;
357
358 so->skipScan = false;
359 so->needPrimScan = false;
360 so->scanBehind = false;
361 so->oppositeDirCheck = false;
362 so->arrayKeys = NULL;
363 so->orderProcs = NULL;
364 so->arrayContext = NULL;
365
366 so->killedItems = NULL; /* until needed */
367 so->numKilled = 0;
368
369 /*
370 * We don't know yet whether the scan will be index-only, so we do not
371 * allocate the tuple workspace arrays until btrescan. However, we set up
372 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
373 */
374 so->currTuples = so->markTuples = NULL;
375
376 scan->xs_itupdesc = RelationGetDescr(rel);
377
378 scan->opaque = so;
379
380 return scan;
381}
382
383/*
384 * btrescan() -- rescan an index relation
385 */
386void
388 ScanKey orderbys, int norderbys)
389{
391
392 /* we aren't holding any read locks, but gotta drop the pins */
393 if (BTScanPosIsValid(so->currPos))
394 {
395 /* Before leaving current page, deal with any killed items */
396 if (so->numKilled > 0)
397 _bt_killitems(scan);
398 BTScanPosUnpinIfPinned(so->currPos);
399 BTScanPosInvalidate(so->currPos);
400 }
401
402 /*
403 * We prefer to eagerly drop leaf page pins before btgettuple returns.
404 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
405 *
406 * We cannot safely drop leaf page pins during index-only scans due to a
407 * race condition involving VACUUM setting pages all-visible in the VM.
408 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
409 *
410 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
411 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
412 * The scheme used to detect unsafe TID recycling won't work when scanning
413 * unlogged relations (since it involves saving an affected page's LSN).
414 * Opt out of eager pin dropping during unlogged relation scans for now
415 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
416 *
417 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
418 * Pins cannot be held for more than an instant during bitmap scans either
419 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
420 *
421 * See nbtree/README section on making concurrent TID recycling safe.
422 *
423 * Note: so->dropPin should never change across rescans.
424 */
425 so->dropPin = (!scan->xs_want_itup &&
428 scan->heapRelation != NULL);
429
430 so->markItemIndex = -1;
431 so->needPrimScan = false;
432 so->scanBehind = false;
433 so->oppositeDirCheck = false;
434 BTScanPosUnpinIfPinned(so->markPos);
435 BTScanPosInvalidate(so->markPos);
436
437 /*
438 * Allocate tuple workspace arrays, if needed for an index-only scan and
439 * not already done in a previous rescan call. To save on palloc
440 * overhead, both workspaces are allocated as one palloc block; only this
441 * function and btendscan know that.
442 */
443 if (scan->xs_want_itup && so->currTuples == NULL)
444 {
445 so->currTuples = (char *) palloc(BLCKSZ * 2);
446 so->markTuples = so->currTuples + BLCKSZ;
447 }
448
449 /*
450 * Reset the scan keys
451 */
452 if (scankey && scan->numberOfKeys > 0)
453 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
454 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
455 so->numArrayKeys = 0; /* ditto */
456}
457
458/*
459 * btendscan() -- close down a scan
460 */
461void
463{
465
466 /* we aren't holding any read locks, but gotta drop the pins */
467 if (BTScanPosIsValid(so->currPos))
468 {
469 /* Before leaving current page, deal with any killed items */
470 if (so->numKilled > 0)
471 _bt_killitems(scan);
472 BTScanPosUnpinIfPinned(so->currPos);
473 }
474
475 so->markItemIndex = -1;
476 BTScanPosUnpinIfPinned(so->markPos);
477
478 /* No need to invalidate positions, the RAM is about to be freed. */
479
480 /* Release storage */
481 if (so->keyData != NULL)
482 pfree(so->keyData);
483 /* so->arrayKeys and so->orderProcs are in arrayContext */
484 if (so->arrayContext != NULL)
485 MemoryContextDelete(so->arrayContext);
486 if (so->killedItems != NULL)
487 pfree(so->killedItems);
488 if (so->currTuples != NULL)
489 pfree(so->currTuples);
490 /* so->markTuples should not be pfree'd, see btrescan */
491 pfree(so);
492}
493
494/*
495 * btmarkpos() -- save current scan position
496 */
497void
499{
501
502 /* There may be an old mark with a pin (but no lock). */
503 BTScanPosUnpinIfPinned(so->markPos);
504
505 /*
506 * Just record the current itemIndex. If we later step to next page
507 * before releasing the marked position, _bt_steppage makes a full copy of
508 * the currPos struct in markPos. If (as often happens) the mark is moved
509 * before we leave the page, we don't have to do that work.
510 */
511 if (BTScanPosIsValid(so->currPos))
512 so->markItemIndex = so->currPos.itemIndex;
513 else
514 {
515 BTScanPosInvalidate(so->markPos);
516 so->markItemIndex = -1;
517 }
518}
519
520/*
521 * btrestrpos() -- restore scan to last saved position
522 */
523void
525{
527
528 if (so->markItemIndex >= 0)
529 {
530 /*
531 * The scan has never moved to a new page since the last mark. Just
532 * restore the itemIndex.
533 *
534 * NB: In this case we can't count on anything in so->markPos to be
535 * accurate.
536 */
537 so->currPos.itemIndex = so->markItemIndex;
538 }
539 else
540 {
541 /*
542 * The scan moved to a new page after last mark or restore, and we are
543 * now restoring to the marked page. We aren't holding any read
544 * locks, but if we're still holding the pin for the current position,
545 * we must drop it.
546 */
547 if (BTScanPosIsValid(so->currPos))
548 {
549 /* Before leaving current page, deal with any killed items */
550 if (so->numKilled > 0)
551 _bt_killitems(scan);
552 BTScanPosUnpinIfPinned(so->currPos);
553 }
554
555 if (BTScanPosIsValid(so->markPos))
556 {
557 /* bump pin on mark buffer for assignment to current buffer */
558 if (BTScanPosIsPinned(so->markPos))
559 IncrBufferRefCount(so->markPos.buf);
560 memcpy(&so->currPos, &so->markPos,
562 so->markPos.lastItem * sizeof(BTScanPosItem));
563 if (so->currTuples)
564 memcpy(so->currTuples, so->markTuples,
565 so->markPos.nextTupleOffset);
566 /* Reset the scan's array keys (see _bt_steppage for why) */
567 if (so->numArrayKeys)
568 {
569 _bt_start_array_keys(scan, so->currPos.dir);
570 so->needPrimScan = false;
571 }
572 }
573 else
574 BTScanPosInvalidate(so->currPos);
575 }
576}
577
578/*
579 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
580 */
581Size
582btestimateparallelscan(Relation rel, int nkeys, int norderbys)
583{
587
588 /*
589 * Pessimistically assume that every input scan key will be output with
590 * its own SAOP array
591 */
593 sizeof(int) * nkeys;
594
595 /* Single column indexes cannot possibly use a skip array */
596 if (nkeyatts == 1)
597 return estnbtreeshared;
598
599 /*
600 * Pessimistically assume that all attributes prior to the least
601 * significant attribute require a skip array (and an associated key)
602 */
603 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
604 sizeof(Datum));
605 for (int attnum = 1; attnum < nkeyatts; attnum++)
606 {
607 CompactAttribute *attr;
608
609 /*
610 * We make the conservative assumption that every index column will
611 * also require a skip array.
612 *
613 * Every skip array must have space to store its scan key's sk_flags.
614 */
616
617 /* Consider space required to store a datum of opclass input type */
618 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
619 if (attr->attbyval)
620 {
621 /* This index attribute stores pass-by-value datums */
623 true, attr->attlen);
624
626 continue;
627 }
628
629 /*
630 * This index attribute stores pass-by-reference datums.
631 *
632 * Assume that serializing this array will use just as much space as a
633 * pass-by-value datum, in addition to space for the largest possible
634 * whole index tuple (this is not just a per-datum portion of the
635 * largest possible tuple because that'd be almost as large anyway).
636 *
637 * This is quite conservative, but it's not clear how we could do much
638 * better. The executor requires an up-front storage request size
639 * that reliably covers the scan's high watermark memory usage. We
640 * can't be sure of the real high watermark until the scan is over.
641 */
644 }
645
646 return estnbtreeshared;
647}
648
649/*
650 * _bt_start_prim_scan() -- start scheduled primitive index scan?
651 *
652 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
653 * as the last one ended. Otherwise returns false, indicating that the array
654 * keys are now fully exhausted.
655 *
656 * Only call here during scans with one or more equality type array scan keys,
657 * after _bt_first or _bt_next return false.
658 */
659static bool
661{
663
664 Assert(so->numArrayKeys);
665
666 so->scanBehind = so->oppositeDirCheck = false; /* reset */
667
668 /*
669 * Array keys are advanced within _bt_checkkeys when the scan reaches the
670 * leaf level (more precisely, they're advanced when the scan reaches the
671 * end of each distinct set of array elements). This process avoids
672 * repeat access to leaf pages (across multiple primitive index scans) by
673 * advancing the scan's array keys when it allows the primitive index scan
674 * to find nearby matching tuples (or when it eliminates ranges of array
675 * key space that can't possibly be satisfied by any index tuple).
676 *
677 * _bt_checkkeys sets a simple flag variable to schedule another primitive
678 * index scan. The flag tells us what to do.
679 *
680 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
681 * various cases where that won't happen. For example, if the index is
682 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
683 * We also don't expect a call to _bt_checkkeys during searches for a
684 * non-existent value that happens to be lower/higher than any existing
685 * value in the index.
686 *
687 * We don't require special handling for these cases -- we don't need to
688 * be explicitly instructed to _not_ perform another primitive index scan.
689 * It's up to code under the control of _bt_first to always set the flag
690 * when another primitive index scan will be required.
691 *
692 * This works correctly, even with the tricky cases listed above, which
693 * all involve access to leaf pages "near the boundaries of the key space"
694 * (whether it's from a leftmost/rightmost page, or an imaginary empty
695 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
696 * index scan for one set of array keys, then it also won't be reached for
697 * any later set ("later" in terms of the direction that we scan the index
698 * and advance the arrays). The array keys won't have advanced in these
699 * cases, but that's the correct behavior (even _bt_advance_array_keys
700 * won't always advance the arrays at the point they become "exhausted").
701 */
702 if (so->needPrimScan)
703 {
704 /*
705 * Flag was set -- must call _bt_first again, which will reset the
706 * scan's needPrimScan flag
707 */
708 return true;
709 }
710
711 /* The top-level index scan ran out of tuples in this scan direction */
712 if (scan->parallel_scan != NULL)
713 _bt_parallel_done(scan);
714
715 return false;
716}
717
718/*
719 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
720 *
721 * Caller must have exclusively locked btscan->btps_lock when called.
722 */
723static void
726{
727 char *datumshared;
728
729 /* Space for serialized datums begins immediately after btps_arrElems[] */
730 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
731 for (int i = 0; i < so->numArrayKeys; i++)
732 {
733 BTArrayKeyInfo *array = &so->arrayKeys[i];
734 ScanKey skey = &so->keyData[array->scan_key];
735
736 if (array->num_elems != -1)
737 {
738 /* Save SAOP array's cur_elem (no need to copy key/datum) */
739 Assert(!(skey->sk_flags & SK_BT_SKIP));
740 btscan->btps_arrElems[i] = array->cur_elem;
741 continue;
742 }
743
744 /* Save all mutable state associated with skip array's key */
745 Assert(skey->sk_flags & SK_BT_SKIP);
746 memcpy(datumshared, &skey->sk_flags, sizeof(int));
747 datumshared += sizeof(int);
748
749 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
750 {
751 /* No sk_argument datum to serialize */
752 Assert(skey->sk_argument == 0);
753 continue;
754 }
755
756 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
757 array->attbyval, array->attlen, &datumshared);
758 }
759}
760
761/*
762 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
763 *
764 * Caller must have exclusively locked btscan->btps_lock when called.
765 */
766static void
769{
770 char *datumshared;
771
772 /* Space for serialized datums begins immediately after btps_arrElems[] */
773 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
774 for (int i = 0; i < so->numArrayKeys; i++)
775 {
776 BTArrayKeyInfo *array = &so->arrayKeys[i];
777 ScanKey skey = &so->keyData[array->scan_key];
778 bool isnull;
779
780 if (array->num_elems != -1)
781 {
782 /* Restore SAOP array using its saved cur_elem */
783 Assert(!(skey->sk_flags & SK_BT_SKIP));
784 array->cur_elem = btscan->btps_arrElems[i];
785 skey->sk_argument = array->elem_values[array->cur_elem];
786 continue;
787 }
788
789 /* Restore skip array by restoring its key directly */
790 if (!array->attbyval && skey->sk_argument)
791 pfree(DatumGetPointer(skey->sk_argument));
792 skey->sk_argument = (Datum) 0;
793 memcpy(&skey->sk_flags, datumshared, sizeof(int));
794 datumshared += sizeof(int);
795
796 Assert(skey->sk_flags & SK_BT_SKIP);
797
798 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
799 {
800 /* No sk_argument datum to restore */
801 continue;
802 }
803
804 skey->sk_argument = datumRestore(&datumshared, &isnull);
805 if (isnull)
806 {
807 Assert(skey->sk_argument == 0);
808 Assert(skey->sk_flags & SK_SEARCHNULL);
809 Assert(skey->sk_flags & SK_ISNULL);
810 }
811 }
812}
813
814/*
815 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
816 */
817void
819{
821
822 LWLockInitialize(&bt_target->btps_lock,
824 bt_target->btps_nextScanPage = InvalidBlockNumber;
825 bt_target->btps_lastCurrPage = InvalidBlockNumber;
826 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
828}
829
830/*
831 * btparallelrescan() -- reset parallel scan
832 */
833void
835{
837 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
838
839 Assert(parallel_scan);
840
841 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
842 parallel_scan->ps_offset_am);
843
844 /*
845 * In theory, we don't need to acquire the LWLock here, because there
846 * shouldn't be any other workers running at this point, but we do so for
847 * consistency.
848 */
849 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
850 btscan->btps_nextScanPage = InvalidBlockNumber;
851 btscan->btps_lastCurrPage = InvalidBlockNumber;
852 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
853 LWLockRelease(&btscan->btps_lock);
854}
855
856/*
857 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
858 * page. Other scans must wait until we call _bt_parallel_release()
859 * or _bt_parallel_done().
860 *
861 * The return value is true if we successfully seized the scan and false
862 * if we did not. The latter case occurs when no pages remain, or when
863 * another primitive index scan is scheduled that caller's backend cannot
864 * start just yet (only backends that call from _bt_first are capable of
865 * starting primitive index scans, which they indicate by passing first=true).
866 *
867 * If the return value is true, *next_scan_page returns the next page of the
868 * scan, and *last_curr_page returns the page that *next_scan_page came from.
869 * An invalid *next_scan_page means the scan hasn't yet started, or that
870 * caller needs to start the next primitive index scan (if it's the latter
871 * case we'll set so.needPrimScan).
872 *
873 * Callers should ignore the value of *next_scan_page and *last_curr_page if
874 * the return value is false.
875 */
876bool
878 BlockNumber *last_curr_page, bool first)
879{
880 Relation rel = scan->indexRelation;
882 bool exit_loop = false,
883 status = true,
884 endscan = false;
885 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
887
890
891 /*
892 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
893 * call to _bt_readnextpage treats this backend similarly to a serial
894 * backend that steps from *last_curr_page to *next_scan_page (unless this
895 * backend's so->currPos is initialized by _bt_readfirstpage before then).
896 */
897 BTScanPosInvalidate(so->currPos);
898 so->currPos.moreLeft = so->currPos.moreRight = true;
899
900 if (first)
901 {
902 /*
903 * Initialize array related state when called from _bt_first, assuming
904 * that this will be the first primitive index scan for the scan
905 */
906 so->needPrimScan = false;
907 so->scanBehind = false;
908 so->oppositeDirCheck = false;
909 }
910 else
911 {
912 /*
913 * Don't attempt to seize the scan when it requires another primitive
914 * index scan, since caller's backend cannot start it right now
915 */
916 if (so->needPrimScan)
917 return false;
918 }
919
920 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
921 parallel_scan->ps_offset_am);
922
923 while (1)
924 {
925 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
926
927 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
928 {
929 /* We're done with this parallel index scan */
930 status = false;
931 }
932 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
933 btscan->btps_nextScanPage == P_NONE)
934 {
935 /* End this parallel index scan */
936 status = false;
937 endscan = true;
938 }
939 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
940 {
941 Assert(so->numArrayKeys);
942
943 if (first)
944 {
945 /* Can start scheduled primitive scan right away, so do so */
946 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
947
948 /* Restore scan's array keys from serialized values */
950 exit_loop = true;
951 }
952 else
953 {
954 /*
955 * Don't attempt to seize the scan when it requires another
956 * primitive index scan, since caller's backend cannot start
957 * it right now
958 */
959 status = false;
960 }
961
962 /*
963 * Either way, update backend local state to indicate that a
964 * pending primitive scan is required
965 */
966 so->needPrimScan = true;
967 so->scanBehind = false;
968 so->oppositeDirCheck = false;
969 }
970 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
971 {
972 /*
973 * We have successfully seized control of the scan for the purpose
974 * of advancing it to a new page!
975 */
976 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
977 Assert(btscan->btps_nextScanPage != P_NONE);
978 *next_scan_page = btscan->btps_nextScanPage;
979 *last_curr_page = btscan->btps_lastCurrPage;
980 exit_loop = true;
981 }
982 LWLockRelease(&btscan->btps_lock);
983 if (exit_loop || !status)
984 break;
986 }
988
989 /* When the scan has reached the rightmost (or leftmost) page, end it */
990 if (endscan)
991 _bt_parallel_done(scan);
992
993 return status;
994}
995
996/*
997 * _bt_parallel_release() -- Complete the process of advancing the scan to a
998 * new page. We now have the new value btps_nextScanPage; another backend
999 * can now begin advancing the scan.
1000 *
1001 * Callers whose scan uses array keys must save their curr_page argument so
1002 * that it can be passed to _bt_parallel_primscan_schedule, should caller
1003 * determine that another primitive index scan is required.
1004 *
1005 * If caller's next_scan_page is P_NONE, the scan has reached the index's
1006 * rightmost/leftmost page. This is treated as reaching the end of the scan
1007 * within _bt_parallel_seize.
1008 *
1009 * Note: unlike the serial case, parallel scans don't need to remember both
1010 * sibling links. next_scan_page is whichever link is next given the scan's
1011 * direction. That's all we'll ever need, since the direction of a parallel
1012 * scan can never change.
1013 */
1014void
1017{
1018 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1020
1022
1023 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1024 parallel_scan->ps_offset_am);
1025
1026 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1027 btscan->btps_nextScanPage = next_scan_page;
1028 btscan->btps_lastCurrPage = curr_page;
1029 btscan->btps_pageStatus = BTPARALLEL_IDLE;
1030 LWLockRelease(&btscan->btps_lock);
1032}
1033
1034/*
1035 * _bt_parallel_done() -- Mark the parallel scan as complete.
1036 *
1037 * When there are no pages left to scan, this function should be called to
1038 * notify other workers. Otherwise, they might wait forever for the scan to
1039 * advance to the next page.
1040 */
1041void
1043{
1045 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1047 bool status_changed = false;
1048
1049 Assert(!BTScanPosIsValid(so->currPos));
1050
1051 /* Do nothing, for non-parallel scans */
1052 if (parallel_scan == NULL)
1053 return;
1054
1055 /*
1056 * Should not mark parallel scan done when there's still a pending
1057 * primitive index scan
1058 */
1059 if (so->needPrimScan)
1060 return;
1061
1062 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1063 parallel_scan->ps_offset_am);
1064
1065 /*
1066 * Mark the parallel scan as done, unless some other process did so
1067 * already
1068 */
1069 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1070 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1071 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1072 {
1073 btscan->btps_pageStatus = BTPARALLEL_DONE;
1074 status_changed = true;
1075 }
1076 LWLockRelease(&btscan->btps_lock);
1077
1078 /* wake up all the workers associated with this parallel scan */
1079 if (status_changed)
1081}
1082
1083/*
1084 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1085 *
1086 * Caller passes the curr_page most recently passed to _bt_parallel_release
1087 * by its backend. Caller successfully schedules the next primitive index scan
1088 * if the shared parallel state hasn't been seized since caller's backend last
1089 * advanced the scan.
1090 */
1091void
1093{
1094 Relation rel = scan->indexRelation;
1096 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1098
1099 Assert(so->numArrayKeys);
1100
1101 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1102 parallel_scan->ps_offset_am);
1103
1104 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1105 if (btscan->btps_lastCurrPage == curr_page &&
1106 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1107 {
1108 btscan->btps_nextScanPage = InvalidBlockNumber;
1109 btscan->btps_lastCurrPage = InvalidBlockNumber;
1110 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1111
1112 /* Serialize scan's current array keys */
1114 }
1115 LWLockRelease(&btscan->btps_lock);
1116}
1117
1118/*
1119 * Bulk deletion of all index entries pointing to a set of heap tuples.
1120 * The set of target tuples is specified via a callback routine that tells
1121 * whether any given heap tuple (identified by ItemPointer) is being deleted.
1122 *
1123 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1124 */
1127 IndexBulkDeleteCallback callback, void *callback_state)
1128{
1129 Relation rel = info->index;
1130 BTCycleId cycleid;
1131
1132 /* allocate stats if first time through, else re-use existing struct */
1133 if (stats == NULL)
1135
1136 /* Establish the vacuum cycle ID to use for this scan */
1137 /* The ENSURE stuff ensures we clean up shared memory on failure */
1139 {
1140 cycleid = _bt_start_vacuum(rel);
1141
1142 btvacuumscan(info, stats, callback, callback_state, cycleid);
1143 }
1145 _bt_end_vacuum(rel);
1146
1147 return stats;
1148}
1149
1150/*
1151 * Post-VACUUM cleanup.
1152 *
1153 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1154 */
1157{
1159
1160 /* No-op in ANALYZE ONLY mode */
1161 if (info->analyze_only)
1162 return stats;
1163
1164 /*
1165 * If btbulkdelete was called, we need not do anything (we just maintain
1166 * the information used within _bt_vacuum_needs_cleanup() by calling
1167 * _bt_set_cleanup_info() below).
1168 *
1169 * If btbulkdelete was _not_ called, then we have a choice to make: we
1170 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1171 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1172 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1173 * now.
1174 */
1175 if (stats == NULL)
1176 {
1177 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1178 if (!_bt_vacuum_needs_cleanup(info->index))
1179 return NULL;
1180
1181 /*
1182 * Since we aren't going to actually delete any leaf items, there's no
1183 * need to go through all the vacuum-cycle-ID pushups here.
1184 *
1185 * Posting list tuples are a source of inaccuracy for cleanup-only
1186 * scans. btvacuumscan() will assume that the number of index tuples
1187 * from each page can be used as num_index_tuples, even though
1188 * num_index_tuples is supposed to represent the number of TIDs in the
1189 * index. This naive approach can underestimate the number of tuples
1190 * in the index significantly.
1191 *
1192 * We handle the problem by making num_index_tuples an estimate in
1193 * cleanup-only case.
1194 */
1196 btvacuumscan(info, stats, NULL, NULL, 0);
1197 stats->estimated_count = true;
1198 }
1199
1200 /*
1201 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1202 *
1203 * num_delpages is the number of deleted pages now in the index that were
1204 * not safe to place in the FSM to be recycled just yet. num_delpages is
1205 * greater than 0 only when _bt_pagedel() actually deleted pages during
1206 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1207 * have failed to place any newly deleted pages in the FSM just moments
1208 * ago. (Actually, there are edge cases where recycling of the current
1209 * VACUUM's newly deleted pages does not even become safe by the time the
1210 * next VACUUM comes around. See nbtree/README.)
1211 */
1212 Assert(stats->pages_deleted >= stats->pages_free);
1213 num_delpages = stats->pages_deleted - stats->pages_free;
1215
1216 /*
1217 * It's quite possible for us to be fooled by concurrent page splits into
1218 * double-counting some index tuples, so disbelieve any total that exceeds
1219 * the underlying heap's count ... if we know that accurately. Otherwise
1220 * this might just make matters worse.
1221 */
1222 if (!info->estimated_count)
1223 {
1224 if (stats->num_index_tuples > info->num_heap_tuples)
1225 stats->num_index_tuples = info->num_heap_tuples;
1226 }
1227
1228 return stats;
1229}
1230
1231/*
1232 * btvacuumscan --- scan the index for VACUUMing purposes
1233 *
1234 * This combines the functions of looking for leaf tuples that are deletable
1235 * according to the vacuum callback, looking for empty pages that can be
1236 * deleted, and looking for old deleted pages that can be recycled. Both
1237 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1238 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1239 *
1240 * The caller is responsible for initially allocating/zeroing a stats struct
1241 * and for obtaining a vacuum cycle ID if necessary.
1242 */
1243static void
1245 IndexBulkDeleteCallback callback, void *callback_state,
1246 BTCycleId cycleid)
1247{
1248 Relation rel = info->index;
1250 BlockNumber num_pages;
1251 bool needLock;
1253 ReadStream *stream = NULL;
1254
1255 /*
1256 * Reset fields that track information about the entire index now. This
1257 * avoids double-counting in the case where a single VACUUM command
1258 * requires multiple scans of the index.
1259 *
1260 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1261 * since they track information about the VACUUM command, and so must last
1262 * across each call to btvacuumscan().
1263 *
1264 * (Note that pages_free is treated as state about the whole index, not
1265 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1266 * calls are idempotent, and get repeated for the same deleted pages in
1267 * some scenarios. The point for us is to track the number of recyclable
1268 * pages in the index at the end of the VACUUM command.)
1269 */
1270 stats->num_pages = 0;
1271 stats->num_index_tuples = 0;
1272 stats->pages_deleted = 0;
1273 stats->pages_free = 0;
1274
1275 /* Set up info to pass down to btvacuumpage */
1276 vstate.info = info;
1277 vstate.stats = stats;
1278 vstate.callback = callback;
1279 vstate.callback_state = callback_state;
1280 vstate.cycleid = cycleid;
1281
1282 /* Create a temporary memory context to run _bt_pagedel in */
1284 "_bt_pagedel",
1286
1287 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1288 vstate.bufsize = 0;
1289 vstate.maxbufsize = 0;
1290 vstate.pendingpages = NULL;
1291 vstate.npendingpages = 0;
1292 /* Consider applying _bt_pendingfsm_finalize optimization */
1294
1295 /*
1296 * The outer loop iterates over all index pages except the metapage, in
1297 * physical order (we hope the kernel will cooperate in providing
1298 * read-ahead for speed). It is critical that we visit all leaf pages,
1299 * including ones added after we start the scan, else we might fail to
1300 * delete some deletable tuples. Hence, we must repeatedly check the
1301 * relation length. We must acquire the relation-extension lock while
1302 * doing so to avoid a race condition: if someone else is extending the
1303 * relation, there is a window where bufmgr/smgr have created a new
1304 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1305 * we manage to scan such a page here, we'll improperly assume it can be
1306 * recycled. Taking the lock synchronizes things enough to prevent a
1307 * problem: either num_pages won't include the new page, or _bt_getbuf
1308 * already has write lock on the buffer and it will be fully initialized
1309 * before we can examine it. Also, we need not worry if a page is added
1310 * immediately after we look; the page splitting code already has
1311 * write-lock on the left page before it adds a right page, so we must
1312 * already have processed any tuples due to be moved into such a page.
1313 *
1314 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1315 * think the use of the extension lock is still required.
1316 *
1317 * We can skip locking for new or temp relations, however, since no one
1318 * else could be accessing them.
1319 */
1321
1323
1324 /*
1325 * It is safe to use batchmode as block_range_read_stream_cb takes no
1326 * locks.
1327 */
1331 info->strategy,
1332 rel,
1335 &p,
1336 0);
1337 for (;;)
1338 {
1339 /* Get the current relation length */
1340 if (needLock)
1342 num_pages = RelationGetNumberOfBlocks(rel);
1343 if (needLock)
1345
1346 if (info->report_progress)
1348 num_pages);
1349
1350 /* Quit if we've scanned the whole relation */
1351 if (p.current_blocknum >= num_pages)
1352 break;
1353
1354 p.last_exclusive = num_pages;
1355
1356 /* Iterate over pages, then loop back to recheck relation length */
1357 while (true)
1358 {
1359 BlockNumber current_block;
1360 Buffer buf;
1361
1362 /* call vacuum_delay_point while not holding any buffer lock */
1363 vacuum_delay_point(false);
1364
1365 buf = read_stream_next_buffer(stream, NULL);
1366
1367 if (!BufferIsValid(buf))
1368 break;
1369
1370 current_block = btvacuumpage(&vstate, buf);
1371
1372 if (info->report_progress)
1374 current_block);
1375 }
1376
1377 /*
1378 * We have to reset the read stream to use it again. After returning
1379 * InvalidBuffer, the read stream API won't invoke our callback again
1380 * until the stream has been reset.
1381 */
1382 read_stream_reset(stream);
1383 }
1384
1385 read_stream_end(stream);
1386
1387 /* Set statistics num_pages field to final size of index */
1388 stats->num_pages = num_pages;
1389
1390 MemoryContextDelete(vstate.pagedelcontext);
1391
1392 /*
1393 * If there were any calls to _bt_pagedel() during scan of the index then
1394 * see if any of the resulting pages can be placed in the FSM now. When
1395 * it's not safe we'll have to leave it up to a future VACUUM operation.
1396 *
1397 * Finally, if we placed any pages in the FSM (either just now or during
1398 * the scan), forcibly update the upper-level FSM pages to ensure that
1399 * searchers can find them.
1400 */
1402 if (stats->pages_free > 0)
1404}
1405
1406/*
1407 * btvacuumpage --- VACUUM one page
1408 *
1409 * This processes a single page for btvacuumscan(). In some cases we must
1410 * backtrack to re-examine and VACUUM pages that were on buf's page during
1411 * a previous call here. This is how we handle page splits (that happened
1412 * after our cycleid was acquired) whose right half page happened to reuse
1413 * a block that we might have processed at some point before it was
1414 * recycled (i.e. before the page split).
1415 *
1416 * Returns BlockNumber of a scanned page (not backtracked).
1417 */
1418static BlockNumber
1420{
1421 IndexVacuumInfo *info = vstate->info;
1422 IndexBulkDeleteResult *stats = vstate->stats;
1424 void *callback_state = vstate->callback_state;
1425 Relation rel = info->index;
1426 Relation heaprel = info->heaprel;
1427 bool attempt_pagedel;
1428 BlockNumber blkno,
1431 Page page;
1432 BTPageOpaque opaque;
1433
1434 blkno = scanblkno;
1435
1436backtrack:
1437
1438 attempt_pagedel = false;
1440
1441 _bt_lockbuf(rel, buf, BT_READ);
1442 page = BufferGetPage(buf);
1443 opaque = NULL;
1444 if (!PageIsNew(page))
1445 {
1446 _bt_checkpage(rel, buf);
1447 opaque = BTPageGetOpaque(page);
1448 }
1449
1450 Assert(blkno <= scanblkno);
1451 if (blkno != scanblkno)
1452 {
1453 /*
1454 * We're backtracking.
1455 *
1456 * We followed a right link to a sibling leaf page (a page that
1457 * happens to be from a block located before scanblkno). The only
1458 * case we want to do anything with is a live leaf page having the
1459 * current vacuum cycle ID.
1460 *
1461 * The page had better be in a state that's consistent with what we
1462 * expect. Check for conditions that imply corruption in passing. It
1463 * can't be half-dead because only an interrupted VACUUM process can
1464 * leave pages in that state, so we'd definitely have dealt with it
1465 * back when the page was the scanblkno page (half-dead pages are
1466 * always marked fully deleted by _bt_pagedel(), barring corruption).
1467 */
1468 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1469 {
1470 Assert(false);
1471 ereport(LOG,
1473 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1474 blkno, scanblkno, RelationGetRelationName(rel))));
1475 _bt_relbuf(rel, buf);
1476 return scanblkno;
1477 }
1478
1479 /*
1480 * We may have already processed the page in an earlier call, when the
1481 * page was scanblkno. This happens when the leaf page split occurred
1482 * after the scan began, but before the right sibling page became the
1483 * scanblkno.
1484 *
1485 * Page may also have been deleted by current btvacuumpage() call,
1486 * since _bt_pagedel() sometimes deletes the right sibling page of
1487 * scanblkno in passing (it does so after we decided where to
1488 * backtrack to). We don't need to process this page as a deleted
1489 * page a second time now (in fact, it would be wrong to count it as a
1490 * deleted page in the bulk delete statistics a second time).
1491 */
1492 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1493 {
1494 /* Done with current scanblkno (and all lower split pages) */
1495 _bt_relbuf(rel, buf);
1496 return scanblkno;
1497 }
1498 }
1499
1500 if (!opaque || BTPageIsRecyclable(page, heaprel))
1501 {
1502 /* Okay to recycle this page (which could be leaf or internal) */
1503 RecordFreeIndexPage(rel, blkno);
1504 stats->pages_deleted++;
1505 stats->pages_free++;
1506 }
1507 else if (P_ISDELETED(opaque))
1508 {
1509 /*
1510 * Already deleted page (which could be leaf or internal). Can't
1511 * recycle yet.
1512 */
1513 stats->pages_deleted++;
1514 }
1515 else if (P_ISHALFDEAD(opaque))
1516 {
1517 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1518 attempt_pagedel = true;
1519
1520 /*
1521 * _bt_pagedel() will increment both pages_newly_deleted and
1522 * pages_deleted stats in all cases (barring corruption)
1523 */
1524 }
1525 else if (P_ISLEAF(opaque))
1526 {
1528 int ndeletable;
1530 int nupdatable;
1531 OffsetNumber offnum,
1532 minoff,
1533 maxoff;
1534 int nhtidsdead,
1535 nhtidslive;
1536
1537 /*
1538 * Trade in the initial read lock for a full cleanup lock on this
1539 * page. We must get such a lock on every leaf page over the course
1540 * of the vacuum scan, whether or not it actually contains any
1541 * deletable tuples --- see nbtree/README.
1542 */
1544
1545 /*
1546 * Check whether we need to backtrack to earlier pages. What we are
1547 * concerned about is a page split that happened since we started the
1548 * vacuum scan. If the split moved tuples on the right half of the
1549 * split (i.e. the tuples that sort high) to a block that we already
1550 * passed over, then we might have missed the tuples. We need to
1551 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1552 * or deleting scanblkno page below!)
1553 */
1554 if (vstate->cycleid != 0 &&
1555 opaque->btpo_cycleid == vstate->cycleid &&
1556 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1557 !P_RIGHTMOST(opaque) &&
1558 opaque->btpo_next < scanblkno)
1559 backtrack_to = opaque->btpo_next;
1560
1561 ndeletable = 0;
1562 nupdatable = 0;
1563 minoff = P_FIRSTDATAKEY(opaque);
1564 maxoff = PageGetMaxOffsetNumber(page);
1565 nhtidsdead = 0;
1566 nhtidslive = 0;
1567 if (callback)
1568 {
1569 /* btbulkdelete callback tells us what to delete (or update) */
1570 for (offnum = minoff;
1571 offnum <= maxoff;
1572 offnum = OffsetNumberNext(offnum))
1573 {
1574 IndexTuple itup;
1575
1576 itup = (IndexTuple) PageGetItem(page,
1577 PageGetItemId(page, offnum));
1578
1579 Assert(!BTreeTupleIsPivot(itup));
1580 if (!BTreeTupleIsPosting(itup))
1581 {
1582 /* Regular tuple, standard table TID representation */
1583 if (callback(&itup->t_tid, callback_state))
1584 {
1585 deletable[ndeletable++] = offnum;
1586 nhtidsdead++;
1587 }
1588 else
1589 nhtidslive++;
1590 }
1591 else
1592 {
1594 int nremaining;
1595
1596 /* Posting list tuple */
1597 vacposting = btreevacuumposting(vstate, itup, offnum,
1598 &nremaining);
1599 if (vacposting == NULL)
1600 {
1601 /*
1602 * All table TIDs from the posting tuple remain, so no
1603 * delete or update required
1604 */
1606 }
1607 else if (nremaining > 0)
1608 {
1609
1610 /*
1611 * Store metadata about posting list tuple in
1612 * updatable array for entire page. Existing tuple
1613 * will be updated during the later call to
1614 * _bt_delitems_vacuum().
1615 */
1617 updatable[nupdatable++] = vacposting;
1619 }
1620 else
1621 {
1622 /*
1623 * All table TIDs from the posting list must be
1624 * deleted. We'll delete the index tuple completely
1625 * (no update required).
1626 */
1627 Assert(nremaining == 0);
1628 deletable[ndeletable++] = offnum;
1631 }
1632
1634 }
1635 }
1636 }
1637
1638 /*
1639 * Apply any needed deletes or updates. We issue just one
1640 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1641 */
1642 if (ndeletable > 0 || nupdatable > 0)
1643 {
1645 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1646 nupdatable);
1647
1648 stats->tuples_removed += nhtidsdead;
1649 /* must recompute maxoff */
1650 maxoff = PageGetMaxOffsetNumber(page);
1651
1652 /* can't leak memory here */
1653 for (int i = 0; i < nupdatable; i++)
1654 pfree(updatable[i]);
1655 }
1656 else
1657 {
1658 /*
1659 * If the leaf page has been split during this vacuum cycle, it
1660 * seems worth expending a write to clear btpo_cycleid even if we
1661 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1662 * takes care of this.) This ensures we won't process the page
1663 * again.
1664 *
1665 * We treat this like a hint-bit update because there's no need to
1666 * WAL-log it.
1667 */
1668 Assert(nhtidsdead == 0);
1669 if (vstate->cycleid != 0 &&
1670 opaque->btpo_cycleid == vstate->cycleid)
1671 {
1672 opaque->btpo_cycleid = 0;
1673 MarkBufferDirtyHint(buf, true);
1674 }
1675 }
1676
1677 /*
1678 * If the leaf page is now empty, try to delete it; else count the
1679 * live tuples (live table TIDs in posting lists are counted as
1680 * separate live tuples). We don't delete when backtracking, though,
1681 * since that would require teaching _bt_pagedel() about backtracking
1682 * (doesn't seem worth adding more complexity to deal with that).
1683 *
1684 * We don't count the number of live TIDs during cleanup-only calls to
1685 * btvacuumscan (i.e. when callback is not set). We count the number
1686 * of index tuples directly instead. This avoids the expense of
1687 * directly examining all of the tuples on each page. VACUUM will
1688 * treat num_index_tuples as an estimate in cleanup-only case, so it
1689 * doesn't matter that this underestimates num_index_tuples
1690 * significantly in some cases.
1691 */
1692 if (minoff > maxoff)
1693 attempt_pagedel = (blkno == scanblkno);
1694 else if (callback)
1695 stats->num_index_tuples += nhtidslive;
1696 else
1697 stats->num_index_tuples += maxoff - minoff + 1;
1698
1700 }
1701
1702 if (attempt_pagedel)
1703 {
1704 MemoryContext oldcontext;
1705
1706 /* Run pagedel in a temp context to avoid memory leakage */
1707 MemoryContextReset(vstate->pagedelcontext);
1708 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1709
1710 /*
1711 * _bt_pagedel maintains the bulk delete stats on our behalf;
1712 * pages_newly_deleted and pages_deleted are likely to be incremented
1713 * during call
1714 */
1715 Assert(blkno == scanblkno);
1716 _bt_pagedel(rel, buf, vstate);
1717
1718 MemoryContextSwitchTo(oldcontext);
1719 /* pagedel released buffer, so we shouldn't */
1720 }
1721 else
1722 _bt_relbuf(rel, buf);
1723
1724 if (backtrack_to != P_NONE)
1725 {
1726 blkno = backtrack_to;
1727
1728 /* check for vacuum delay while not holding any buffer lock */
1729 vacuum_delay_point(false);
1730
1731 /*
1732 * We can't use _bt_getbuf() here because it always applies
1733 * _bt_checkpage(), which will barf on an all-zero page. We want to
1734 * recycle all-zero pages, not fail. Also, we want to use a
1735 * nondefault buffer access strategy.
1736 */
1738 info->strategy);
1739 goto backtrack;
1740 }
1741
1742 return scanblkno;
1743}
1744
1745/*
1746 * btreevacuumposting --- determine TIDs still needed in posting list
1747 *
1748 * Returns metadata describing how to build replacement tuple without the TIDs
1749 * that VACUUM needs to delete. Returned value is NULL in the common case
1750 * where no changes are needed to caller's posting list tuple (we avoid
1751 * allocating memory here as an optimization).
1752 *
1753 * The number of TIDs that should remain in the posting list tuple is set for
1754 * caller in *nremaining.
1755 */
1756static BTVacuumPosting
1758 OffsetNumber updatedoffset, int *nremaining)
1759{
1760 int live = 0;
1761 int nitem = BTreeTupleGetNPosting(posting);
1764
1765 for (int i = 0; i < nitem; i++)
1766 {
1767 if (!vstate->callback(items + i, vstate->callback_state))
1768 {
1769 /* Live table TID */
1770 live++;
1771 }
1772 else if (vacposting == NULL)
1773 {
1774 /*
1775 * First dead table TID encountered.
1776 *
1777 * It's now clear that we need to delete one or more dead table
1778 * TIDs, so start maintaining metadata describing how to update
1779 * existing posting list tuple.
1780 */
1782 nitem * sizeof(uint16));
1783
1784 vacposting->itup = posting;
1785 vacposting->updatedoffset = updatedoffset;
1786 vacposting->ndeletedtids = 0;
1787 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1788 }
1789 else
1790 {
1791 /* Second or subsequent dead table TID */
1792 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1793 }
1794 }
1795
1796 *nremaining = live;
1797 return vacposting;
1798}
1799
1800/*
1801 * btcanreturn() -- Check whether btree indexes support index-only scans.
1802 *
1803 * btrees always do, so this is trivial.
1804 */
1805bool
1807{
1808 return true;
1809}
1810
1811/*
1812 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1813 */
1814int
1816{
1817 return _bt_getrootheight(rel);
1818}
1819
1822{
1823 switch (strategy)
1824 {
1826 return COMPARE_LT;
1828 return COMPARE_LE;
1830 return COMPARE_EQ;
1832 return COMPARE_GE;
1834 return COMPARE_GT;
1835 default:
1836 return COMPARE_INVALID;
1837 }
1838}
1839
1842{
1843 switch (cmptype)
1844 {
1845 case COMPARE_LT:
1846 return BTLessStrategyNumber;
1847 case COMPARE_LE:
1849 case COMPARE_EQ:
1850 return BTEqualStrategyNumber;
1851 case COMPARE_GE:
1853 case COMPARE_GT:
1855 default:
1856 return InvalidStrategy;
1857 }
1858}
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static Datum values[MAXATTR]
Definition bootstrap.c:187
int Buffer
Definition buf.h:23
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5535
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4358
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5567
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:913
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
@ RBM_NORMAL
Definition bufmgr.h:46
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
static bool PageIsNew(const PageData *page)
Definition bufpage.h:233
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:371
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
#define OffsetToPointer(base, offset)
Definition c.h:818
#define Assert(condition)
Definition c.h:906
int64_t int64
Definition c.h:576
#define FLEXIBLE_ARRAY_MEMBER
Definition c.h:513
int16_t int16
Definition c.h:574
uint16_t uint16
Definition c.h:578
size_t Size
Definition c.h:652
CompareType
Definition cmptype.h:32
@ COMPARE_LE
Definition cmptype.h:35
@ COMPARE_INVALID
Definition cmptype.h:33
@ COMPARE_GT
Definition cmptype.h:38
@ COMPARE_EQ
Definition cmptype.h:36
@ COMPARE_GE
Definition cmptype.h:37
@ COMPARE_LT
Definition cmptype.h:34
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumRestore(char **start_address, bool *isnull)
Definition datum.c:523
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition datum.c:461
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition datum.c:414
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_object(type)
Definition fe_memutils.h:75
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition genam.h:93
IndexUniqueCheck
Definition genam.h:122
void IndexFreeSpaceMapVacuum(Relation rel)
Definition indexfsm.c:71
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition indexfsm.c:52
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition indextuple.c:44
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:52
int i
Definition isn.c:77
IndexTupleData * IndexTuple
Definition itup.h:53
#define MaxIndexTuplesPerPage
Definition itup.h:181
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1177
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1794
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:699
@ LW_EXCLUSIVE
Definition lwlock.h:112
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:403
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:104
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1024
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:676
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition nbtpage.c:1801
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition nbtpage.c:1155
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:798
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition nbtpage.c:233
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition nbtpage.c:1110
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition nbtpage.c:180
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition nbtpage.c:3000
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1040
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition nbtpage.c:2958
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition nbtree.c:1092
bool btcanreturn(Relation index, int attno)
Definition nbtree.c:1806
BTPS_State
Definition nbtree.c:56
@ BTPARALLEL_ADVANCING
Definition nbtree.c:59
@ BTPARALLEL_NEED_PRIMSCAN
Definition nbtree.c:58
@ BTPARALLEL_NOT_INITIALIZED
Definition nbtree.c:57
@ BTPARALLEL_IDLE
Definition nbtree.c:60
@ BTPARALLEL_DONE
Definition nbtree.c:61
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:877
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition nbtree.c:1841
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:338
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition nbtree.c:660
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition nbtree.c:1419
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:582
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1042
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition nbtree.c:1156
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition nbtree.c:1757
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition nbtree.c:1821
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition nbtree.c:229
void btparallelrescan(IndexScanDesc scan)
Definition nbtree.c:834
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition nbtree.c:205
void btbuildempty(Relation index)
Definition nbtree.c:182
int btgettreeheight(Relation rel)
Definition nbtree.c:1815
void btinitparallelscan(void *target)
Definition nbtree.c:818
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition nbtree.c:1126
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition nbtree.c:1244
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:724
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition nbtree.c:290
void btmarkpos(IndexScanDesc scan)
Definition nbtree.c:498
void btendscan(IndexScanDesc scan)
Definition nbtree.c:462
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition nbtree.c:387
struct BTParallelScanDescData * BTParallelScanDesc
Definition nbtree.c:94
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1015
Datum bthandler(PG_FUNCTION_ARGS)
Definition nbtree.c:117
void btrestrpos(IndexScanDesc scan)
Definition nbtree.c:524
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:767
#define P_ISHALFDEAD(opaque)
Definition nbtree.h:225
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define SK_BT_SKIP
Definition nbtree.h:1106
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_ISDELETED(opaque)
Definition nbtree.h:223
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538
#define BTNProcs
Definition nbtree.h:723
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
uint16 BTCycleId
Definition nbtree.h:30
#define P_NONE
Definition nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define BTREE_METAPAGE
Definition nbtree.h:149
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
#define BTMaxItemSize
Definition nbtree.h:165
#define BTP_SPLIT_END
Definition nbtree.h:82
#define BTOPTIONS_PROC
Definition nbtree.h:721
#define SK_BT_MINVAL
Definition nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:878
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition nbtsort.c:298
void _bt_end_vacuum(Relation rel)
Definition nbtutils.c:526
void _bt_end_vacuum_callback(int code, Datum arg)
Definition nbtutils.c:554
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:205
char * btbuildphasename(int64 phasenum)
Definition nbtutils.c:650
bytea * btoptions(Datum reloptions, bool validate)
Definition nbtutils.c:604
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition nbtutils.c:627
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1181
BTCycleId _bt_start_vacuum(Relation rel)
Definition nbtutils.c:469
bool btvalidate(Oid opclassoid)
Definition nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
int16 attnum
static char buf[DEFAULT_XLOG_SEG_SIZE]
static Datum PointerGetDatum(const void *X)
Definition postgres.h:352
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:142
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:141
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition rel.h:657
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition selfuncs.c:7666
Size add_size(Size s1, Size s2)
Definition shmem.c:482
#define SK_SEARCHNULL
Definition skey.h:121
ScanKeyData * ScanKey
Definition skey.h:75
#define SK_ISNULL
Definition skey.h:115
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:55
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTMaxStrategyNumber
Definition stratnum.h:35
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
Datum * elem_values
Definition nbtree.h:1041
BlockNumber btpo_next
Definition nbtree.h:66
uint16 btpo_flags
Definition nbtree.h:68
BTCycleId btpo_cycleid
Definition nbtree.h:69
BTPS_State btps_pageStatus
Definition nbtree.c:73
BlockNumber btps_lastCurrPage
Definition nbtree.c:71
ConditionVariable btps_cv
Definition nbtree.c:77
BlockNumber btps_nextScanPage
Definition nbtree.c:70
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition nbtree.c:84
NodeTag type
Definition amapi.h:234
BlockNumber pages_deleted
Definition genam.h:88
BlockNumber pages_free
Definition genam.h:89
BlockNumber num_pages
Definition genam.h:83
double tuples_removed
Definition genam.h:86
double num_index_tuples
Definition genam.h:85
struct ScanKeyData * keyData
Definition relscan.h:142
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:192
bool kill_prior_tuple
Definition relscan.h:148
struct TupleDescData * xs_itupdesc
Definition relscan.h:169
Relation indexRelation
Definition relscan.h:138
ItemPointerData xs_heaptid
Definition relscan.h:173
struct SnapshotData * xs_snapshot
Definition relscan.h:139
Relation heapRelation
Definition relscan.h:137
ItemPointerData t_tid
Definition itup.h:37
Relation index
Definition genam.h:52
double num_heap_tuples
Definition genam.h:58
bool analyze_only
Definition genam.h:54
BufferAccessStrategy strategy
Definition genam.h:59
Relation heaprel
Definition genam.h:53
bool report_progress
Definition genam.h:55
bool estimated_count
Definition genam.h:56
TupleDesc rd_att
Definition rel.h:112
Definition type.h:96
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
static ItemArray items
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition tidbitmap.c:367
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:175
void vacuum_delay_point(bool is_analyze)
Definition vacuum.c:2431
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition vacuum.h:48
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition vacuum.h:55