1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
29#include "storage/condition_variable.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "storage/read_stream.h"
34#include "utils/datum.h"
35#include "utils/fmgrprotos.h"
36#include "utils/index_selfuncs.h"
37#include "utils/memutils.h"
38
39
40/*
41 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 *
43 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 * scan to advance it via another call to _bt_first.
45 *
46 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 * a new page; others must wait.
48 *
49 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 * to a new page; some process can start doing that.
51 *
52 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 */
54typedef enum
55{
56 BTPARALLEL_NOT_INITIALIZED,
57 BTPARALLEL_NEED_PRIMSCAN,
58 BTPARALLEL_ADVANCING,
59 BTPARALLEL_IDLE,
60 BTPARALLEL_DONE,
61} BTPS_State;
62
63/*
64 * BTParallelScanDescData contains btree specific shared information required
65 * for parallel scan.
66 */
67typedef struct BTParallelScanDescData
68{
69 BlockNumber btps_nextScanPage; /* next page to be scanned */
70 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 * btps_nextScanPage */
72 BTPS_State btps_pageStatus; /* indicates whether next page is
73 * available for scan. see above for
74 * possible states of parallel scan. */
75 LWLock btps_lock; /* protects shared parallel state */
76 ConditionVariable btps_cv; /* used to synchronize parallel scan */
77
78 /*
79 * btps_arrElems is used when scans need to schedule another primitive
80 * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 * offsets for each = scan key associated with a ScalarArrayOp array.
82 */
83 int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84
85 /*
86 * Additional space (at the end of the struct) is used when scans need to
87 * schedule another primitive index scan with one or more skip arrays.
88 * Holds a flattened datum representation for each = scan key associated
89 * with a skip array.
90 */
91} BTParallelScanDescData;
92
93typedef struct BTParallelScanDescData *BTParallelScanDesc;
94
95
96static bool _bt_start_prim_scan(IndexScanDesc scan);
97static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
98 BTScanOpaque so);
99static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
100 BTScanOpaque so);
101static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
102 IndexBulkDeleteCallback callback, void *callback_state,
103 BTCycleId cycleid);
104static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
105static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
106 IndexTuple posting,
107 OffsetNumber updatedoffset,
108 int *nremaining);
109
110
111/*
112 * Btree handler function: return IndexAmRoutine with access method parameters
113 * and callbacks.
114 */
115Datum
116bthandler(PG_FUNCTION_ARGS)
117{
118 static const IndexAmRoutine amroutine = {
119 .type = T_IndexAmRoutine,
120 .amstrategies = BTMaxStrategyNumber,
121 .amsupport = BTNProcs,
122 .amoptsprocnum = BTOPTIONS_PROC,
123 .amcanorder = true,
124 .amcanorderbyop = false,
125 .amcanhash = false,
126 .amconsistentequality = true,
127 .amconsistentordering = true,
128 .amcanbackward = true,
129 .amcanunique = true,
130 .amcanmulticol = true,
131 .amoptionalkey = true,
132 .amsearcharray = true,
133 .amsearchnulls = true,
134 .amstorage = false,
135 .amclusterable = true,
136 .ampredlocks = true,
137 .amcanparallel = true,
138 .amcanbuildparallel = true,
139 .amcaninclude = true,
140 .amusemaintenanceworkmem = false,
141 .amsummarizing = false,
142 .amparallelvacuumoptions =
143 VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
144 .amkeytype = InvalidOid,
145
146 .ambuild = btbuild,
147 .ambuildempty = btbuildempty,
148 .aminsert = btinsert,
149 .aminsertcleanup = NULL,
150 .ambulkdelete = btbulkdelete,
151 .amvacuumcleanup = btvacuumcleanup,
152 .amcanreturn = btcanreturn,
153 .amcostestimate = btcostestimate,
154 .amgettreeheight = btgettreeheight,
155 .amoptions = btoptions,
156 .amproperty = btproperty,
157 .ambuildphasename = btbuildphasename,
158 .amvalidate = btvalidate,
159 .amadjustmembers = btadjustmembers,
160 .ambeginscan = btbeginscan,
161 .amrescan = btrescan,
162 .amgettuple = btgettuple,
163 .amgetbitmap = btgetbitmap,
164 .amendscan = btendscan,
165 .ammarkpos = btmarkpos,
166 .amrestrpos = btrestrpos,
167 .amestimateparallelscan = btestimateparallelscan,
168 .aminitparallelscan = btinitparallelscan,
169 .amparallelrescan = btparallelrescan,
170 .amtranslatestrategy = bttranslatestrategy,
171 .amtranslatecmptype = bttranslatecmptype,
172 };
173
174 PG_RETURN_POINTER(&amroutine);
175}
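Editor's note: as a hedged illustration of how this IndexAmRoutine is consumed, the sketch below approximates what GetIndexAmRoutine() in access/index/amapi.c does with the handler OID stored in pg_am. It is not part of nbtree.c; the helper name is hypothetical and it relies on headers already pulled in by this file.

/* Illustrative sketch only -- roughly what GetIndexAmRoutine() does. */
static IndexAmRoutine *
toy_get_am_routine(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct",
			 amhandler);

	return routine;
}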
176
177/*
178 * btbuildempty() -- build an empty btree index in the initialization fork
179 */
180void
181btbuildempty(Relation index, IndexInfo *indexInfo)
182{
183 bool allequalimage = _bt_allequalimage(index, false);
184 BulkWriteState *bulkstate;
185 BulkWriteBuffer metabuf;
186
187 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
188
189 /* Construct metapage. */
190 metabuf = smgr_bulk_get_buf(bulkstate);
191 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
192 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
193
194 smgr_bulk_finish(bulkstate);
195}
196
197/*
198 * btinsert() -- insert an index tuple into a btree.
199 *
200 * Descend the tree recursively, find the appropriate location for our
201 * new tuple, and put it there.
202 */
203bool
204btinsert(Relation rel, Datum *values, bool *isnull,
205 ItemPointer ht_ctid, Relation heapRel,
206 IndexUniqueCheck checkUnique,
207 bool indexUnchanged,
208 IndexInfo *indexInfo)
209{
210 bool result;
211 IndexTuple itup;
212
213 /* generate an index tuple */
214 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
215 itup->t_tid = *ht_ctid;
216
217 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
218
219 pfree(itup);
220
221 return result;
222}
223
224/*
225 * btgettuple() -- Get the next tuple in the scan.
226 */
227bool
228btgettuple(IndexScanDesc scan, ScanDirection dir)
229{
230 BTScanOpaque so = (BTScanOpaque) scan->opaque;
231 bool res;
232
233 Assert(scan->heapRelation != NULL);
234
235 /* btree indexes are never lossy */
236 scan->xs_recheck = false;
237
238 /* Each loop iteration performs another primitive index scan */
239 do
240 {
241 /*
242 * If we've already initialized this scan, we can just advance it in
243 * the appropriate direction. If we haven't done so yet, we call
244 * _bt_first() to get the first item in the scan.
245 */
246 if (!BTScanPosIsValid(so->currPos))
247 res = _bt_first(scan, dir);
248 else
249 {
250 /*
251 * Check to see if we should kill the previously-fetched tuple.
252 */
253 if (scan->kill_prior_tuple)
254 {
255 /*
256 * Yes, remember it for later. (We'll deal with all such
257 * tuples at once right before leaving the index page.) The
258 * test for numKilled overrun is not just paranoia: if the
259 * caller reverses direction in the indexscan then the same
260 * item might get entered multiple times. It's not worth
261 * trying to optimize that, so we don't detect it, but instead
262 * just forget any excess entries.
263 */
264 if (so->killedItems == NULL)
265 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
266 if (so->numKilled < MaxTIDsPerBTreePage)
267 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
268 }
269
270 /*
271 * Now continue the scan.
272 */
273 res = _bt_next(scan, dir);
274 }
275
276 /* If we have a tuple, return it ... */
277 if (res)
278 break;
279 /* ... otherwise see if we need another primitive index scan */
280 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
281
282 return res;
283}
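Editor's note: the hypothetical sketch below shows how the AM-level entry points defined in this file fit together for a simple forward scan. Real callers go through indexam.c (index_beginscan() and friends), which also handles snapshot registration, predicate locks, and instrumentation; the direct field assignments and the GetActiveSnapshot() call (from utils/snapmgr.h, not included by this file) are illustrative assumptions only.

/* Hypothetical illustration -- not how core code drives a btree scan. */
static void
toy_btree_forward_scan(Relation heaprel, Relation indexrel,
					   ScanKey keys, int nkeys)
{
	IndexScanDesc scan = btbeginscan(indexrel, nkeys, 0);

	/* bookkeeping that index_beginscan()/index_rescan() normally do */
	scan->heapRelation = heaprel;
	scan->xs_snapshot = GetActiveSnapshot();
	btrescan(scan, keys, nkeys, NULL, 0);

	while (btgettuple(scan, ForwardScanDirection))
	{
		/* scan->xs_heaptid now holds the next matching heap TID */
	}

	btendscan(scan);
}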
284
285/*
286 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
287 */
288int64
289btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
290{
291 BTScanOpaque so = (BTScanOpaque) scan->opaque;
292 int64 ntids = 0;
293 ItemPointer heapTid;
294
295 Assert(scan->heapRelation == NULL);
296
297 /* Each loop iteration performs another primitive index scan */
298 do
299 {
300 /* Fetch the first page & tuple */
301 if (_bt_first(scan, ForwardScanDirection))
302 {
303 /* Save tuple ID, and continue scanning */
304 heapTid = &scan->xs_heaptid;
305 tbm_add_tuples(tbm, heapTid, 1, false);
306 ntids++;
307
308 for (;;)
309 {
310 /*
311 * Advance to next tuple within page. This is the same as the
312 * easy case in _bt_next().
313 */
314 if (++so->currPos.itemIndex > so->currPos.lastItem)
315 {
316 /* let _bt_next do the heavy lifting */
317 if (!_bt_next(scan, ForwardScanDirection))
318 break;
319 }
320
321 /* Save tuple ID, and continue scanning */
322 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
323 tbm_add_tuples(tbm, heapTid, 1, false);
324 ntids++;
325 }
326 }
327 /* Now see if we need another primitive index scan */
328 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
329
330 return ntids;
331}
332
333/*
334 * btbeginscan() -- start a scan on a btree index
335 */
336IndexScanDesc
337btbeginscan(Relation rel, int nkeys, int norderbys)
338{
339 IndexScanDesc scan;
340 BTScanOpaque so;
341
342 /* no order by operators allowed */
343 Assert(norderbys == 0);
344
345 /* get the scan */
346 scan = RelationGetIndexScan(rel, nkeys, norderbys);
347
348 /* allocate private workspace */
349 so = palloc_object(BTScanOpaqueData);
350 BTScanPosInvalidate(so->currPos);
351 BTScanPosInvalidate(so->markPos);
352 if (scan->numberOfKeys > 0)
353 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
354 else
355 so->keyData = NULL;
356
357 so->skipScan = false;
358 so->needPrimScan = false;
359 so->scanBehind = false;
360 so->oppositeDirCheck = false;
361 so->arrayKeys = NULL;
362 so->orderProcs = NULL;
363 so->arrayContext = NULL;
364
365 so->killedItems = NULL; /* until needed */
366 so->numKilled = 0;
367
368 /*
369 * We don't know yet whether the scan will be index-only, so we do not
370 * allocate the tuple workspace arrays until btrescan. However, we set up
371 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
372 */
373 so->currTuples = so->markTuples = NULL;
374
375 scan->xs_itupdesc = RelationGetDescr(rel);
376
377 scan->opaque = so;
378
379 return scan;
380}
381
382/*
383 * btrescan() -- rescan an index relation
384 */
385void
386btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
387 ScanKey orderbys, int norderbys)
388{
389 BTScanOpaque so = (BTScanOpaque) scan->opaque;
390
391 /* we aren't holding any read locks, but gotta drop the pins */
392 if (BTScanPosIsValid(so->currPos))
393 {
394 /* Before leaving current page, deal with any killed items */
395 if (so->numKilled > 0)
396 _bt_killitems(scan);
397 BTScanPosUnpinIfPinned(so->currPos);
398 BTScanPosInvalidate(so->currPos);
399 }
400
401 /*
402 * We prefer to eagerly drop leaf page pins before btgettuple returns.
403 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
404 *
405 * We cannot safely drop leaf page pins during index-only scans due to a
406 * race condition involving VACUUM setting pages all-visible in the VM.
407 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
408 *
409 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
410 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
411 * The scheme used to detect unsafe TID recycling won't work when scanning
412 * unlogged relations (since it involves saving an affected page's LSN).
413 * Opt out of eager pin dropping during unlogged relation scans for now
414 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
415 *
416 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
417 * Pins cannot be held for more than an instant during bitmap scans either
418 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
419 *
420 * See nbtree/README section on making concurrent TID recycling safe.
421 *
422 * Note: so->dropPin should never change across rescans.
423 */
424 so->dropPin = (!scan->xs_want_itup &&
425 IsMVCCSnapshot(scan->xs_snapshot) &&
426 RelationNeedsWAL(scan->indexRelation) &&
427 scan->heapRelation != NULL);
428
429 so->markItemIndex = -1;
430 so->needPrimScan = false;
431 so->scanBehind = false;
432 so->oppositeDirCheck = false;
433 BTScanPosUnpinIfPinned(so->markPos);
434 BTScanPosInvalidate(so->markPos);
435
436 /*
437 * Allocate tuple workspace arrays, if needed for an index-only scan and
438 * not already done in a previous rescan call. To save on palloc
439 * overhead, both workspaces are allocated as one palloc block; only this
440 * function and btendscan know that.
441 */
442 if (scan->xs_want_itup && so->currTuples == NULL)
443 {
444 so->currTuples = (char *) palloc(BLCKSZ * 2);
445 so->markTuples = so->currTuples + BLCKSZ;
446 }
447
448 /*
449 * Reset the scan keys
450 */
451 if (scankey && scan->numberOfKeys > 0)
452 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
453 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
454 so->numArrayKeys = 0; /* ditto */
455}
456
457/*
458 * btendscan() -- close down a scan
459 */
460void
461btendscan(IndexScanDesc scan)
462{
463 BTScanOpaque so = (BTScanOpaque) scan->opaque;
464
465 /* we aren't holding any read locks, but gotta drop the pins */
466 if (BTScanPosIsValid(so->currPos))
467 {
468 /* Before leaving current page, deal with any killed items */
469 if (so->numKilled > 0)
470 _bt_killitems(scan);
471 BTScanPosUnpinIfPinned(so->currPos);
472 }
473
474 so->markItemIndex = -1;
475 BTScanPosUnpinIfPinned(so->markPos);
476
477 /* No need to invalidate positions, the RAM is about to be freed. */
478
479 /* Release storage */
480 if (so->keyData != NULL)
481 pfree(so->keyData);
482 /* so->arrayKeys and so->orderProcs are in arrayContext */
483 if (so->arrayContext != NULL)
484 MemoryContextDelete(so->arrayContext);
485 if (so->killedItems != NULL)
486 pfree(so->killedItems);
487 if (so->currTuples != NULL)
488 pfree(so->currTuples);
489 /* so->markTuples should not be pfree'd, see btrescan */
490 pfree(so);
491}
492
493/*
494 * btmarkpos() -- save current scan position
495 */
496void
497btmarkpos(IndexScanDesc scan)
498{
499 BTScanOpaque so = (BTScanOpaque) scan->opaque;
500
501 /* There may be an old mark with a pin (but no lock). */
502 BTScanPosUnpinIfPinned(so->markPos);
503
504 /*
505 * Just record the current itemIndex. If we later step to next page
506 * before releasing the marked position, _bt_steppage makes a full copy of
507 * the currPos struct in markPos. If (as often happens) the mark is moved
508 * before we leave the page, we don't have to do that work.
509 */
510 if (BTScanPosIsValid(so->currPos))
511 so->markItemIndex = so->currPos.itemIndex;
512 else
513 {
514 BTScanPosInvalidate(so->markPos);
515 so->markItemIndex = -1;
516 }
517}
518
519/*
520 * btrestrpos() -- restore scan to last saved position
521 */
522void
523btrestrpos(IndexScanDesc scan)
524{
525 BTScanOpaque so = (BTScanOpaque) scan->opaque;
526
527 if (so->markItemIndex >= 0)
528 {
529 /*
530 * The scan has never moved to a new page since the last mark. Just
531 * restore the itemIndex.
532 *
533 * NB: In this case we can't count on anything in so->markPos to be
534 * accurate.
535 */
536 so->currPos.itemIndex = so->markItemIndex;
537 }
538 else
539 {
540 /*
541 * The scan moved to a new page after last mark or restore, and we are
542 * now restoring to the marked page. We aren't holding any read
543 * locks, but if we're still holding the pin for the current position,
544 * we must drop it.
545 */
546 if (BTScanPosIsValid(so->currPos))
547 {
548 /* Before leaving current page, deal with any killed items */
549 if (so->numKilled > 0)
550 _bt_killitems(scan);
551 BTScanPosUnpinIfPinned(so->currPos);
552 }
553
554 if (BTScanPosIsValid(so->markPos))
555 {
556 /* bump pin on mark buffer for assignment to current buffer */
557 if (BTScanPosIsPinned(so->markPos))
558 IncrBufferRefCount(so->markPos.buf);
559 memcpy(&so->currPos, &so->markPos,
560 offsetof(BTScanPosData, items[1]) +
561 so->markPos.lastItem * sizeof(BTScanPosItem));
562 if (so->currTuples)
563 memcpy(so->currTuples, so->markTuples,
564 so->markPos.nextTupleOffset);
565 /* Reset the scan's array keys (see _bt_steppage for why) */
566 if (so->numArrayKeys)
567 {
568 _bt_start_array_keys(scan, so->currPos.dir);
569 so->needPrimScan = false;
570 }
571 }
572 else
573 BTScanPosInvalidate(so->currPos);
574 }
575}
576
577/*
578 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
579 */
580Size
581btestimateparallelscan(Relation rel, int nkeys, int norderbys)
582{
583 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
584 Size estnbtreeshared,
585 genericattrspace;
586
587 /*
588 * Pessimistically assume that every input scan key will be output with
589 * its own SAOP array
590 */
591 estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
592 sizeof(int) * nkeys;
593
594 /* Single column indexes cannot possibly use a skip array */
595 if (nkeyatts == 1)
596 return estnbtreeshared;
597
598 /*
599 * Pessimistically assume that all attributes prior to the least
600 * significant attribute require a skip array (and an associated key)
601 */
602 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
603 sizeof(Datum));
604 for (int attnum = 1; attnum < nkeyatts; attnum++)
605 {
606 CompactAttribute *attr;
607
608 /*
609 * We make the conservative assumption that every index column will
610 * also require a skip array.
611 *
612 * Every skip array must have space to store its scan key's sk_flags.
613 */
614 estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
615
616 /* Consider space required to store a datum of opclass input type */
617 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
618 if (attr->attbyval)
619 {
620 /* This index attribute stores pass-by-value datums */
621 Size estfixed = datumEstimateSpace((Datum) 0, false,
622 true, attr->attlen);
623
624 estnbtreeshared = add_size(estnbtreeshared, estfixed);
625 continue;
626 }
627
628 /*
629 * This index attribute stores pass-by-reference datums.
630 *
631 * Assume that serializing this array will use just as much space as a
632 * pass-by-value datum, in addition to space for the largest possible
633 * whole index tuple (this is not just a per-datum portion of the
634 * largest possible tuple because that'd be almost as large anyway).
635 *
636 * This is quite conservative, but it's not clear how we could do much
637 * better. The executor requires an up-front storage request size
638 * that reliably covers the scan's high watermark memory usage. We
639 * can't be sure of the real high watermark until the scan is over.
640 */
641 estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
642 estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
643 }
644
645 return estnbtreeshared;
646}
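Editor's note: a small worked example of the estimate above, using hypothetical numbers; it only restates the arithmetic of btestimateparallelscan().

/*
 * Worked example (hypothetical index): for a two-key-column index scanned
 * with nkeys = 2, the fixed part is
 *     offsetof(BTParallelScanDescData, btps_arrElems) + 2 * sizeof(int),
 * and the loop runs once (attnum = 1), adding sizeof(int) for the skip
 * array's sk_flags plus either a fixed-width datumEstimateSpace() result
 * (pass-by-value attribute) or genericattrspace + BTMaxItemSize
 * (pass-by-reference attribute).
 */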
647
648/*
649 * _bt_start_prim_scan() -- start scheduled primitive index scan?
650 *
651 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
652 * as the last one ended. Otherwise returns false, indicating that the array
653 * keys are now fully exhausted.
654 *
655 * Only call here during scans with one or more equality type array scan keys,
656 * after _bt_first or _bt_next return false.
657 */
658static bool
659_bt_start_prim_scan(IndexScanDesc scan)
660{
661 BTScanOpaque so = (BTScanOpaque) scan->opaque;
662
663 Assert(so->numArrayKeys);
664
665 so->scanBehind = so->oppositeDirCheck = false; /* reset */
666
667 /*
668 * Array keys are advanced within _bt_checkkeys when the scan reaches the
669 * leaf level (more precisely, they're advanced when the scan reaches the
670 * end of each distinct set of array elements). This process avoids
671 * repeat access to leaf pages (across multiple primitive index scans) by
672 * advancing the scan's array keys when it allows the primitive index scan
673 * to find nearby matching tuples (or when it eliminates ranges of array
674 * key space that can't possibly be satisfied by any index tuple).
675 *
676 * _bt_checkkeys sets a simple flag variable to schedule another primitive
677 * index scan. The flag tells us what to do.
678 *
679 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
680 * various cases where that won't happen. For example, if the index is
681 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
682 * We also don't expect a call to _bt_checkkeys during searches for a
683 * non-existent value that happens to be lower/higher than any existing
684 * value in the index.
685 *
686 * We don't require special handling for these cases -- we don't need to
687 * be explicitly instructed to _not_ perform another primitive index scan.
688 * It's up to code under the control of _bt_first to always set the flag
689 * when another primitive index scan will be required.
690 *
691 * This works correctly, even with the tricky cases listed above, which
692 * all involve access to leaf pages "near the boundaries of the key space"
693 * (whether it's from a leftmost/rightmost page, or an imaginary empty
694 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
695 * index scan for one set of array keys, then it also won't be reached for
696 * any later set ("later" in terms of the direction that we scan the index
697 * and advance the arrays). The array keys won't have advanced in these
698 * cases, but that's the correct behavior (even _bt_advance_array_keys
699 * won't always advance the arrays at the point they become "exhausted").
700 */
701 if (so->needPrimScan)
702 {
703 /*
704 * Flag was set -- must call _bt_first again, which will reset the
705 * scan's needPrimScan flag
706 */
707 return true;
708 }
709
710 /* The top-level index scan ran out of tuples in this scan direction */
711 if (scan->parallel_scan != NULL)
712 _bt_parallel_done(scan);
713
714 return false;
715}
716
717/*
718 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
719 *
720 * Caller must have exclusively locked btscan->btps_lock when called.
721 */
722static void
723_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
724 BTScanOpaque so)
725{
726 char *datumshared;
727
728 /* Space for serialized datums begins immediately after btps_arrElems[] */
729 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
730 for (int i = 0; i < so->numArrayKeys; i++)
731 {
732 BTArrayKeyInfo *array = &so->arrayKeys[i];
733 ScanKey skey = &so->keyData[array->scan_key];
734
735 if (array->num_elems != -1)
736 {
737 /* Save SAOP array's cur_elem (no need to copy key/datum) */
738 Assert(!(skey->sk_flags & SK_BT_SKIP));
739 btscan->btps_arrElems[i] = array->cur_elem;
740 continue;
741 }
742
743 /* Save all mutable state associated with skip array's key */
744 Assert(skey->sk_flags & SK_BT_SKIP);
745 memcpy(datumshared, &skey->sk_flags, sizeof(int));
746 datumshared += sizeof(int);
747
748 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
749 {
750 /* No sk_argument datum to serialize */
751 Assert(skey->sk_argument == 0);
752 continue;
753 }
754
755 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
756 array->attbyval, array->attlen, &datumshared);
757 }
758}
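Editor's note: the comment below is an editorial summary of the serialized layout that _bt_parallel_serialize_arrays() writes and _bt_parallel_restore_arrays() reads back; it adds no new behavior.

/*
 * Shared-memory layout produced above (summary):
 *
 *   btps_arrElems[0 .. numArrayKeys - 1]  cur_elem for each SAOP array
 *                                         (slots for skip arrays are unused)
 *   then, for each skip array in scan key order:
 *       int    sk_flags
 *       datum  sk_argument via datumSerialize(), but only when neither
 *              SK_BT_MINVAL nor SK_BT_MAXVAL is set in sk_flags
 */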
759
760/*
761 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
762 *
763 * Caller must have exclusively locked btscan->btps_lock when called.
764 */
765static void
766_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
767 BTScanOpaque so)
768{
769 char *datumshared;
770
771 /* Space for serialized datums begins immediately after btps_arrElems[] */
772 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
773 for (int i = 0; i < so->numArrayKeys; i++)
774 {
775 BTArrayKeyInfo *array = &so->arrayKeys[i];
776 ScanKey skey = &so->keyData[array->scan_key];
777 bool isnull;
778
779 if (array->num_elems != -1)
780 {
781 /* Restore SAOP array using its saved cur_elem */
782 Assert(!(skey->sk_flags & SK_BT_SKIP));
783 array->cur_elem = btscan->btps_arrElems[i];
784 skey->sk_argument = array->elem_values[array->cur_elem];
785 continue;
786 }
787
788 /* Restore skip array by restoring its key directly */
789 if (!array->attbyval && skey->sk_argument)
790 pfree(DatumGetPointer(skey->sk_argument));
791 skey->sk_argument = (Datum) 0;
792 memcpy(&skey->sk_flags, datumshared, sizeof(int));
793 datumshared += sizeof(int);
794
795 Assert(skey->sk_flags & SK_BT_SKIP);
796
797 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
798 {
799 /* No sk_argument datum to restore */
800 continue;
801 }
802
803 skey->sk_argument = datumRestore(&datumshared, &isnull);
804 if (isnull)
805 {
806 Assert(skey->sk_argument == 0);
807 Assert(skey->sk_flags & SK_SEARCHNULL);
808 Assert(skey->sk_flags & SK_ISNULL);
809 }
810 }
811}
812
813/*
814 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
815 */
816void
817btinitparallelscan(void *target)
818{
819 BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
820
821 LWLockInitialize(&bt_target->btps_lock,
822 LWTRANCHE_PARALLEL_BTREE_SCAN);
823 bt_target->btps_nextScanPage = InvalidBlockNumber;
824 bt_target->btps_lastCurrPage = InvalidBlockNumber;
825 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
826 ConditionVariableInit(&bt_target->btps_cv);
827}
828
829/*
830 * btparallelrescan() -- reset parallel scan
831 */
832void
833btparallelrescan(IndexScanDesc scan)
834{
835 BTParallelScanDesc btscan;
836 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
837
838 Assert(parallel_scan);
839
840 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
841 parallel_scan->ps_offset_am);
842
843 /*
844 * In theory, we don't need to acquire the LWLock here, because there
845 * shouldn't be any other workers running at this point, but we do so for
846 * consistency.
847 */
848 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
849 btscan->btps_nextScanPage = InvalidBlockNumber;
850 btscan->btps_lastCurrPage = InvalidBlockNumber;
851 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
852 LWLockRelease(&btscan->btps_lock);
853}
854
855/*
856 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
857 * page. Other scans must wait until we call _bt_parallel_release()
858 * or _bt_parallel_done().
859 *
860 * The return value is true if we successfully seized the scan and false
861 * if we did not. The latter case occurs when no pages remain, or when
862 * another primitive index scan is scheduled that caller's backend cannot
863 * start just yet (only backends that call from _bt_first are capable of
864 * starting primitive index scans, which they indicate by passing first=true).
865 *
866 * If the return value is true, *next_scan_page returns the next page of the
867 * scan, and *last_curr_page returns the page that *next_scan_page came from.
868 * An invalid *next_scan_page means the scan hasn't yet started, or that
869 * caller needs to start the next primitive index scan (if it's the latter
870 * case we'll set so.needPrimScan).
871 *
872 * Callers should ignore the value of *next_scan_page and *last_curr_page if
873 * the return value is false.
874 */
875bool
876_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
877 BlockNumber *last_curr_page, bool first)
878{
879 Relation rel = scan->indexRelation;
880 BTScanOpaque so = (BTScanOpaque) scan->opaque;
881 bool exit_loop = false,
882 status = true,
883 endscan = false;
884 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
885 BTParallelScanDesc btscan;
886
887 *next_scan_page = InvalidBlockNumber;
888 *last_curr_page = InvalidBlockNumber;
889
890 /*
891 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
892 * call to _bt_readnextpage treats this backend similarly to a serial
893 * backend that steps from *last_curr_page to *next_scan_page (unless this
894 * backend's so->currPos is initialized by _bt_readfirstpage before then).
895 */
896 BTScanPosInvalidate(so->currPos);
897 so->currPos.moreLeft = so->currPos.moreRight = true;
898
899 if (first)
900 {
901 /*
902 * Initialize array related state when called from _bt_first, assuming
903 * that this will be the first primitive index scan for the scan
904 */
905 so->needPrimScan = false;
906 so->scanBehind = false;
907 so->oppositeDirCheck = false;
908 }
909 else
910 {
911 /*
912 * Don't attempt to seize the scan when it requires another primitive
913 * index scan, since caller's backend cannot start it right now
914 */
915 if (so->needPrimScan)
916 return false;
917 }
918
919 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
920 parallel_scan->ps_offset_am);
921
922 while (1)
923 {
924 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
925
926 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
927 {
928 /* We're done with this parallel index scan */
929 status = false;
930 }
931 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
932 btscan->btps_nextScanPage == P_NONE)
933 {
934 /* End this parallel index scan */
935 status = false;
936 endscan = true;
937 }
938 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
939 {
940 Assert(so->numArrayKeys);
941
942 if (first)
943 {
944 /* Can start scheduled primitive scan right away, so do so */
945 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
946
947 /* Restore scan's array keys from serialized values */
948 _bt_parallel_restore_arrays(rel, btscan, so);
949 exit_loop = true;
950 }
951 else
952 {
953 /*
954 * Don't attempt to seize the scan when it requires another
955 * primitive index scan, since caller's backend cannot start
956 * it right now
957 */
958 status = false;
959 }
960
961 /*
962 * Either way, update backend local state to indicate that a
963 * pending primitive scan is required
964 */
965 so->needPrimScan = true;
966 so->scanBehind = false;
967 so->oppositeDirCheck = false;
968 }
969 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
970 {
971 /*
972 * We have successfully seized control of the scan for the purpose
973 * of advancing it to a new page!
974 */
975 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
976 Assert(btscan->btps_nextScanPage != P_NONE);
977 *next_scan_page = btscan->btps_nextScanPage;
978 *last_curr_page = btscan->btps_lastCurrPage;
979 exit_loop = true;
980 }
981 LWLockRelease(&btscan->btps_lock);
982 if (exit_loop || !status)
983 break;
984 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
985 }
986 ConditionVariableCancelSleep();
987
988 /* When the scan has reached the rightmost (or leftmost) page, end it */
989 if (endscan)
990 _bt_parallel_done(scan);
991
992 return status;
993}
994
995/*
996 * _bt_parallel_release() -- Complete the process of advancing the scan to a
997 * new page. We now have the new value btps_nextScanPage; another backend
998 * can now begin advancing the scan.
999 *
1000 * Callers whose scan uses array keys must save their curr_page argument so
1001 * that it can be passed to _bt_parallel_primscan_schedule, should caller
1002 * determine that another primitive index scan is required.
1003 *
1004 * If caller's next_scan_page is P_NONE, the scan has reached the index's
1005 * rightmost/leftmost page. This is treated as reaching the end of the scan
1006 * within _bt_parallel_seize.
1007 *
1008 * Note: unlike the serial case, parallel scans don't need to remember both
1009 * sibling links. next_scan_page is whichever link is next given the scan's
1010 * direction. That's all we'll ever need, since the direction of a parallel
1011 * scan can never change.
1012 */
1013void
1014_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1015 BlockNumber curr_page)
1016{
1017 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1018 BTParallelScanDesc btscan;
1019
1020 Assert(BlockNumberIsValid(next_scan_page));
1021
1022 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1023 parallel_scan->ps_offset_am);
1024
1025 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1026 btscan->btps_nextScanPage = next_scan_page;
1027 btscan->btps_lastCurrPage = curr_page;
1028 btscan->btps_pageStatus = BTPARALLEL_IDLE;
1029 LWLockRelease(&btscan->btps_lock);
1030 ConditionVariableSignal(&btscan->btps_cv);
1031}
1032
1033/*
1034 * _bt_parallel_done() -- Mark the parallel scan as complete.
1035 *
1036 * When there are no pages left to scan, this function should be called to
1037 * notify other workers. Otherwise, they might wait forever for the scan to
1038 * advance to the next page.
1039 */
1040void
1041_bt_parallel_done(IndexScanDesc scan)
1042{
1043 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1044 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1045 BTParallelScanDesc btscan;
1046 bool status_changed = false;
1047
1048 Assert(!BTScanPosIsValid(so->currPos));
1049
1050 /* Do nothing, for non-parallel scans */
1051 if (parallel_scan == NULL)
1052 return;
1053
1054 /*
1055 * Should not mark parallel scan done when there's still a pending
1056 * primitive index scan
1057 */
1058 if (so->needPrimScan)
1059 return;
1060
1061 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1062 parallel_scan->ps_offset_am);
1063
1064 /*
1065 * Mark the parallel scan as done, unless some other process did so
1066 * already
1067 */
1068 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1069 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1070 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1071 {
1072 btscan->btps_pageStatus = BTPARALLEL_DONE;
1073 status_changed = true;
1074 }
1075 LWLockRelease(&btscan->btps_lock);
1076
1077 /* wake up all the workers associated with this parallel scan */
1078 if (status_changed)
1079 ConditionVariableBroadcast(&btscan->btps_cv);
1080}
1081
1082/*
1083 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1084 *
1085 * Caller passes the curr_page most recently passed to _bt_parallel_release
1086 * by its backend. Caller successfully schedules the next primitive index scan
1087 * if the shared parallel state hasn't been seized since caller's backend last
1088 * advanced the scan.
1089 */
1090void
1091_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1092{
1093 Relation rel = scan->indexRelation;
1094 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1095 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1096 BTParallelScanDesc btscan;
1097
1098 Assert(so->numArrayKeys);
1099
1100 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1101 parallel_scan->ps_offset_am);
1102
1103 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1104 if (btscan->btps_lastCurrPage == curr_page &&
1105 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1106 {
1107 btscan->btps_nextScanPage = InvalidBlockNumber;
1108 btscan->btps_lastCurrPage = InvalidBlockNumber;
1109 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1110
1111 /* Serialize scan's current array keys */
1112 _bt_parallel_serialize_arrays(rel, btscan, so);
1113 }
1114 LWLockRelease(&btscan->btps_lock);
1115}
1116
1117/*
1118 * Bulk deletion of all index entries pointing to a set of heap tuples.
1119 * The set of target tuples is specified via a callback routine that tells
1120 * whether any given heap tuple (identified by ItemPointer) is being deleted.
1121 *
1122 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1123 */
1124IndexBulkDeleteResult *
1125btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1126 IndexBulkDeleteCallback callback, void *callback_state)
1127{
1128 Relation rel = info->index;
1129 BTCycleId cycleid;
1130
1131 /* allocate stats if first time through, else re-use existing struct */
1132 if (stats == NULL)
1133 stats = palloc0_object(IndexBulkDeleteResult);
1134
1135 /* Establish the vacuum cycle ID to use for this scan */
1136 /* The ENSURE stuff ensures we clean up shared memory on failure */
1137 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1138 {
1139 cycleid = _bt_start_vacuum(rel);
1140
1141 btvacuumscan(info, stats, callback, callback_state, cycleid);
1142 }
1143 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1144 _bt_end_vacuum(rel);
1145
1146 return stats;
1147}
1148
1149/*
1150 * Post-VACUUM cleanup.
1151 *
1152 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1153 */
1154IndexBulkDeleteResult *
1155btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1156{
1157 BlockNumber num_delpages;
1158
1159 /* No-op in ANALYZE ONLY mode */
1160 if (info->analyze_only)
1161 return stats;
1162
1163 /*
1164 * If btbulkdelete was called, we need not do anything (we just maintain
1165 * the information used within _bt_vacuum_needs_cleanup() by calling
1166 * _bt_set_cleanup_info() below).
1167 *
1168 * If btbulkdelete was _not_ called, then we have a choice to make: we
1169 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1170 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1171 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1172 * now.
1173 */
1174 if (stats == NULL)
1175 {
1176 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1177 if (!_bt_vacuum_needs_cleanup(info->index))
1178 return NULL;
1179
1180 /*
1181 * Since we aren't going to actually delete any leaf items, there's no
1182 * need to go through all the vacuum-cycle-ID pushups here.
1183 *
1184 * Posting list tuples are a source of inaccuracy for cleanup-only
1185 * scans. btvacuumscan() will assume that the number of index tuples
1186 * from each page can be used as num_index_tuples, even though
1187 * num_index_tuples is supposed to represent the number of TIDs in the
1188 * index. This naive approach can underestimate the number of tuples
1189 * in the index significantly.
1190 *
1191 * We handle the problem by making num_index_tuples an estimate in
1192 * cleanup-only case.
1193 */
1194 stats = palloc0_object(IndexBulkDeleteResult);
1195 btvacuumscan(info, stats, NULL, NULL, 0);
1196 stats->estimated_count = true;
1197 }
1198
1199 /*
1200 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1201 *
1202 * num_delpages is the number of deleted pages now in the index that were
1203 * not safe to place in the FSM to be recycled just yet. num_delpages is
1204 * greater than 0 only when _bt_pagedel() actually deleted pages during
1205 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1206 * have failed to place any newly deleted pages in the FSM just moments
1207 * ago. (Actually, there are edge cases where recycling of the current
1208 * VACUUM's newly deleted pages does not even become safe by the time the
1209 * next VACUUM comes around. See nbtree/README.)
1210 */
1211 Assert(stats->pages_deleted >= stats->pages_free);
1212 num_delpages = stats->pages_deleted - stats->pages_free;
1213 _bt_set_cleanup_info(info->index, num_delpages);
1214
1215 /*
1216 * It's quite possible for us to be fooled by concurrent page splits into
1217 * double-counting some index tuples, so disbelieve any total that exceeds
1218 * the underlying heap's count ... if we know that accurately. Otherwise
1219 * this might just make matters worse.
1220 */
1221 if (!info->estimated_count)
1222 {
1223 if (stats->num_index_tuples > info->num_heap_tuples)
1224 stats->num_index_tuples = info->num_heap_tuples;
1225 }
1226
1227 return stats;
1228}
1229
1230/*
1231 * btvacuumscan --- scan the index for VACUUMing purposes
1232 *
1233 * This combines the functions of looking for leaf tuples that are deletable
1234 * according to the vacuum callback, looking for empty pages that can be
1235 * deleted, and looking for old deleted pages that can be recycled. Both
1236 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1237 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1238 *
1239 * The caller is responsible for initially allocating/zeroing a stats struct
1240 * and for obtaining a vacuum cycle ID if necessary.
1241 */
1242static void
1243btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1244 IndexBulkDeleteCallback callback, void *callback_state,
1245 BTCycleId cycleid)
1246{
1247 Relation rel = info->index;
1248 BTVacState vstate;
1249 BlockNumber num_pages;
1250 bool needLock;
1251 BlockRangeReadStreamPrivate p;
1252 ReadStream *stream = NULL;
1253
1254 /*
1255 * Reset fields that track information about the entire index now. This
1256 * avoids double-counting in the case where a single VACUUM command
1257 * requires multiple scans of the index.
1258 *
1259 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1260 * since they track information about the VACUUM command, and so must last
1261 * across each call to btvacuumscan().
1262 *
1263 * (Note that pages_free is treated as state about the whole index, not
1264 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1265 * calls are idempotent, and get repeated for the same deleted pages in
1266 * some scenarios. The point for us is to track the number of recyclable
1267 * pages in the index at the end of the VACUUM command.)
1268 */
1269 stats->num_pages = 0;
1270 stats->num_index_tuples = 0;
1271 stats->pages_deleted = 0;
1272 stats->pages_free = 0;
1273
1274 /* Set up info to pass down to btvacuumpage */
1275 vstate.info = info;
1276 vstate.stats = stats;
1277 vstate.callback = callback;
1278 vstate.callback_state = callback_state;
1279 vstate.cycleid = cycleid;
1280
1281 /* Create a temporary memory context to run _bt_pagedel in */
1283 "_bt_pagedel",
1285
1286 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1287 vstate.bufsize = 0;
1288 vstate.maxbufsize = 0;
1289 vstate.pendingpages = NULL;
1290 vstate.npendingpages = 0;
1291 /* Consider applying _bt_pendingfsm_finalize optimization */
1292 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1293
1294 /*
1295 * The outer loop iterates over all index pages except the metapage, in
1296 * physical order (we hope the kernel will cooperate in providing
1297 * read-ahead for speed). It is critical that we visit all leaf pages,
1298 * including ones added after we start the scan, else we might fail to
1299 * delete some deletable tuples. Hence, we must repeatedly check the
1300 * relation length. We must acquire the relation-extension lock while
1301 * doing so to avoid a race condition: if someone else is extending the
1302 * relation, there is a window where bufmgr/smgr have created a new
1303 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1304 * we manage to scan such a page here, we'll improperly assume it can be
1305 * recycled. Taking the lock synchronizes things enough to prevent a
1306 * problem: either num_pages won't include the new page, or _bt_getbuf
1307 * already has write lock on the buffer and it will be fully initialized
1308 * before we can examine it. Also, we need not worry if a page is added
1309 * immediately after we look; the page splitting code already has
1310 * write-lock on the left page before it adds a right page, so we must
1311 * already have processed any tuples due to be moved into such a page.
1312 *
1313 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1314 * think the use of the extension lock is still required.
1315 *
1316 * We can skip locking for new or temp relations, however, since no one
1317 * else could be accessing them.
1318 */
1319 needLock = !RELATION_IS_LOCAL(rel);
1320
1321 p.current_blocknum = BTREE_METAPAGE + 1;
1322
1323 /*
1324 * It is safe to use batchmode as block_range_read_stream_cb takes no
1325 * locks.
1326 */
1327 stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1328 READ_STREAM_FULL |
1329 READ_STREAM_USE_BATCHING,
1330 info->strategy,
1331 rel,
1332 MAIN_FORKNUM,
1333 block_range_read_stream_cb,
1334 &p,
1335 0);
1336 for (;;)
1337 {
1338 /* Get the current relation length */
1339 if (needLock)
1340 LockRelationForExtension(rel, ExclusiveLock);
1341 num_pages = RelationGetNumberOfBlocks(rel);
1342 if (needLock)
1343 UnlockRelationForExtension(rel, ExclusiveLock);
1344
1345 if (info->report_progress)
1346 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1347 num_pages);
1348
1349 /* Quit if we've scanned the whole relation */
1350 if (p.current_blocknum >= num_pages)
1351 break;
1352
1353 p.last_exclusive = num_pages;
1354
1355 /* Iterate over pages, then loop back to recheck relation length */
1356 while (true)
1357 {
1358 BlockNumber current_block;
1359 Buffer buf;
1360
1361 /* call vacuum_delay_point while not holding any buffer lock */
1362 vacuum_delay_point(false);
1363
1364 buf = read_stream_next_buffer(stream, NULL);
1365
1366 if (!BufferIsValid(buf))
1367 break;
1368
1369 current_block = btvacuumpage(&vstate, buf);
1370
1371 if (info->report_progress)
1372 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1373 current_block);
1374 }
1375
1376 /*
1377 * We have to reset the read stream to use it again. After returning
1378 * InvalidBuffer, the read stream API won't invoke our callback again
1379 * until the stream has been reset.
1380 */
1381 read_stream_reset(stream);
1382 }
1383
1384 read_stream_end(stream);
1385
1386 /* Set statistics num_pages field to final size of index */
1387 stats->num_pages = num_pages;
1388
1389 MemoryContextDelete(vstate.pagedelcontext);
1390
1391 /*
1392 * If there were any calls to _bt_pagedel() during scan of the index then
1393 * see if any of the resulting pages can be placed in the FSM now. When
1394 * it's not safe we'll have to leave it up to a future VACUUM operation.
1395 *
1396 * Finally, if we placed any pages in the FSM (either just now or during
1397 * the scan), forcibly update the upper-level FSM pages to ensure that
1398 * searchers can find them.
1399 */
1400 _bt_pendingfsm_finalize(rel, &vstate);
1401 if (stats->pages_free > 0)
1402 IndexFreeSpaceMapVacuum(rel);
1403}
1404
1405/*
1406 * btvacuumpage --- VACUUM one page
1407 *
1408 * This processes a single page for btvacuumscan(). In some cases we must
1409 * backtrack to re-examine and VACUUM pages that were on buf's page during
1410 * a previous call here. This is how we handle page splits (that happened
1411 * after our cycleid was acquired) whose right half page happened to reuse
1412 * a block that we might have processed at some point before it was
1413 * recycled (i.e. before the page split).
1414 *
1415 * Returns BlockNumber of a scanned page (not backtracked).
1416 */
1417static BlockNumber
1418btvacuumpage(BTVacState *vstate, Buffer buf)
1419{
1420 IndexVacuumInfo *info = vstate->info;
1421 IndexBulkDeleteResult *stats = vstate->stats;
1422 IndexBulkDeleteCallback callback = vstate->callback;
1423 void *callback_state = vstate->callback_state;
1424 Relation rel = info->index;
1425 Relation heaprel = info->heaprel;
1426 bool attempt_pagedel;
1427 BlockNumber blkno,
1428 backtrack_to;
1429 BlockNumber scanblkno = BufferGetBlockNumber(buf);
1430 Page page;
1431 BTPageOpaque opaque;
1432
1433 blkno = scanblkno;
1434
1435backtrack:
1436
1437 attempt_pagedel = false;
1438 backtrack_to = P_NONE;
1439
1440 _bt_lockbuf(rel, buf, BT_READ);
1441 page = BufferGetPage(buf);
1442 opaque = NULL;
1443 if (!PageIsNew(page))
1444 {
1445 _bt_checkpage(rel, buf);
1446 opaque = BTPageGetOpaque(page);
1447 }
1448
1449 Assert(blkno <= scanblkno);
1450 if (blkno != scanblkno)
1451 {
1452 /*
1453 * We're backtracking.
1454 *
1455 * We followed a right link to a sibling leaf page (a page that
1456 * happens to be from a block located before scanblkno). The only
1457 * case we want to do anything with is a live leaf page having the
1458 * current vacuum cycle ID.
1459 *
1460 * The page had better be in a state that's consistent with what we
1461 * expect. Check for conditions that imply corruption in passing. It
1462 * can't be half-dead because only an interrupted VACUUM process can
1463 * leave pages in that state, so we'd definitely have dealt with it
1464 * back when the page was the scanblkno page (half-dead pages are
1465 * always marked fully deleted by _bt_pagedel(), barring corruption).
1466 */
1467 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1468 {
1469 Assert(false);
1470 ereport(LOG,
1471 (errcode(ERRCODE_INDEX_CORRUPTED),
1472 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1473 blkno, scanblkno, RelationGetRelationName(rel))));
1474 _bt_relbuf(rel, buf);
1475 return scanblkno;
1476 }
1477
1478 /*
1479 * We may have already processed the page in an earlier call, when the
1480 * page was scanblkno. This happens when the leaf page split occurred
1481 * after the scan began, but before the right sibling page became the
1482 * scanblkno.
1483 *
1484 * Page may also have been deleted by current btvacuumpage() call,
1485 * since _bt_pagedel() sometimes deletes the right sibling page of
1486 * scanblkno in passing (it does so after we decided where to
1487 * backtrack to). We don't need to process this page as a deleted
1488 * page a second time now (in fact, it would be wrong to count it as a
1489 * deleted page in the bulk delete statistics a second time).
1490 */
1491 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1492 {
1493 /* Done with current scanblkno (and all lower split pages) */
1494 _bt_relbuf(rel, buf);
1495 return scanblkno;
1496 }
1497 }
1498
1499 if (!opaque || BTPageIsRecyclable(page, heaprel))
1500 {
1501 /* Okay to recycle this page (which could be leaf or internal) */
1502 RecordFreeIndexPage(rel, blkno);
1503 stats->pages_deleted++;
1504 stats->pages_free++;
1505 }
1506 else if (P_ISDELETED(opaque))
1507 {
1508 /*
1509 * Already deleted page (which could be leaf or internal). Can't
1510 * recycle yet.
1511 */
1512 stats->pages_deleted++;
1513 }
1514 else if (P_ISHALFDEAD(opaque))
1515 {
1516 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1517 attempt_pagedel = true;
1518
1519 /*
1520 * _bt_pagedel() will increment both pages_newly_deleted and
1521 * pages_deleted stats in all cases (barring corruption)
1522 */
1523 }
1524 else if (P_ISLEAF(opaque))
1525 {
1526 OffsetNumber deletable[MaxIndexTuplesPerPage];
1527 int ndeletable;
1528 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1529 int nupdatable;
1530 OffsetNumber offnum,
1531 minoff,
1532 maxoff;
1533 int nhtidsdead,
1534 nhtidslive;
1535
1536 /*
1537 * Trade in the initial read lock for a full cleanup lock on this
1538 * page. We must get such a lock on every leaf page over the course
1539 * of the vacuum scan, whether or not it actually contains any
1540 * deletable tuples --- see nbtree/README.
1541 */
1542 _bt_upgradelockbufcleanup(rel, buf);
1543
1544 /*
1545 * Check whether we need to backtrack to earlier pages. What we are
1546 * concerned about is a page split that happened since we started the
1547 * vacuum scan. If the split moved tuples on the right half of the
1548 * split (i.e. the tuples that sort high) to a block that we already
1549 * passed over, then we might have missed the tuples. We need to
1550 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1551 * or deleting scanblkno page below!)
1552 */
1553 if (vstate->cycleid != 0 &&
1554 opaque->btpo_cycleid == vstate->cycleid &&
1555 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1556 !P_RIGHTMOST(opaque) &&
1557 opaque->btpo_next < scanblkno)
1558 backtrack_to = opaque->btpo_next;
1559
1560 ndeletable = 0;
1561 nupdatable = 0;
1562 minoff = P_FIRSTDATAKEY(opaque);
1563 maxoff = PageGetMaxOffsetNumber(page);
1564 nhtidsdead = 0;
1565 nhtidslive = 0;
1566 if (callback)
1567 {
1568 /* btbulkdelete callback tells us what to delete (or update) */
1569 for (offnum = minoff;
1570 offnum <= maxoff;
1571 offnum = OffsetNumberNext(offnum))
1572 {
1573 IndexTuple itup;
1574
1575 itup = (IndexTuple) PageGetItem(page,
1576 PageGetItemId(page, offnum));
1577
1578 Assert(!BTreeTupleIsPivot(itup));
1579 if (!BTreeTupleIsPosting(itup))
1580 {
1581 /* Regular tuple, standard table TID representation */
1582 if (callback(&itup->t_tid, callback_state))
1583 {
1584 deletable[ndeletable++] = offnum;
1585 nhtidsdead++;
1586 }
1587 else
1588 nhtidslive++;
1589 }
1590 else
1591 {
1592 BTVacuumPosting vacposting;
1593 int nremaining;
1594
1595 /* Posting list tuple */
1596 vacposting = btreevacuumposting(vstate, itup, offnum,
1597 &nremaining);
1598 if (vacposting == NULL)
1599 {
1600 /*
1601 * All table TIDs from the posting tuple remain, so no
1602 * delete or update required
1603 */
1604 Assert(nremaining == BTreeTupleGetNPosting(itup));
1605 }
1606 else if (nremaining > 0)
1607 {
1608
1609 /*
1610 * Store metadata about posting list tuple in
1611 * updatable array for entire page. Existing tuple
1612 * will be updated during the later call to
1613 * _bt_delitems_vacuum().
1614 */
1615 Assert(nremaining < BTreeTupleGetNPosting(itup));
1616 updatable[nupdatable++] = vacposting;
1617 nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1618 }
1619 else
1620 {
1621 /*
1622 * All table TIDs from the posting list must be
1623 * deleted. We'll delete the index tuple completely
1624 * (no update required).
1625 */
1626 Assert(nremaining == 0);
1627 deletable[ndeletable++] = offnum;
1628 nhtidsdead += BTreeTupleGetNPosting(itup);
1629 pfree(vacposting);
1630 }
1631
1632 nhtidslive += nremaining;
1633 }
1634 }
1635 }
1636
1637 /*
1638 * Apply any needed deletes or updates. We issue just one
1639 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1640 */
1641 if (ndeletable > 0 || nupdatable > 0)
1642 {
1643 Assert(nhtidsdead >= ndeletable + nupdatable);
1644 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1645 nupdatable);
1646
1647 stats->tuples_removed += nhtidsdead;
1648 /* must recompute maxoff */
1649 maxoff = PageGetMaxOffsetNumber(page);
1650
1651 /* can't leak memory here */
1652 for (int i = 0; i < nupdatable; i++)
1653 pfree(updatable[i]);
1654 }
1655 else
1656 {
1657 /*
1658 * If the leaf page has been split during this vacuum cycle, it
1659 * seems worth expending a write to clear btpo_cycleid even if we
1660 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1661 * takes care of this.) This ensures we won't process the page
1662 * again.
1663 *
1664 * We treat this like a hint-bit update because there's no need to
1665 * WAL-log it.
1666 */
1667 Assert(nhtidsdead == 0);
1668 if (vstate->cycleid != 0 &&
1669 opaque->btpo_cycleid == vstate->cycleid)
1670 {
1671 opaque->btpo_cycleid = 0;
1672 MarkBufferDirtyHint(buf, true);
1673 }
1674 }
1675
1676 /*
1677 * If the leaf page is now empty, try to delete it; else count the
1678 * live tuples (live table TIDs in posting lists are counted as
1679 * separate live tuples). We don't delete when backtracking, though,
1680 * since that would require teaching _bt_pagedel() about backtracking
1681 * (doesn't seem worth adding more complexity to deal with that).
1682 *
1683 * We don't count the number of live TIDs during cleanup-only calls to
1684 * btvacuumscan (i.e. when callback is not set). We count the number
1685 * of index tuples directly instead. This avoids the expense of
1686 * directly examining all of the tuples on each page. VACUUM will
1687 * treat num_index_tuples as an estimate in cleanup-only case, so it
1688 * doesn't matter that this underestimates num_index_tuples
1689 * significantly in some cases.
1690 */
1691 if (minoff > maxoff)
1692 attempt_pagedel = (blkno == scanblkno);
1693 else if (callback)
1694 stats->num_index_tuples += nhtidslive;
1695 else
1696 stats->num_index_tuples += maxoff - minoff + 1;
1697
1698 Assert(!attempt_pagedel || nhtidslive == 0);
1699 }
1700
1701 if (attempt_pagedel)
1702 {
1703 MemoryContext oldcontext;
1704
1705 /* Run pagedel in a temp context to avoid memory leakage */
1706 MemoryContextReset(vstate->pagedelcontext);
1707 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1708
1709 /*
1710 * _bt_pagedel maintains the bulk delete stats on our behalf;
1711 * pages_newly_deleted and pages_deleted are likely to be incremented
1712 * during call
1713 */
1714 Assert(blkno == scanblkno);
1715 _bt_pagedel(rel, buf, vstate);
1716
1717 MemoryContextSwitchTo(oldcontext);
1718 /* pagedel released buffer, so we shouldn't */
1719 }
1720 else
1721 _bt_relbuf(rel, buf);
1722
1723 if (backtrack_to != P_NONE)
1724 {
1725 blkno = backtrack_to;
1726
1727 /* check for vacuum delay while not holding any buffer lock */
1728 vacuum_delay_point(false);
1729
1730 /*
1731 * We can't use _bt_getbuf() here because it always applies
1732 * _bt_checkpage(), which will barf on an all-zero page. We want to
1733 * recycle all-zero pages, not fail. Also, we want to use a
1734 * nondefault buffer access strategy.
1735 */
1736 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1737 info->strategy);
1738 goto backtrack;
1739 }
1740
1741 return scanblkno;
1742}
1743
1744/*
1745 * btreevacuumposting --- determine TIDs still needed in posting list
1746 *
1747 * Returns metadata describing how to build replacement tuple without the TIDs
1748 * that VACUUM needs to delete. Returned value is NULL in the common case
1749 * where no changes are needed to caller's posting list tuple (we avoid
1750 * allocating memory here as an optimization).
1751 *
1752 * The number of TIDs that should remain in the posting list tuple is set for
1753 * caller in *nremaining.
1754 */
1755static BTVacuumPosting
1756btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1757 OffsetNumber updatedoffset, int *nremaining)
1758{
1759 int live = 0;
1760 int nitem = BTreeTupleGetNPosting(posting);
1761 ItemPointer items = BTreeTupleGetPosting(posting);
1762 BTVacuumPosting vacposting = NULL;
1763
1764 for (int i = 0; i < nitem; i++)
1765 {
1766 if (!vstate->callback(items + i, vstate->callback_state))
1767 {
1768 /* Live table TID */
1769 live++;
1770 }
1771 else if (vacposting == NULL)
1772 {
1773 /*
1774 * First dead table TID encountered.
1775 *
1776 * It's now clear that we need to delete one or more dead table
1777 * TIDs, so start maintaining metadata describing how to update
1778 * existing posting list tuple.
1779 */
1780 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1781 nitem * sizeof(uint16));
1782
1783 vacposting->itup = posting;
1784 vacposting->updatedoffset = updatedoffset;
1785 vacposting->ndeletedtids = 0;
1786 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1787 }
1788 else
1789 {
1790 /* Second or subsequent dead table TID */
1791 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1792 }
1793 }
1794
1795 *nremaining = live;
1796 return vacposting;
1797}
1798
1799/*
1800 * btcanreturn() -- Check whether btree indexes support index-only scans.
1801 *
1802 * btrees always do, so this is trivial.
1803 */
1804bool
1805btcanreturn(Relation index, int attno)
1806{
1807 return true;
1808}
1809
1810/*
1811 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1812 */
1813int
1814btgettreeheight(Relation rel)
1815{
1816 return _bt_getrootheight(rel);
1817}
1818
1819CompareType
1820bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1821{
1822 switch (strategy)
1823 {
1824 case BTLessStrategyNumber:
1825 return COMPARE_LT;
1826 case BTLessEqualStrategyNumber:
1827 return COMPARE_LE;
1828 case BTEqualStrategyNumber:
1829 return COMPARE_EQ;
1830 case BTGreaterEqualStrategyNumber:
1831 return COMPARE_GE;
1832 case BTGreaterStrategyNumber:
1833 return COMPARE_GT;
1834 default:
1835 return COMPARE_INVALID;
1836 }
1837}
1838
1839StrategyNumber
1840bttranslatecmptype(CompareType cmptype, Oid opfamily)
1841{
1842 switch (cmptype)
1843 {
1844 case COMPARE_LT:
1845 return BTLessStrategyNumber;
1846 case COMPARE_LE:
1847 return BTLessEqualStrategyNumber;
1848 case COMPARE_EQ:
1849 return BTEqualStrategyNumber;
1850 case COMPARE_GE:
1851 return BTGreaterEqualStrategyNumber;
1852 case COMPARE_GT:
1853 return BTGreaterStrategyNumber;
1854 default:
1855 return InvalidStrategy;
1856 }
1857}
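Editor's note: a hedged usage sketch. It assumes the two-argument signatures filled in above (a strategy/compare type plus an opfamily OID that the switch statements shown here never consult), and merely checks that the two translators round-trip for the five btree strategies; the helper name is hypothetical.

/* Illustrative check only: bttranslatestrategy() and bttranslatecmptype() round-trip. */
static void
toy_check_strategy_translation(void)
{
	for (StrategyNumber strat = BTLessStrategyNumber;
		 strat <= BTGreaterStrategyNumber;
		 strat++)
	{
		CompareType cmp = bttranslatestrategy(strat, InvalidOid);

		Assert(cmp != COMPARE_INVALID);
		Assert(bttranslatecmptype(cmp, InvalidOid) == strat);
	}
}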
Definition: mcxt.c:1616
void * palloc(Size size)
Definition: mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:472
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:104
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:676
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1801
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1155
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:798
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:233
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1110
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:68
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:180
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:3000
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1040
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2958
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtreadpage.c:537
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:1091
bool btcanreturn(Relation index, int attno)
Definition: nbtree.c:1805
BTPS_State
Definition: nbtree.c:55
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:58
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:57
@ BTPARALLEL_NOT_INITIALIZED
Definition: nbtree.c:56
@ BTPARALLEL_IDLE
Definition: nbtree.c:59
@ BTPARALLEL_DONE
Definition: nbtree.c:60
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:876
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition: nbtree.c:1840
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:337
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition: nbtree.c:659
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition: nbtree.c:1418
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:581
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:1041
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: nbtree.c:1155
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition: nbtree.c:1756
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition: nbtree.c:1820
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: nbtree.c:228
void btparallelrescan(IndexScanDesc scan)
Definition: nbtree.c:833
struct BTParallelScanDescData BTParallelScanDescData
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: nbtree.c:204
void btbuildempty(Relation index)
Definition: nbtree.c:181
int btgettreeheight(Relation rel)
Definition: nbtree.c:1814
void btinitparallelscan(void *target)
Definition: nbtree.c:817
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: nbtree.c:1125
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:1243
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:723
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: nbtree.c:289
void btmarkpos(IndexScanDesc scan)
Definition: nbtree.c:497
void btendscan(IndexScanDesc scan)
Definition: nbtree.c:461
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: nbtree.c:386
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:93
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:1014
Datum bthandler(PG_FUNCTION_ARGS)
Definition: nbtree.c:116
void btrestrpos(IndexScanDesc scan)
Definition: nbtree.c:523
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:766
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define SK_BT_SKIP
Definition: nbtree.h:1106
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538
#define BTNProcs
Definition: nbtree.h:723
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
uint16 BTCycleId
Definition: nbtree.h:30
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define SK_BT_MAXVAL
Definition: nbtree.h:1110
#define BT_READ
Definition: nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
#define BTMaxItemSize
Definition: nbtree.h:165
#define BTP_SPLIT_END
Definition: nbtree.h:82
#define BTOPTIONS_PROC
Definition: nbtree.h:721
#define SK_BT_MINVAL
Definition: nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1097
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:877
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1585
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:296
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:526
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:554
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:205
char * btbuildphasename(int64 phasenum)
Definition: nbtutils.c:650
bytea * btoptions(Datum reloptions, bool validate)
Definition: nbtutils.c:604
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition: nbtutils.c:627
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:1181
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:469
bool btvalidate(Oid opclassoid)
Definition: nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: nbtvalidate.c:288
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
int16 attnum
Definition: pg_attribute.h:74
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:142
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:141
void read_stream_reset(ReadStream *stream)
Definition: read_stream.c:1044
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:737
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_MAINTENANCE
Definition: read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:658
#define RelationGetDescr(relation)
Definition: rel.h:541
#define RelationGetRelationName(relation)
Definition: rel.h:549
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
ScanDirection
Definition: sdir.h:25
@ ForwardScanDirection
Definition: sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:7686
Size add_size(Size s1, Size s2)
Definition: shmem.c:495
#define SK_SEARCHNULL
Definition: skey.h:121
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTMaxStrategyNumber
Definition: stratnum.h:35
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
bool attbyval
Definition: nbtree.h:1046
Datum * elem_values
Definition: nbtree.h:1041
int16 attlen
Definition: nbtree.h:1045
BlockNumber btpo_next
Definition: nbtree.h:66
uint16 btpo_flags
Definition: nbtree.h:68
BTCycleId btpo_cycleid
Definition: nbtree.h:69
BTPS_State btps_pageStatus
Definition: nbtree.c:72
BlockNumber btps_lastCurrPage
Definition: nbtree.c:70
ConditionVariable btps_cv
Definition: nbtree.c:76
BlockNumber btps_nextScanPage
Definition: nbtree.c:69
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.c:83
bool needPrimScan
Definition: nbtree.h:1063
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
char * markTuples
Definition: nbtree.h:1081
FmgrInfo * orderProcs
Definition: nbtree.h:1067
BTScanPosData currPos
Definition: nbtree.h:1093
int * killedItems
Definition: nbtree.h:1071
char * currTuples
Definition: nbtree.h:1080
bool oppositeDirCheck
Definition: nbtree.h:1065
BTScanPosData markPos
Definition: nbtree.h:1094
ScanKey keyData
Definition: nbtree.h:1058
MemoryContext arrayContext
Definition: nbtree.h:1068
bool moreRight
Definition: nbtree.h:986
Buffer buf
Definition: nbtree.h:964
int nextTupleOffset
Definition: nbtree.h:979
bool moreLeft
Definition: nbtree.h:985
int lastItem
Definition: nbtree.h:996
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
int itemIndex
Definition: nbtree.h:997
ScanDirection dir
Definition: nbtree.h:973
ItemPointerData heapTid
Definition: nbtree.h:957
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTCycleId cycleid
Definition: nbtree.h:337
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
IndexBulkDeleteCallback callback
Definition: nbtree.h:335
MemoryContext pagedelcontext
Definition: nbtree.h:338
IndexVacuumInfo * info
Definition: nbtree.h:333
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
void * callback_state
Definition: nbtree.h:336
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918
int16 attlen
Definition: tupdesc.h:71
NodeTag type
Definition: amapi.h:234
BlockNumber pages_deleted
Definition: genam.h:109
BlockNumber pages_free
Definition: genam.h:110
BlockNumber num_pages
Definition: genam.h:104
double tuples_removed
Definition: genam.h:107
double num_index_tuples
Definition: genam.h:106
struct ScanKeyData * keyData
Definition: relscan.h:143
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:193
bool kill_prior_tuple
Definition: relscan.h:149
struct TupleDescData * xs_itupdesc
Definition: relscan.h:170
Relation indexRelation
Definition: relscan.h:139
ItemPointerData xs_heaptid
Definition: relscan.h:174
struct SnapshotData * xs_snapshot
Definition: relscan.h:140
Relation heapRelation
Definition: relscan.h:138
ItemPointerData t_tid
Definition: itup.h:37
Relation index
Definition: genam.h:73
double num_heap_tuples
Definition: genam.h:79
bool analyze_only
Definition: genam.h:75
BufferAccessStrategy strategy
Definition: genam.h:80
Relation heaprel
Definition: genam.h:74
bool report_progress
Definition: genam.h:76
bool estimated_count
Definition: genam.h:77
Definition: lwlock.h:42
TupleDesc rd_att
Definition: rel.h:112
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
Definition: type.h:96
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:46
static ItemArray items
Definition: test_tidstore.c:48
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition: tidbitmap.c:367
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:175
void vacuum_delay_point(bool is_analyze)
Definition: vacuum.c:2426
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:48
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition: vacuum.h:55