PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtree.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "storage/lwlock.h"
34#include "storage/read_stream.h"
35#include "utils/datum.h"
36#include "utils/fmgrprotos.h"
38#include "utils/memutils.h"
39#include "utils/wait_event.h"
40
41
42/*
43 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44 *
45 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
46 * scan to advance it via another call to _bt_first.
47 *
48 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
49 * a new page; others must wait.
50 *
51 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
52 * to a new page; some process can start doing that.
53 *
54 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
55 */
64
65/*
66 * BTParallelScanDescData contains btree specific shared information required
67 * for parallel scan.
68 */
/*
 * NOTE(review): this is a doxygen extraction; the "typedef struct
 * BTParallelScanDescData" opening line (doxygen line 69) is missing here --
 * confirm against upstream nbtree.c.
 */
70{
71 BlockNumber btps_nextScanPage; /* next page to be scanned */
72 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
73 * btps_nextScanPage */
74 BTPS_State btps_pageStatus; /* indicates whether next page is
75 * available for scan. see above for
76 * possible states of parallel scan. */
77 LWLock btps_lock; /* protects shared parallel state */
78 ConditionVariable btps_cv; /* used to synchronize parallel scan */
79
80 /*
81 * btps_arrElems is used when scans need to schedule another primitive
82 * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
83 * offsets for each = scan key associated with a ScalarArrayOp array.
84 */
/* NOTE(review): the btps_arrElems[] member declaration (doxygen line 85)
 * is missing from this listing. */
86
87 /*
88 * Additional space (at the end of the struct) is used when scans need to
89 * schedule another primitive index scan with one or more skip arrays.
90 * Holds a flattened datum representation for each = scan key associated
91 * with a skip array.
92 */
/* NOTE(review): the trailing member, closing brace, and typedef name
 * (doxygen lines 93-95) are missing from this listing. */
94

96
97
98static bool _bt_start_prim_scan(IndexScanDesc scan);
103static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
104 IndexBulkDeleteCallback callback, void *callback_state,
105 BTCycleId cycleid);
109 OffsetNumber updatedoffset,
110 int *nremaining);
111
112
113/*
114 * Btree handler function: return IndexAmRoutine with access method parameters
115 * and callbacks.
116 */
/*
 * bthandler() -- return the IndexAmRoutine describing the btree access
 * method: its capability flags and the callback functions implementing it.
 *
 * NOTE(review): doxygen extraction gaps -- the function-name/signature line
 * (doxygen 118), line 121, the VACUUM_OPTION flags on line 145, and the
 * final PG_RETURN_POINTER (line 176) are missing.  Verify upstream.
 */
117Datum
119{
120 static const IndexAmRoutine amroutine = {
122 .amstrategies = BTMaxStrategyNumber,
123 .amsupport = BTNProcs,
124 .amoptsprocnum = BTOPTIONS_PROC,
125 .amcanorder = true,
126 .amcanorderbyop = false,
127 .amcanhash = false,
128 .amconsistentequality = true,
129 .amconsistentordering = true,
130 .amcanbackward = true,
131 .amcanunique = true,
132 .amcanmulticol = true,
133 .amoptionalkey = true,
134 .amsearcharray = true,
135 .amsearchnulls = true,
136 .amstorage = false,
137 .amclusterable = true,
138 .ampredlocks = true,
139 .amcanparallel = true,
140 .amcanbuildparallel = true,
141 .amcaninclude = true,
142 .amusemaintenanceworkmem = false,
143 .amsummarizing = false,
144 .amparallelvacuumoptions =
146 .amkeytype = InvalidOid,
147
148 .ambuild = btbuild,
149 .ambuildempty = btbuildempty,
150 .aminsert = btinsert,
151 .aminsertcleanup = NULL,
152 .ambulkdelete = btbulkdelete,
153 .amvacuumcleanup = btvacuumcleanup,
154 .amcanreturn = btcanreturn,
155 .amcostestimate = btcostestimate,
156 .amgettreeheight = btgettreeheight,
157 .amoptions = btoptions,
158 .amproperty = btproperty,
159 .ambuildphasename = btbuildphasename,
160 .amvalidate = btvalidate,
161 .amadjustmembers = btadjustmembers,
162 .ambeginscan = btbeginscan,
163 .amrescan = btrescan,
164 .amgettuple = btgettuple,
165 .amgetbitmap = btgetbitmap,
166 .amendscan = btendscan,
167 .ammarkpos = btmarkpos,
168 .amrestrpos = btrestrpos,
169 .amestimateparallelscan = btestimateparallelscan,
170 .aminitparallelscan = btinitparallelscan,
171 .amparallelrescan = btparallelrescan,
172 .amtranslatestrategy = bttranslatestrategy,
173 .amtranslatecmptype = bttranslatecmptype,
174 };
175
177}
178
179/*
180 * btbuildempty() -- build an empty btree index in the initialization fork
181 */
/*
 * NOTE(review): extraction gaps -- the signature line (doxygen 183), the
 * metabuf declaration (187), and the smgr_bulk_start_rel() call (189) are
 * missing from this listing.  Verify upstream.
 */
182void
184{
185 bool allequalimage = _bt_allequalimage(index, false);
186 BulkWriteState *bulkstate;
188
190
191 /* Construct metapage. */
192 metabuf = smgr_bulk_get_buf(bulkstate);
193 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
194 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
195
196 smgr_bulk_finish(bulkstate);
197}
198
199/*
200 * btinsert() -- insert an index tuple into a btree.
201 *
202 * Descend the tree recursively, find the appropriate location for our
203 * new tuple, and put it there.
204 */
/*
 * NOTE(review): extraction gap -- the middle of the parameter list
 * (doxygen 207-208: ht_ctid/heapRel/checkUnique parameters) is missing
 * from this listing.  Verify upstream.
 */
205bool
206btinsert(Relation rel, Datum *values, bool *isnull,
209 bool indexUnchanged,
210 IndexInfo *indexInfo)
211{
212 bool result;
213 IndexTuple itup;
214
215 /* generate an index tuple */
216 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
217 itup->t_tid = *ht_ctid;
218
219 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
220
221 pfree(itup);
222
223 return result;
224}
225
226/*
227 * btgettuple() -- Get the next tuple in the scan.
228 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 230) and the BTScanOpaque "so" declaration (232) are missing
 * from this listing.  Verify upstream.
 */
229bool
231{
233 bool res;
234
235 Assert(scan->heapRelation != NULL);
236
237 /* btree indexes are never lossy */
238 scan->xs_recheck = false;
239
240 /* Each loop iteration performs another primitive index scan */
241 do
242 {
243 /*
244 * If we've already initialized this scan, we can just advance it in
245 * the appropriate direction. If we haven't done so yet, we call
246 * _bt_first() to get the first item in the scan.
247 */
248 if (!BTScanPosIsValid(so->currPos))
249 res = _bt_first(scan, dir);
250 else
251 {
252 /*
253 * Check to see if we should kill the previously-fetched tuple.
254 */
255 if (scan->kill_prior_tuple)
256 {
257 /*
258 * Yes, remember it for later. (We'll deal with all such
259 * tuples at once right before leaving the index page.) The
260 * test for numKilled overrun is not just paranoia: if the
261 * caller reverses direction in the indexscan then the same
262 * item might get entered multiple times. It's not worth
263 * trying to optimize that, so we don't detect it, but instead
264 * just forget any excess entries.
265 */
266 if (so->killedItems == NULL)
267 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
268 if (so->numKilled < MaxTIDsPerBTreePage)
269 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
270 }
271
272 /*
273 * Now continue the scan.
274 */
275 res = _bt_next(scan, dir);
276 }
277
278 /* If we have a tuple, return it ... */
279 if (res)
280 break;
281 /* ... otherwise see if we need another primitive index scan */
282 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
283
284 return res;
285}
286
287/*
288 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
289 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 291), the "so" declaration (293), and the _bt_first() call that
 * should precede the block at doxygen 304 (line 303) are missing from this
 * listing.  Verify upstream.
 */
290int64
292{
294 int64 ntids = 0;
295 ItemPointer heapTid;
296
297 Assert(scan->heapRelation == NULL);
298
299 /* Each loop iteration performs another primitive index scan */
300 do
301 {
302 /* Fetch the first page & tuple */
304 {
305 /* Save tuple ID, and continue scanning */
306 heapTid = &scan->xs_heaptid;
307 tbm_add_tuples(tbm, heapTid, 1, false);
308 ntids++;
309
310 for (;;)
311 {
312 /*
313 * Advance to next tuple within page. This is the same as the
314 * easy case in _bt_next().
315 */
316 if (++so->currPos.itemIndex > so->currPos.lastItem)
317 {
318 /* let _bt_next do the heavy lifting */
319 if (!_bt_next(scan, ForwardScanDirection))
320 break;
321 }
322
323 /* Save tuple ID, and continue scanning */
324 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
325 tbm_add_tuples(tbm, heapTid, 1, false);
326 ntids++;
327 }
328 }
329 /* Now see if we need another primitive index scan */
330 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
331
332 return ntids;
333}
334
335/*
336 * btbeginscan() -- start a scan on a btree index
337 */
/*
 * NOTE(review): extraction gaps -- the IndexScanDesc return-type line
 * (doxygen 338), the "so" declaration (342), and the palloc of the
 * BTScanOpaque (351) are missing from this listing.  Verify upstream.
 */
339btbeginscan(Relation rel, int nkeys, int norderbys)
340{
341 IndexScanDesc scan;
343
344 /* no order by operators allowed */
345 Assert(norderbys == 0);
346
347 /* get the scan */
348 scan = RelationGetIndexScan(rel, nkeys, norderbys);
349
350 /* allocate private workspace */
352 BTScanPosInvalidate(so->currPos);
353 BTScanPosInvalidate(so->markPos);
354 if (scan->numberOfKeys > 0)
355 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
356 else
357 so->keyData = NULL;
358
359 so->skipScan = false;
360 so->needPrimScan = false;
361 so->scanBehind = false;
362 so->oppositeDirCheck = false;
363 so->arrayKeys = NULL;
364 so->orderProcs = NULL;
365 so->arrayContext = NULL;
366
367 so->killedItems = NULL; /* until needed */
368 so->numKilled = 0;
369
370 /*
371 * We don't know yet whether the scan will be index-only, so we do not
372 * allocate the tuple workspace arrays until btrescan. However, we set up
373 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374 */
375 so->currTuples = so->markTuples = NULL;
376
377 scan->xs_itupdesc = RelationGetDescr(rel);
378
379 scan->opaque = so;
380
381 return scan;
382}
383
384/*
385 * btrescan() -- rescan an index relation
386 */
/*
 * NOTE(review): extraction gaps -- the first signature line (doxygen 388),
 * the "so" declaration (391), and part of the dropPin condition (420,
 * presumably the IsMVCCSnapshot/bitmap-scan tests) are missing from this
 * listing.  Verify upstream.
 */
387void
389 ScanKey orderbys, int norderbys)
390{
392
393 /* we aren't holding any read locks, but gotta drop the pins */
394 if (BTScanPosIsValid(so->currPos))
395 {
396 /* Before leaving current page, deal with any killed items */
397 if (so->numKilled > 0)
398 _bt_killitems(scan);
399 BTScanPosUnpinIfPinned(so->currPos);
400 BTScanPosInvalidate(so->currPos);
401 }
402
403 /*
404 * We prefer to eagerly drop leaf page pins before btgettuple returns.
405 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
406 *
407 * We cannot safely drop leaf page pins during index-only scans due to a
408 * race condition involving VACUUM setting pages all-visible in the VM.
409 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
410 *
411 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
412 * Pins cannot be held for more than an instant during bitmap scans either
413 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
414 *
415 * See nbtree/README section on making concurrent TID recycling safe.
416 *
417 * Note: so->dropPin should never change across rescans.
418 */
419 so->dropPin = (!scan->xs_want_itup &&
421 scan->heapRelation != NULL);
422
423 so->markItemIndex = -1;
424 so->needPrimScan = false;
425 so->scanBehind = false;
426 so->oppositeDirCheck = false;
427 BTScanPosUnpinIfPinned(so->markPos);
428 BTScanPosInvalidate(so->markPos);
429
430 /*
431 * Allocate tuple workspace arrays, if needed for an index-only scan and
432 * not already done in a previous rescan call. To save on palloc
433 * overhead, both workspaces are allocated as one palloc block; only this
434 * function and btendscan know that.
435 */
436 if (scan->xs_want_itup && so->currTuples == NULL)
437 {
438 so->currTuples = (char *) palloc(BLCKSZ * 2);
439 so->markTuples = so->currTuples + BLCKSZ;
440 }
441
442 /*
443 * Reset the scan keys
444 */
445 if (scankey && scan->numberOfKeys > 0)
446 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
447 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
448 so->numArrayKeys = 0; /* ditto */
449}
450
451/*
452 * btendscan() -- close down a scan
453 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 455) and the "so" declaration (457) are missing from this
 * listing.  Verify upstream.
 */
454void
456{
458
459 /* we aren't holding any read locks, but gotta drop the pins */
460 if (BTScanPosIsValid(so->currPos))
461 {
462 /* Before leaving current page, deal with any killed items */
463 if (so->numKilled > 0)
464 _bt_killitems(scan);
465 BTScanPosUnpinIfPinned(so->currPos);
466 }
467
468 so->markItemIndex = -1;
469 BTScanPosUnpinIfPinned(so->markPos);
470
471 /* No need to invalidate positions, the RAM is about to be freed. */
472
473 /* Release storage */
474 if (so->keyData != NULL)
475 pfree(so->keyData);
476 /* so->arrayKeys and so->orderProcs are in arrayContext */
477 if (so->arrayContext != NULL)
478 MemoryContextDelete(so->arrayContext);
479 if (so->killedItems != NULL)
480 pfree(so->killedItems);
481 if (so->currTuples != NULL)
482 pfree(so->currTuples);
483 /* so->markTuples should not be pfree'd, see btrescan */
484 pfree(so);
485}
486
487/*
488 * btmarkpos() -- save current scan position
489 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 491) and the "so" declaration (493) are missing from this
 * listing.  Verify upstream.
 */
490void
492{
494
495 /* There may be an old mark with a pin (but no lock). */
496 BTScanPosUnpinIfPinned(so->markPos);
497
498 /*
499 * Just record the current itemIndex. If we later step to next page
500 * before releasing the marked position, _bt_steppage makes a full copy of
501 * the currPos struct in markPos. If (as often happens) the mark is moved
502 * before we leave the page, we don't have to do that work.
503 */
504 if (BTScanPosIsValid(so->currPos))
505 so->markItemIndex = so->currPos.itemIndex;
506 else
507 {
508 BTScanPosInvalidate(so->markPos);
509 so->markItemIndex = -1;
510 }
511}
512
513/*
514 * btrestrpos() -- restore scan to last saved position
515 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 517), the "so" declaration (519), and the first argument line
 * of the memcpy size expression (554, presumably offsetof(BTScanPosData,
 * items[1]) +) are missing from this listing.  Verify upstream.
 */
516void
518{
520
521 if (so->markItemIndex >= 0)
522 {
523 /*
524 * The scan has never moved to a new page since the last mark. Just
525 * restore the itemIndex.
526 *
527 * NB: In this case we can't count on anything in so->markPos to be
528 * accurate.
529 */
530 so->currPos.itemIndex = so->markItemIndex;
531 }
532 else
533 {
534 /*
535 * The scan moved to a new page after last mark or restore, and we are
536 * now restoring to the marked page. We aren't holding any read
537 * locks, but if we're still holding the pin for the current position,
538 * we must drop it.
539 */
540 if (BTScanPosIsValid(so->currPos))
541 {
542 /* Before leaving current page, deal with any killed items */
543 if (so->numKilled > 0)
544 _bt_killitems(scan);
545 BTScanPosUnpinIfPinned(so->currPos);
546 }
547
548 if (BTScanPosIsValid(so->markPos))
549 {
550 /* bump pin on mark buffer for assignment to current buffer */
551 if (BTScanPosIsPinned(so->markPos))
552 IncrBufferRefCount(so->markPos.buf);
553 memcpy(&so->currPos, &so->markPos,
555 so->markPos.lastItem * sizeof(BTScanPosItem));
556 if (so->currTuples)
557 memcpy(so->currTuples, so->markTuples,
558 so->markPos.nextTupleOffset);
559 /* Reset the scan's array keys (see _bt_steppage for why) */
560 if (so->numArrayKeys)
561 {
562 _bt_start_array_keys(scan, so->currPos.dir);
563 so->needPrimScan = false;
564 }
565 }
566 else
567 BTScanPosInvalidate(so->currPos);
568 }
569}
570
571/*
572 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
573 */
/*
 * NOTE(review): extraction gaps -- local variable declarations (doxygen
 * 577-579: presumably nkeyatts, estnbtreeshared, genericattrspace), the
 * initial estnbtreeshared assignment line (585), the per-attribute
 * sk_flags accumulation (608), and the accumulation statements at 615,
 * 618, 635-636 are missing from this listing.  Verify upstream.
 */
574Size
575btestimateparallelscan(Relation rel, int nkeys, int norderbys)
576{
580
581 /*
582 * Pessimistically assume that every input scan key will be output with
583 * its own SAOP array
584 */
586 sizeof(int) * nkeys;
587
588 /* Single column indexes cannot possibly use a skip array */
589 if (nkeyatts == 1)
590 return estnbtreeshared;
591
592 /*
593 * Pessimistically assume that all attributes prior to the least
594 * significant attribute require a skip array (and an associated key)
595 */
596 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
597 sizeof(Datum));
598 for (int attnum = 1; attnum < nkeyatts; attnum++)
599 {
600 CompactAttribute *attr;
601
602 /*
603 * We make the conservative assumption that every index column will
604 * also require a skip array.
605 *
606 * Every skip array must have space to store its scan key's sk_flags.
607 */
609
610 /* Consider space required to store a datum of opclass input type */
611 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
612 if (attr->attbyval)
613 {
614 /* This index attribute stores pass-by-value datums */
616 true, attr->attlen);
617
619 continue;
620 }
621
622 /*
623 * This index attribute stores pass-by-reference datums.
624 *
625 * Assume that serializing this array will use just as much space as a
626 * pass-by-value datum, in addition to space for the largest possible
627 * whole index tuple (this is not just a per-datum portion of the
628 * largest possible tuple because that'd be almost as large anyway).
629 *
630 * This is quite conservative, but it's not clear how we could do much
631 * better. The executor requires an up-front storage request size
632 * that reliably covers the scan's high watermark memory usage. We
633 * can't be sure of the real high watermark until the scan is over.
634 */
637 }
638
639 return estnbtreeshared;
640}
641
642/*
643 * _bt_start_prim_scan() -- start scheduled primitive index scan?
644 *
645 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
646 * as the last one ended. Otherwise returns false, indicating that the array
647 * keys are now fully exhausted.
648 *
649 * Only call here during scans with one or more equality type array scan keys,
650 * after _bt_first or _bt_next return false.
651 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 653) and the "so" declaration (655) are missing from this
 * listing.  Verify upstream.
 */
652static bool
654{
656
657 Assert(so->numArrayKeys);
658
659 so->scanBehind = so->oppositeDirCheck = false; /* reset */
660
661 /*
662 * Array keys are advanced within _bt_checkkeys when the scan reaches the
663 * leaf level (more precisely, they're advanced when the scan reaches the
664 * end of each distinct set of array elements). This process avoids
665 * repeat access to leaf pages (across multiple primitive index scans) by
666 * advancing the scan's array keys when it allows the primitive index scan
667 * to find nearby matching tuples (or when it eliminates ranges of array
668 * key space that can't possibly be satisfied by any index tuple).
669 *
670 * _bt_checkkeys sets a simple flag variable to schedule another primitive
671 * index scan. The flag tells us what to do.
672 *
673 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
674 * various cases where that won't happen. For example, if the index is
675 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
676 * We also don't expect a call to _bt_checkkeys during searches for a
677 * non-existent value that happens to be lower/higher than any existing
678 * value in the index.
679 *
680 * We don't require special handling for these cases -- we don't need to
681 * be explicitly instructed to _not_ perform another primitive index scan.
682 * It's up to code under the control of _bt_first to always set the flag
683 * when another primitive index scan will be required.
684 *
685 * This works correctly, even with the tricky cases listed above, which
686 * all involve access to leaf pages "near the boundaries of the key space"
687 * (whether it's from a leftmost/rightmost page, or an imaginary empty
688 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
689 * index scan for one set of array keys, then it also won't be reached for
690 * any later set ("later" in terms of the direction that we scan the index
691 * and advance the arrays). The array keys won't have advanced in these
692 * cases, but that's the correct behavior (even _bt_advance_array_keys
693 * won't always advance the arrays at the point they become "exhausted").
694 */
695 if (so->needPrimScan)
696 {
697 /*
698 * Flag was set -- must call _bt_first again, which will reset the
699 * scan's needPrimScan flag
700 */
701 return true;
702 }
703
704 /* The top-level index scan ran out of tuples in this scan direction */
705 if (scan->parallel_scan != NULL)
706 _bt_parallel_done(scan);
707
708 return false;
709}
710
711/*
712 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
713 *
714 * Caller must have exclusively locked btscan->btps_lock when called.
715 */
/*
 * NOTE(review): extraction gap -- the function-name/parameter lines
 * (doxygen 717-718) are missing from this listing.  Verify upstream.
 */
716static void
719{
720 char *datumshared;
721
722 /* Space for serialized datums begins immediately after btps_arrElems[] */
723 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
724 for (int i = 0; i < so->numArrayKeys; i++)
725 {
726 BTArrayKeyInfo *array = &so->arrayKeys[i];
727 ScanKey skey = &so->keyData[array->scan_key];
728
729 if (array->num_elems != -1)
730 {
731 /* Save SAOP array's cur_elem (no need to copy key/datum) */
732 Assert(!(skey->sk_flags & SK_BT_SKIP));
733 btscan->btps_arrElems[i] = array->cur_elem;
734 continue;
735 }
736
737 /* Save all mutable state associated with skip array's key */
738 Assert(skey->sk_flags & SK_BT_SKIP);
739 memcpy(datumshared, &skey->sk_flags, sizeof(int));
740 datumshared += sizeof(int);
741
742 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
743 {
744 /* No sk_argument datum to serialize */
745 Assert(skey->sk_argument == 0);
746 continue;
747 }
748
749 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
750 array->attbyval, array->attlen, &datumshared);
751 }
752}
753
754/*
755 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
756 *
757 * Caller must have exclusively locked btscan->btps_lock when called.
758 */
/*
 * NOTE(review): extraction gap -- the function-name/parameter lines
 * (doxygen 760-761) are missing from this listing.  Verify upstream.
 */
759static void
762{
763 char *datumshared;
764
765 /* Space for serialized datums begins immediately after btps_arrElems[] */
766 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
767 for (int i = 0; i < so->numArrayKeys; i++)
768 {
769 BTArrayKeyInfo *array = &so->arrayKeys[i];
770 ScanKey skey = &so->keyData[array->scan_key];
771 bool isnull;
772
773 if (array->num_elems != -1)
774 {
775 /* Restore SAOP array using its saved cur_elem */
776 Assert(!(skey->sk_flags & SK_BT_SKIP));
777 array->cur_elem = btscan->btps_arrElems[i];
778 skey->sk_argument = array->elem_values[array->cur_elem];
779 continue;
780 }
781
782 /* Restore skip array by restoring its key directly */
783 if (!array->attbyval && skey->sk_argument)
784 pfree(DatumGetPointer(skey->sk_argument));
785 skey->sk_argument = (Datum) 0;
786 memcpy(&skey->sk_flags, datumshared, sizeof(int));
787 datumshared += sizeof(int);
788
789 Assert(skey->sk_flags & SK_BT_SKIP);
790
791 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
792 {
793 /* No sk_argument datum to restore */
794 continue;
795 }
796
797 skey->sk_argument = datumRestore(&datumshared, &isnull);
798 if (isnull)
799 {
800 Assert(skey->sk_argument == 0);
801 Assert(skey->sk_flags & SK_SEARCHNULL);
802 Assert(skey->sk_flags & SK_ISNULL);
803 }
804 }
805}
806
807/*
808 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
809 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 811), the bt_target declaration (813), the LWLock tranche-id
 * argument (816), and the ConditionVariableInit call (820) are missing
 * from this listing.  Verify upstream.
 */
810void
812{
814
815 LWLockInitialize(&bt_target->btps_lock,
817 bt_target->btps_nextScanPage = InvalidBlockNumber;
818 bt_target->btps_lastCurrPage = InvalidBlockNumber;
819 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
821}
822
823/*
824 * btparallelrescan() -- reset parallel scan
825 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 827) and the btscan declaration (829) are missing from this
 * listing.  Verify upstream.
 */
826void
828{
830 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
831
832 Assert(parallel_scan);
833
834 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
835 parallel_scan->ps_offset_am);
836
837 /*
838 * In theory, we don't need to acquire the LWLock here, because there
839 * shouldn't be any other workers running at this point, but we do so for
840 * consistency.
841 */
842 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
843 btscan->btps_nextScanPage = InvalidBlockNumber;
844 btscan->btps_lastCurrPage = InvalidBlockNumber;
845 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
846 LWLockRelease(&btscan->btps_lock);
847}
848
849/*
850 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
851 * page. Other scans must wait until we call _bt_parallel_release()
852 * or _bt_parallel_done().
853 *
854 * The return value is true if we successfully seized the scan and false
855 * if we did not. The latter case occurs when no pages remain, or when
856 * another primitive index scan is scheduled that caller's backend cannot
857 * start just yet (only backends that call from _bt_first are capable of
858 * starting primitive index scans, which they indicate by passing first=true).
859 *
860 * If the return value is true, *next_scan_page returns the next page of the
861 * scan, and *last_curr_page returns the page that *next_scan_page came from.
862 * An invalid *next_scan_page means the scan hasn't yet started, or that
863 * caller needs to start the next primitive index scan (if it's the latter
864 * case we'll set so.needPrimScan).
865 *
866 * Callers should ignore the value of *next_scan_page and *last_curr_page if
867 * the return value is false.
868 */
/*
 * NOTE(review): extraction gaps -- the function-name line (doxygen 870),
 * the "so" declaration (874), local declarations at 879, 881-882, the
 * _bt_parallel_restore_arrays call (942), and the condition-variable
 * sleep/cancel lines at the loop bottom (978, 980) are missing from this
 * listing.  Verify upstream.
 */
869bool
871 BlockNumber *last_curr_page, bool first)
872{
873 Relation rel = scan->indexRelation;
875 bool exit_loop = false,
876 status = true,
877 endscan = false;
878 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
880
883
884 /*
885 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
886 * call to _bt_readnextpage treats this backend similarly to a serial
887 * backend that steps from *last_curr_page to *next_scan_page (unless this
888 * backend's so->currPos is initialized by _bt_readfirstpage before then).
889 */
890 BTScanPosInvalidate(so->currPos);
891 so->currPos.moreLeft = so->currPos.moreRight = true;
892
893 if (first)
894 {
895 /*
896 * Initialize array related state when called from _bt_first, assuming
897 * that this will be the first primitive index scan for the scan
898 */
899 so->needPrimScan = false;
900 so->scanBehind = false;
901 so->oppositeDirCheck = false;
902 }
903 else
904 {
905 /*
906 * Don't attempt to seize the scan when it requires another primitive
907 * index scan, since caller's backend cannot start it right now
908 */
909 if (so->needPrimScan)
910 return false;
911 }
912
913 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
914 parallel_scan->ps_offset_am);
915
916 while (1)
917 {
918 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
919
920 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
921 {
922 /* We're done with this parallel index scan */
923 status = false;
924 }
925 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
926 btscan->btps_nextScanPage == P_NONE)
927 {
928 /* End this parallel index scan */
929 status = false;
930 endscan = true;
931 }
932 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
933 {
934 Assert(so->numArrayKeys);
935
936 if (first)
937 {
938 /* Can start scheduled primitive scan right away, so do so */
939 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
940
941 /* Restore scan's array keys from serialized values */
943 exit_loop = true;
944 }
945 else
946 {
947 /*
948 * Don't attempt to seize the scan when it requires another
949 * primitive index scan, since caller's backend cannot start
950 * it right now
951 */
952 status = false;
953 }
954
955 /*
956 * Either way, update backend local state to indicate that a
957 * pending primitive scan is required
958 */
959 so->needPrimScan = true;
960 so->scanBehind = false;
961 so->oppositeDirCheck = false;
962 }
963 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
964 {
965 /*
966 * We have successfully seized control of the scan for the purpose
967 * of advancing it to a new page!
968 */
969 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
970 Assert(btscan->btps_nextScanPage != P_NONE);
971 *next_scan_page = btscan->btps_nextScanPage;
972 *last_curr_page = btscan->btps_lastCurrPage;
973 exit_loop = true;
974 }
975 LWLockRelease(&btscan->btps_lock);
976 if (exit_loop || !status)
977 break;
979 }
981
982 /* When the scan has reached the rightmost (or leftmost) page, end it */
983 if (endscan)
984 _bt_parallel_done(scan);
985
986 return status;
987}
988
989/*
990 * _bt_parallel_release() -- Complete the process of advancing the scan to a
991 * new page. We now have the new value btps_nextScanPage; another backend
992 * can now begin advancing the scan.
993 *
994 * Callers whose scan uses array keys must save their curr_page argument so
995 * that it can be passed to _bt_parallel_primscan_schedule, should caller
996 * determine that another primitive index scan is required.
997 *
998 * If caller's next_scan_page is P_NONE, the scan has reached the index's
999 * rightmost/leftmost page. This is treated as reaching the end of the scan
1000 * within _bt_parallel_seize.
1001 *
1002 * Note: unlike the serial case, parallel scans don't need to remember both
1003 * sibling links. next_scan_page is whichever link is next given the scan's
1004 * direction. That's all we'll ever need, since the direction of a parallel
1005 * scan can never change.
1006 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter lines
 * (doxygen 1008-1009), declarations/asserts at 1012 and 1014, and the
 * ConditionVariableSignal call (1024) are missing from this listing.
 * Verify upstream.
 */
1007void
1010{
1011 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1013
1015
1016 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1017 parallel_scan->ps_offset_am);
1018
1019 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1020 btscan->btps_nextScanPage = next_scan_page;
1021 btscan->btps_lastCurrPage = curr_page;
1022 btscan->btps_pageStatus = BTPARALLEL_IDLE;
1023 LWLockRelease(&btscan->btps_lock);
1025}
1026
1027/*
1028 * _bt_parallel_done() -- Mark the parallel scan as complete.
1029 *
1030 * When there are no pages left to scan, this function should be called to
1031 * notify other workers. Otherwise, they might wait forever for the scan to
1032 * advance to the next page.
1033 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 1035), the "so"/btscan declarations (1037, 1039), and the
 * ConditionVariableBroadcast call (1073) are missing from this listing.
 * Verify upstream.
 */
1034void
1036{
1038 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1040 bool status_changed = false;
1041
1042 Assert(!BTScanPosIsValid(so->currPos));
1043
1044 /* Do nothing, for non-parallel scans */
1045 if (parallel_scan == NULL)
1046 return;
1047
1048 /*
1049 * Should not mark parallel scan done when there's still a pending
1050 * primitive index scan
1051 */
1052 if (so->needPrimScan)
1053 return;
1054
1055 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1056 parallel_scan->ps_offset_am);
1057
1058 /*
1059 * Mark the parallel scan as done, unless some other process did so
1060 * already
1061 */
1062 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1063 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1064 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1065 {
1066 btscan->btps_pageStatus = BTPARALLEL_DONE;
1067 status_changed = true;
1068 }
1069 LWLockRelease(&btscan->btps_lock);
1070
1071 /* wake up all the workers associated with this parallel scan */
1072 if (status_changed)
1074}
1075
1076/*
1077 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1078 *
1079 * Caller passes the curr_page most recently passed to _bt_parallel_release
1080 * by its backend. Caller successfully schedules the next primitive index scan
1081 * if the shared parallel state hasn't been seized since caller's backend last
1082 * advanced the scan.
1083 */
/*
 * NOTE(review): extraction gaps -- the function-name/parameter line
 * (doxygen 1085), the "so"/btscan declarations (1088, 1090), and the
 * _bt_parallel_serialize_arrays call (1106) are missing from this
 * listing.  Verify upstream.
 */
1084void
1086{
1087 Relation rel = scan->indexRelation;
1089 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1091
1092 Assert(so->numArrayKeys);
1093
1094 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1095 parallel_scan->ps_offset_am);
1096
1097 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1098 if (btscan->btps_lastCurrPage == curr_page &&
1099 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1100 {
1101 btscan->btps_nextScanPage = InvalidBlockNumber;
1102 btscan->btps_lastCurrPage = InvalidBlockNumber;
1103 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1104
1105 /* Serialize scan's current array keys */
1107 }
1108 LWLockRelease(&btscan->btps_lock);
1109}
1110
1111/*
1112 * Bulk deletion of all index entries pointing to a set of heap tuples.
1113 * The set of target tuples is specified via a callback routine that tells
1114 * whether any given heap tuple (identified by ItemPointer) is being deleted.
1115 *
1116 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1117 */
1120 IndexBulkDeleteCallback callback, void *callback_state)
1121{
1122 Relation rel = info->index;
1123 BTCycleId cycleid;
1124
1125 /* allocate stats if first time through, else re-use existing struct */
1126 if (stats == NULL)
1128
1129 /* Establish the vacuum cycle ID to use for this scan */
1130 /* The ENSURE stuff ensures we clean up shared memory on failure */
1132 {
1133 cycleid = _bt_start_vacuum(rel);
1134
1135 btvacuumscan(info, stats, callback, callback_state, cycleid);
1136 }
1138 _bt_end_vacuum(rel);
1139
1140 return stats;
1141}
1142
1143/*
1144 * Post-VACUUM cleanup.
1145 *
1146 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1147 */
1150{
1152
1153 /* No-op in ANALYZE ONLY mode */
1154 if (info->analyze_only)
1155 return stats;
1156
1157 /*
1158 * If btbulkdelete was called, we need not do anything (we just maintain
1159 * the information used within _bt_vacuum_needs_cleanup() by calling
1160 * _bt_set_cleanup_info() below).
1161 *
1162 * If btbulkdelete was _not_ called, then we have a choice to make: we
1163 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1164 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1165 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1166 * now.
1167 */
1168 if (stats == NULL)
1169 {
1170 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1171 if (!_bt_vacuum_needs_cleanup(info->index))
1172 return NULL;
1173
1174 /*
1175 * Since we aren't going to actually delete any leaf items, there's no
1176 * need to go through all the vacuum-cycle-ID pushups here.
1177 *
1178 * Posting list tuples are a source of inaccuracy for cleanup-only
1179 * scans. btvacuumscan() will assume that the number of index tuples
1180 * from each page can be used as num_index_tuples, even though
1181 * num_index_tuples is supposed to represent the number of TIDs in the
1182 * index. This naive approach can underestimate the number of tuples
1183 * in the index significantly.
1184 *
1185 * We handle the problem by making num_index_tuples an estimate in
1186 * cleanup-only case.
1187 */
1189 btvacuumscan(info, stats, NULL, NULL, 0);
1190 stats->estimated_count = true;
1191 }
1192
1193 /*
1194 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1195 *
1196 * num_delpages is the number of deleted pages now in the index that were
1197 * not safe to place in the FSM to be recycled just yet. num_delpages is
1198 * greater than 0 only when _bt_pagedel() actually deleted pages during
1199 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1200 * have failed to place any newly deleted pages in the FSM just moments
1201 * ago. (Actually, there are edge cases where recycling of the current
1202 * VACUUM's newly deleted pages does not even become safe by the time the
1203 * next VACUUM comes around. See nbtree/README.)
1204 */
1205 Assert(stats->pages_deleted >= stats->pages_free);
1206 num_delpages = stats->pages_deleted - stats->pages_free;
1208
1209 /*
1210 * It's quite possible for us to be fooled by concurrent page splits into
1211 * double-counting some index tuples, so disbelieve any total that exceeds
1212 * the underlying heap's count ... if we know that accurately. Otherwise
1213 * this might just make matters worse.
1214 */
1215 if (!info->estimated_count)
1216 {
1217 if (stats->num_index_tuples > info->num_heap_tuples)
1218 stats->num_index_tuples = info->num_heap_tuples;
1219 }
1220
1221 return stats;
1222}
1223
1224/*
1225 * btvacuumscan --- scan the index for VACUUMing purposes
1226 *
1227 * This combines the functions of looking for leaf tuples that are deletable
1228 * according to the vacuum callback, looking for empty pages that can be
1229 * deleted, and looking for old deleted pages that can be recycled. Both
1230 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1231 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1232 *
1233 * The caller is responsible for initially allocating/zeroing a stats struct
1234 * and for obtaining a vacuum cycle ID if necessary.
1235 */
1236static void
1238 IndexBulkDeleteCallback callback, void *callback_state,
1239 BTCycleId cycleid)
1240{
1241 Relation rel = info->index;
1243 BlockNumber num_pages;
1244 bool needLock;
1246 ReadStream *stream = NULL;
1247
1248 /*
1249 * Reset fields that track information about the entire index now. This
1250 * avoids double-counting in the case where a single VACUUM command
1251 * requires multiple scans of the index.
1252 *
1253 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1254 * since they track information about the VACUUM command, and so must last
1255 * across each call to btvacuumscan().
1256 *
1257 * (Note that pages_free is treated as state about the whole index, not
1258 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1259 * calls are idempotent, and get repeated for the same deleted pages in
1260 * some scenarios. The point for us is to track the number of recyclable
1261 * pages in the index at the end of the VACUUM command.)
1262 */
1263 stats->num_pages = 0;
1264 stats->num_index_tuples = 0;
1265 stats->pages_deleted = 0;
1266 stats->pages_free = 0;
1267
1268 /* Set up info to pass down to btvacuumpage */
1269 vstate.info = info;
1270 vstate.stats = stats;
1271 vstate.callback = callback;
1272 vstate.callback_state = callback_state;
1273 vstate.cycleid = cycleid;
1274
1275 /* Create a temporary memory context to run _bt_pagedel in */
1277 "_bt_pagedel",
1279
1280 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1281 vstate.bufsize = 0;
1282 vstate.maxbufsize = 0;
1283 vstate.pendingpages = NULL;
1284 vstate.npendingpages = 0;
1285 /* Consider applying _bt_pendingfsm_finalize optimization */
1287
1288 /*
1289 * The outer loop iterates over all index pages except the metapage, in
1290 * physical order (we hope the kernel will cooperate in providing
1291 * read-ahead for speed). It is critical that we visit all leaf pages,
1292 * including ones added after we start the scan, else we might fail to
1293 * delete some deletable tuples. Hence, we must repeatedly check the
1294 * relation length. We must acquire the relation-extension lock while
1295 * doing so to avoid a race condition: if someone else is extending the
1296 * relation, there is a window where bufmgr/smgr have created a new
1297 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1298 * we manage to scan such a page here, we'll improperly assume it can be
1299 * recycled. Taking the lock synchronizes things enough to prevent a
1300 * problem: either num_pages won't include the new page, or _bt_getbuf
1301 * already has write lock on the buffer and it will be fully initialized
1302 * before we can examine it. Also, we need not worry if a page is added
1303 * immediately after we look; the page splitting code already has
1304 * write-lock on the left page before it adds a right page, so we must
1305 * already have processed any tuples due to be moved into such a page.
1306 *
1307 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1308 * think the use of the extension lock is still required.
1309 *
1310 * We can skip locking for new or temp relations, however, since no one
1311 * else could be accessing them.
1312 */
1314
1316
1317 /*
1318 * It is safe to use batchmode as block_range_read_stream_cb takes no
1319 * locks.
1320 */
1324 info->strategy,
1325 rel,
1328 &p,
1329 0);
1330 for (;;)
1331 {
1332 /* Get the current relation length */
1333 if (needLock)
1335 num_pages = RelationGetNumberOfBlocks(rel);
1336 if (needLock)
1338
1339 if (info->report_progress)
1341 num_pages);
1342
1343 /* Quit if we've scanned the whole relation */
1344 if (p.current_blocknum >= num_pages)
1345 break;
1346
1347 p.last_exclusive = num_pages;
1348
1349 /* Iterate over pages, then loop back to recheck relation length */
1350 while (true)
1351 {
1352 BlockNumber current_block;
1353 Buffer buf;
1354
1355 /* call vacuum_delay_point while not holding any buffer lock */
1356 vacuum_delay_point(false);
1357
1358 buf = read_stream_next_buffer(stream, NULL);
1359
1360 if (!BufferIsValid(buf))
1361 break;
1362
1363 current_block = btvacuumpage(&vstate, buf);
1364
1365 if (info->report_progress)
1367 current_block);
1368 }
1369
1370 /*
1371 * We have to reset the read stream to use it again. After returning
1372 * InvalidBuffer, the read stream API won't invoke our callback again
1373 * until the stream has been reset.
1374 */
1375 read_stream_reset(stream);
1376 }
1377
1378 read_stream_end(stream);
1379
1380 /* Set statistics num_pages field to final size of index */
1381 stats->num_pages = num_pages;
1382
1383 MemoryContextDelete(vstate.pagedelcontext);
1384
1385 /*
1386 * If there were any calls to _bt_pagedel() during scan of the index then
1387 * see if any of the resulting pages can be placed in the FSM now. When
1388 * it's not safe we'll have to leave it up to a future VACUUM operation.
1389 *
1390 * Finally, if we placed any pages in the FSM (either just now or during
1391 * the scan), forcibly update the upper-level FSM pages to ensure that
1392 * searchers can find them.
1393 */
1395 if (stats->pages_free > 0)
1397}
1398
1399/*
1400 * btvacuumpage --- VACUUM one page
1401 *
1402 * This processes a single page for btvacuumscan(). In some cases we must
1403 * backtrack to re-examine and VACUUM pages that were on buf's page during
1404 * a previous call here. This is how we handle page splits (that happened
1405 * after our cycleid was acquired) whose right half page happened to reuse
1406 * a block that we might have processed at some point before it was
1407 * recycled (i.e. before the page split).
1408 *
1409 * Returns BlockNumber of a scanned page (not backtracked).
1410 */
1411static BlockNumber
1413{
1414 IndexVacuumInfo *info = vstate->info;
1415 IndexBulkDeleteResult *stats = vstate->stats;
1417 void *callback_state = vstate->callback_state;
1418 Relation rel = info->index;
1419 Relation heaprel = info->heaprel;
1420 bool attempt_pagedel;
1421 BlockNumber blkno,
1424 Page page;
1425 BTPageOpaque opaque;
1426
1427 blkno = scanblkno;
1428
1429backtrack:
1430
1431 attempt_pagedel = false;
1433
1434 _bt_lockbuf(rel, buf, BT_READ);
1435 page = BufferGetPage(buf);
1436 opaque = NULL;
1437 if (!PageIsNew(page))
1438 {
1439 _bt_checkpage(rel, buf);
1440 opaque = BTPageGetOpaque(page);
1441 }
1442
1443 Assert(blkno <= scanblkno);
1444 if (blkno != scanblkno)
1445 {
1446 /*
1447 * We're backtracking.
1448 *
1449 * We followed a right link to a sibling leaf page (a page that
1450 * happens to be from a block located before scanblkno). The only
1451 * case we want to do anything with is a live leaf page having the
1452 * current vacuum cycle ID.
1453 *
1454 * The page had better be in a state that's consistent with what we
1455 * expect. Check for conditions that imply corruption in passing. It
1456 * can't be half-dead because only an interrupted VACUUM process can
1457 * leave pages in that state, so we'd definitely have dealt with it
1458 * back when the page was the scanblkno page (half-dead pages are
1459 * always marked fully deleted by _bt_pagedel(), barring corruption).
1460 */
1461 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1462 {
1463 Assert(false);
1464 ereport(LOG,
1466 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1467 blkno, scanblkno, RelationGetRelationName(rel))));
1468 _bt_relbuf(rel, buf);
1469 return scanblkno;
1470 }
1471
1472 /*
1473 * We may have already processed the page in an earlier call, when the
1474 * page was scanblkno. This happens when the leaf page split occurred
1475 * after the scan began, but before the right sibling page became the
1476 * scanblkno.
1477 *
1478 * Page may also have been deleted by current btvacuumpage() call,
1479 * since _bt_pagedel() sometimes deletes the right sibling page of
1480 * scanblkno in passing (it does so after we decided where to
1481 * backtrack to). We don't need to process this page as a deleted
1482 * page a second time now (in fact, it would be wrong to count it as a
1483 * deleted page in the bulk delete statistics a second time).
1484 */
1485 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1486 {
1487 /* Done with current scanblkno (and all lower split pages) */
1488 _bt_relbuf(rel, buf);
1489 return scanblkno;
1490 }
1491 }
1492
1493 if (!opaque || BTPageIsRecyclable(page, heaprel))
1494 {
1495 /* Okay to recycle this page (which could be leaf or internal) */
1496 RecordFreeIndexPage(rel, blkno);
1497 stats->pages_deleted++;
1498 stats->pages_free++;
1499 }
1500 else if (P_ISDELETED(opaque))
1501 {
1502 /*
1503 * Already deleted page (which could be leaf or internal). Can't
1504 * recycle yet.
1505 */
1506 stats->pages_deleted++;
1507 }
1508 else if (P_ISHALFDEAD(opaque))
1509 {
1510 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1511 attempt_pagedel = true;
1512
1513 /*
1514 * _bt_pagedel() will increment both pages_newly_deleted and
1515 * pages_deleted stats in all cases (barring corruption)
1516 */
1517 }
1518 else if (P_ISLEAF(opaque))
1519 {
1521 int ndeletable;
1523 int nupdatable;
1524 OffsetNumber offnum,
1525 minoff,
1526 maxoff;
1527 int nhtidsdead,
1528 nhtidslive;
1529
1530 /*
1531 * Trade in the initial read lock for a full cleanup lock on this
1532 * page. We must get such a lock on every leaf page over the course
1533 * of the vacuum scan, whether or not it actually contains any
1534 * deletable tuples --- see nbtree/README.
1535 */
1537
1538 /*
1539 * Check whether we need to backtrack to earlier pages. What we are
1540 * concerned about is a page split that happened since we started the
1541 * vacuum scan. If the split moved tuples on the right half of the
1542 * split (i.e. the tuples that sort high) to a block that we already
1543 * passed over, then we might have missed the tuples. We need to
1544 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1545 * or deleting scanblkno page below!)
1546 */
1547 if (vstate->cycleid != 0 &&
1548 opaque->btpo_cycleid == vstate->cycleid &&
1549 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1550 !P_RIGHTMOST(opaque) &&
1551 opaque->btpo_next < scanblkno)
1552 backtrack_to = opaque->btpo_next;
1553
1554 ndeletable = 0;
1555 nupdatable = 0;
1556 minoff = P_FIRSTDATAKEY(opaque);
1557 maxoff = PageGetMaxOffsetNumber(page);
1558 nhtidsdead = 0;
1559 nhtidslive = 0;
1560 if (callback)
1561 {
1562 /* btbulkdelete callback tells us what to delete (or update) */
1563 for (offnum = minoff;
1564 offnum <= maxoff;
1565 offnum = OffsetNumberNext(offnum))
1566 {
1567 IndexTuple itup;
1568
1569 itup = (IndexTuple) PageGetItem(page,
1570 PageGetItemId(page, offnum));
1571
1572 Assert(!BTreeTupleIsPivot(itup));
1573 if (!BTreeTupleIsPosting(itup))
1574 {
1575 /* Regular tuple, standard table TID representation */
1576 if (callback(&itup->t_tid, callback_state))
1577 {
1578 deletable[ndeletable++] = offnum;
1579 nhtidsdead++;
1580 }
1581 else
1582 nhtidslive++;
1583 }
1584 else
1585 {
1587 int nremaining;
1588
1589 /* Posting list tuple */
1590 vacposting = btreevacuumposting(vstate, itup, offnum,
1591 &nremaining);
1592 if (vacposting == NULL)
1593 {
1594 /*
1595 * All table TIDs from the posting tuple remain, so no
1596 * delete or update required
1597 */
1599 }
1600 else if (nremaining > 0)
1601 {
1602
1603 /*
1604 * Store metadata about posting list tuple in
1605 * updatable array for entire page. Existing tuple
1606 * will be updated during the later call to
1607 * _bt_delitems_vacuum().
1608 */
1610 updatable[nupdatable++] = vacposting;
1612 }
1613 else
1614 {
1615 /*
1616 * All table TIDs from the posting list must be
1617 * deleted. We'll delete the index tuple completely
1618 * (no update required).
1619 */
1620 Assert(nremaining == 0);
1621 deletable[ndeletable++] = offnum;
1624 }
1625
1627 }
1628 }
1629 }
1630
1631 /*
1632 * Apply any needed deletes or updates. We issue just one
1633 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1634 */
1635 if (ndeletable > 0 || nupdatable > 0)
1636 {
1638 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1639 nupdatable);
1640
1641 stats->tuples_removed += nhtidsdead;
1642 /* must recompute maxoff */
1643 maxoff = PageGetMaxOffsetNumber(page);
1644
1645 /* can't leak memory here */
1646 for (int i = 0; i < nupdatable; i++)
1647 pfree(updatable[i]);
1648 }
1649 else
1650 {
1651 /*
1652 * If the leaf page has been split during this vacuum cycle, it
1653 * seems worth expending a write to clear btpo_cycleid even if we
1654 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1655 * takes care of this.) This ensures we won't process the page
1656 * again.
1657 *
1658 * We treat this like a hint-bit update because there's no need to
1659 * WAL-log it.
1660 */
1661 Assert(nhtidsdead == 0);
1662 if (vstate->cycleid != 0 &&
1663 opaque->btpo_cycleid == vstate->cycleid)
1664 {
1665 opaque->btpo_cycleid = 0;
1666 MarkBufferDirtyHint(buf, true);
1667 }
1668 }
1669
1670 /*
1671 * If the leaf page is now empty, try to delete it; else count the
1672 * live tuples (live table TIDs in posting lists are counted as
1673 * separate live tuples). We don't delete when backtracking, though,
1674 * since that would require teaching _bt_pagedel() about backtracking
1675 * (doesn't seem worth adding more complexity to deal with that).
1676 *
1677 * We don't count the number of live TIDs during cleanup-only calls to
1678 * btvacuumscan (i.e. when callback is not set). We count the number
1679 * of index tuples directly instead. This avoids the expense of
1680 * directly examining all of the tuples on each page. VACUUM will
1681 * treat num_index_tuples as an estimate in cleanup-only case, so it
1682 * doesn't matter that this underestimates num_index_tuples
1683 * significantly in some cases.
1684 */
1685 if (minoff > maxoff)
1686 attempt_pagedel = (blkno == scanblkno);
1687 else if (callback)
1688 stats->num_index_tuples += nhtidslive;
1689 else
1690 stats->num_index_tuples += maxoff - minoff + 1;
1691
1693 }
1694
1695 if (attempt_pagedel)
1696 {
1697 MemoryContext oldcontext;
1698
1699 /* Run pagedel in a temp context to avoid memory leakage */
1700 MemoryContextReset(vstate->pagedelcontext);
1701 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1702
1703 /*
1704 * _bt_pagedel maintains the bulk delete stats on our behalf;
1705 * pages_newly_deleted and pages_deleted are likely to be incremented
1706 * during call
1707 */
1708 Assert(blkno == scanblkno);
1709 _bt_pagedel(rel, buf, vstate);
1710
1711 MemoryContextSwitchTo(oldcontext);
1712 /* pagedel released buffer, so we shouldn't */
1713 }
1714 else
1715 _bt_relbuf(rel, buf);
1716
1717 if (backtrack_to != P_NONE)
1718 {
1719 blkno = backtrack_to;
1720
1721 /* check for vacuum delay while not holding any buffer lock */
1722 vacuum_delay_point(false);
1723
1724 /*
1725 * We can't use _bt_getbuf() here because it always applies
1726 * _bt_checkpage(), which will barf on an all-zero page. We want to
1727 * recycle all-zero pages, not fail. Also, we want to use a
1728 * nondefault buffer access strategy.
1729 */
1731 info->strategy);
1732 goto backtrack;
1733 }
1734
1735 return scanblkno;
1736}
1737
1738/*
1739 * btreevacuumposting --- determine TIDs still needed in posting list
1740 *
1741 * Returns metadata describing how to build replacement tuple without the TIDs
1742 * that VACUUM needs to delete. Returned value is NULL in the common case
1743 * where no changes are needed to caller's posting list tuple (we avoid
1744 * allocating memory here as an optimization).
1745 *
1746 * The number of TIDs that should remain in the posting list tuple is set for
1747 * caller in *nremaining.
1748 */
1749static BTVacuumPosting
1751 OffsetNumber updatedoffset, int *nremaining)
1752{
1753 int live = 0;
1754 int nitem = BTreeTupleGetNPosting(posting);
1757
1758 for (int i = 0; i < nitem; i++)
1759 {
1760 if (!vstate->callback(items + i, vstate->callback_state))
1761 {
1762 /* Live table TID */
1763 live++;
1764 }
1765 else if (vacposting == NULL)
1766 {
1767 /*
1768 * First dead table TID encountered.
1769 *
1770 * It's now clear that we need to delete one or more dead table
1771 * TIDs, so start maintaining metadata describing how to update
1772 * existing posting list tuple.
1773 */
1775 nitem * sizeof(uint16));
1776
1777 vacposting->itup = posting;
1778 vacposting->updatedoffset = updatedoffset;
1779 vacposting->ndeletedtids = 0;
1780 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1781 }
1782 else
1783 {
1784 /* Second or subsequent dead table TID */
1785 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1786 }
1787 }
1788
1789 *nremaining = live;
1790 return vacposting;
1791}
1792
1793/*
1794 * btcanreturn() -- Check whether btree indexes support index-only scans.
1795 *
1796 * btrees always do, so this is trivial.
1797 */
1798bool
1800{
1801 return true;
1802}
1803
1804/*
1805 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1806 */
1807int
1809{
1810 return _bt_getrootheight(rel);
1811}
1812
1815{
1816 switch (strategy)
1817 {
1819 return COMPARE_LT;
1821 return COMPARE_LE;
1823 return COMPARE_EQ;
1825 return COMPARE_GE;
1827 return COMPARE_GT;
1828 default:
1829 return COMPARE_INVALID;
1830 }
1831}
1832
1835{
1836 switch (cmptype)
1837 {
1838 case COMPARE_LT:
1839 return BTLessStrategyNumber;
1840 case COMPARE_LE:
1842 case COMPARE_EQ:
1843 return BTEqualStrategyNumber;
1844 case COMPARE_GE:
1846 case COMPARE_GT:
1848 default:
1849 return InvalidStrategy;
1850 }
1851}
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static Datum values[MAXATTR]
Definition bootstrap.c:188
int Buffer
Definition buf.h:23
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5650
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4426
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5801
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:309
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:472
@ RBM_NORMAL
Definition bufmgr.h:46
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:423
static bool PageIsNew(const PageData *page)
Definition bufpage.h:259
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
#define OffsetToPointer(base, offset)
Definition c.h:857
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
#define FLEXIBLE_ARRAY_MEMBER
Definition c.h:552
int16_t int16
Definition c.h:613
uint16_t uint16
Definition c.h:617
size_t Size
Definition c.h:691
CompareType
Definition cmptype.h:32
@ COMPARE_LE
Definition cmptype.h:35
@ COMPARE_INVALID
Definition cmptype.h:33
@ COMPARE_GT
Definition cmptype.h:38
@ COMPARE_EQ
Definition cmptype.h:36
@ COMPARE_GE
Definition cmptype.h:37
@ COMPARE_LT
Definition cmptype.h:34
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumRestore(char **start_address, bool *isnull)
Definition datum.c:559
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition datum.c:497
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition datum.c:450
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define ereport(elevel,...)
Definition elog.h:150
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_object(type)
Definition fe_memutils.h:75
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition genam.h:95
IndexUniqueCheck
Definition genam.h:124
void IndexFreeSpaceMapVacuum(Relation rel)
Definition indexfsm.c:71
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition indexfsm.c:52
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition indextuple.c:44
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:52
int i
Definition isn.c:77
IndexTupleData * IndexTuple
Definition itup.h:53
#define MaxIndexTuplesPerPage
Definition itup.h:181
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1153
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1770
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:673
@ LW_EXCLUSIVE
Definition lwlock.h:104
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:403
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:105
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1044
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:680
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition nbtpage.c:1832
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition nbtpage.c:1182
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:802
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition nbtpage.c:233
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition nbtpage.c:1137
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition nbtpage.c:180
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition nbtpage.c:3033
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1067
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition nbtpage.c:2991
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition nbtree.c:1085
bool btcanreturn(Relation index, int attno)
Definition nbtree.c:1799
BTPS_State
Definition nbtree.c:57
@ BTPARALLEL_ADVANCING
Definition nbtree.c:60
@ BTPARALLEL_NEED_PRIMSCAN
Definition nbtree.c:59
@ BTPARALLEL_NOT_INITIALIZED
Definition nbtree.c:58
@ BTPARALLEL_IDLE
Definition nbtree.c:61
@ BTPARALLEL_DONE
Definition nbtree.c:62
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:870
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition nbtree.c:1834
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:339
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition nbtree.c:653
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition nbtree.c:1412
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:575
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1035
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition nbtree.c:1149
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition nbtree.c:1750
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition nbtree.c:1814
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition nbtree.c:230
void btparallelrescan(IndexScanDesc scan)
Definition nbtree.c:827
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition nbtree.c:206
void btbuildempty(Relation index)
Definition nbtree.c:183
int btgettreeheight(Relation rel)
Definition nbtree.c:1808
void btinitparallelscan(void *target)
Definition nbtree.c:811
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition nbtree.c:1119
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition nbtree.c:1237
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:717
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition nbtree.c:291
void btmarkpos(IndexScanDesc scan)
Definition nbtree.c:491
void btendscan(IndexScanDesc scan)
Definition nbtree.c:455
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition nbtree.c:388
struct BTParallelScanDescData * BTParallelScanDesc
Definition nbtree.c:95
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1008
Datum bthandler(PG_FUNCTION_ARGS)
Definition nbtree.c:118
void btrestrpos(IndexScanDesc scan)
Definition nbtree.c:517
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:760
#define P_ISHALFDEAD(opaque)
Definition nbtree.h:225
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define SK_BT_SKIP
Definition nbtree.h:1106
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_ISDELETED(opaque)
Definition nbtree.h:223
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538
#define BTNProcs
Definition nbtree.h:723
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
uint16 BTCycleId
Definition nbtree.h:30
#define P_NONE
Definition nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define BTREE_METAPAGE
Definition nbtree.h:149
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
#define BTMaxItemSize
Definition nbtree.h:165
#define BTP_SPLIT_END
Definition nbtree.h:82
#define BTOPTIONS_PROC
Definition nbtree.h:721
#define SK_BT_MINVAL
Definition nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:883
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition nbtsort.c:299
void _bt_end_vacuum(Relation rel)
Definition nbtutils.c:522
void _bt_end_vacuum_callback(int code, Datum arg)
Definition nbtutils.c:550
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:190
char * btbuildphasename(int64 phasenum)
Definition nbtutils.c:646
bytea * btoptions(Datum reloptions, bool validate)
Definition nbtutils.c:600
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition nbtutils.c:623
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1177
BTCycleId _bt_start_vacuum(Relation rel)
Definition nbtutils.c:465
bool btvalidate(Oid opclassoid)
Definition nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
int16 attnum
static char buf[DEFAULT_XLOG_SEG_SIZE]
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:148
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:147
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition rel.h:657
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetRelationName(relation)
Definition rel.h:548
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition selfuncs.c:7690
Size add_size(Size s1, Size s2)
Definition shmem.c:455
#define SK_SEARCHNULL
Definition skey.h:121
ScanKeyData * ScanKey
Definition skey.h:75
#define SK_ISNULL
Definition skey.h:115
#define IsMVCCLikeSnapshot(snapshot)
Definition snapmgr.h:74
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTMaxStrategyNumber
Definition stratnum.h:35
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
Datum * elem_values
Definition nbtree.h:1041
BlockNumber btpo_next
Definition nbtree.h:66
uint16 btpo_flags
Definition nbtree.h:68
BTCycleId btpo_cycleid
Definition nbtree.h:69
BTPS_State btps_pageStatus
Definition nbtree.c:74
BlockNumber btps_lastCurrPage
Definition nbtree.c:72
ConditionVariable btps_cv
Definition nbtree.c:78
BlockNumber btps_nextScanPage
Definition nbtree.c:71
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition nbtree.c:85
NodeTag type
Definition amapi.h:234
BlockNumber pages_deleted
Definition genam.h:90
BlockNumber pages_free
Definition genam.h:91
BlockNumber num_pages
Definition genam.h:85
double tuples_removed
Definition genam.h:88
double num_index_tuples
Definition genam.h:87
struct ScanKeyData * keyData
Definition relscan.h:142
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:192
bool kill_prior_tuple
Definition relscan.h:148
struct TupleDescData * xs_itupdesc
Definition relscan.h:169
Relation indexRelation
Definition relscan.h:138
ItemPointerData xs_heaptid
Definition relscan.h:173
struct SnapshotData * xs_snapshot
Definition relscan.h:139
Relation heapRelation
Definition relscan.h:137
ItemPointerData t_tid
Definition itup.h:37
Relation index
Definition genam.h:54
double num_heap_tuples
Definition genam.h:60
bool analyze_only
Definition genam.h:56
BufferAccessStrategy strategy
Definition genam.h:61
Relation heaprel
Definition genam.h:55
bool report_progress
Definition genam.h:57
bool estimated_count
Definition genam.h:58
TupleDesc rd_att
Definition rel.h:112
Definition type.h:96
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
static ItemArray items
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition tidbitmap.c:367
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:195
void vacuum_delay_point(bool is_analyze)
Definition vacuum.c:2431
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition vacuum.h:47
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition vacuum.h:54