PostgreSQL Source Code (git master): nbtree.c
1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
29#include "storage/condition_variable.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "storage/read_stream.h"
34#include "utils/datum.h"
35#include "utils/fmgrprotos.h"
36#include "utils/index_selfuncs.h"
37#include "utils/memutils.h"
38
39
40/*
41 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 *
43 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 * scan to advance it via another call to _bt_first.
45 *
46 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 * a new page; others must wait.
48 *
49 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 * to a new page; some process can start doing that.
51 *
52 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 */
54typedef enum
55{
56 BTPARALLEL_NOT_INITIALIZED,
57 BTPARALLEL_NEED_PRIMSCAN,
58 BTPARALLEL_ADVANCING,
59 BTPARALLEL_IDLE,
60 BTPARALLEL_DONE,
61} BTPS_State;
62
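/*
 * Illustrative sketch (not part of the original file): a hypothetical helper
 * that maps the states above to names for debugging output.  Only the enum
 * values themselves come from this file.
 *
 *	static const char *
 *	btps_state_name(BTPS_State state)
 *	{
 *		switch (state)
 *		{
 *			case BTPARALLEL_NOT_INITIALIZED: return "not initialized";
 *			case BTPARALLEL_NEED_PRIMSCAN: return "need primitive scan";
 *			case BTPARALLEL_ADVANCING: return "advancing";
 *			case BTPARALLEL_IDLE: return "idle";
 *			case BTPARALLEL_DONE: return "done";
 *		}
 *		return "unknown";
 *	}
 */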
63/*
64 * BTParallelScanDescData contains btree specific shared information required
65 * for parallel scan.
66 */
67typedef struct BTParallelScanDescData
68{
69 BlockNumber btps_nextScanPage; /* next page to be scanned */
70 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 * btps_nextScanPage */
72 BTPS_State btps_pageStatus; /* indicates whether next page is
73 * available for scan. see above for
74 * possible states of parallel scan. */
75 LWLock btps_lock; /* protects shared parallel state */
76 ConditionVariable btps_cv; /* used to synchronize parallel scan */
77
78 /*
79 * btps_arrElems is used when scans need to schedule another primitive
80 * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 * offsets for each = scan key associated with a ScalarArrayOp array.
82 */
83 int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84
85 /*
86 * Additional space (at the end of the struct) is used when scans need to
87 * schedule another primitive index scan with one or more skip arrays.
88 * Holds a flattened datum representation for each = scan key associated
89 * with a skip array.
90 */
91} BTParallelScanDescData;
92
93typedef struct BTParallelScanDescData *BTParallelScanDesc;
94
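/*
 * Minimal sketch (not in the original source) of how the variable-size tail
 * of BTParallelScanDescData is addressed.  The serialized skip-array state
 * begins immediately after the btps_arrElems[] flexible array, which is how
 * _bt_parallel_serialize_arrays() below locates it; btscan and numArrayKeys
 * are stand-ins for caller-supplied values.
 *
 *	char	   *datumshared;
 *
 *	datumshared = (char *) &btscan->btps_arrElems[numArrayKeys];
 *	// datumshared now points at the first byte reserved for each skip
 *	// array's flattened sk_flags and serialized datum
 */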
95
96static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
97 BTScanOpaque so);
98static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
99 BTScanOpaque so);
100static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
101 IndexBulkDeleteCallback callback, void *callback_state,
102 BTCycleId cycleid);
103static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
104static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
105 IndexTuple posting,
106 OffsetNumber updatedoffset,
107 int *nremaining);
108
109
110/*
111 * Btree handler function: return IndexAmRoutine with access method parameters
112 * and callbacks.
113 */
114Datum
115bthandler(PG_FUNCTION_ARGS)
116{
117 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
118
119 amroutine->amstrategies = BTMaxStrategyNumber;
120 amroutine->amsupport = BTNProcs;
121 amroutine->amoptsprocnum = BTOPTIONS_PROC;
122 amroutine->amcanorder = true;
123 amroutine->amcanorderbyop = false;
124 amroutine->amcanhash = false;
125 amroutine->amconsistentequality = true;
126 amroutine->amconsistentordering = true;
127 amroutine->amcanbackward = true;
128 amroutine->amcanunique = true;
129 amroutine->amcanmulticol = true;
130 amroutine->amoptionalkey = true;
131 amroutine->amsearcharray = true;
132 amroutine->amsearchnulls = true;
133 amroutine->amstorage = false;
134 amroutine->amclusterable = true;
135 amroutine->ampredlocks = true;
136 amroutine->amcanparallel = true;
137 amroutine->amcanbuildparallel = true;
138 amroutine->amcaninclude = true;
139 amroutine->amusemaintenanceworkmem = false;
140 amroutine->amsummarizing = false;
141 amroutine->amparallelvacuumoptions =
142 VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
143 amroutine->amkeytype = InvalidOid;
144
145 amroutine->ambuild = btbuild;
146 amroutine->ambuildempty = btbuildempty;
147 amroutine->aminsert = btinsert;
148 amroutine->aminsertcleanup = NULL;
149 amroutine->ambulkdelete = btbulkdelete;
150 amroutine->amvacuumcleanup = btvacuumcleanup;
151 amroutine->amcanreturn = btcanreturn;
152 amroutine->amcostestimate = btcostestimate;
153 amroutine->amgettreeheight = btgettreeheight;
154 amroutine->amoptions = btoptions;
155 amroutine->amproperty = btproperty;
156 amroutine->ambuildphasename = btbuildphasename;
157 amroutine->amvalidate = btvalidate;
158 amroutine->amadjustmembers = btadjustmembers;
159 amroutine->ambeginscan = btbeginscan;
160 amroutine->amrescan = btrescan;
161 amroutine->amgettuple = btgettuple;
162 amroutine->amgetbitmap = btgetbitmap;
163 amroutine->amendscan = btendscan;
164 amroutine->ammarkpos = btmarkpos;
165 amroutine->amrestrpos = btrestrpos;
166 amroutine->amestimateparallelscan = btestimateparallelscan;
167 amroutine->aminitparallelscan = btinitparallelscan;
168 amroutine->amparallelrescan = btparallelrescan;
169 amroutine->amtranslatestrategy = bttranslatestrategy;
170 amroutine->amtranslatecmptype = bttranslatecmptype;
171
172 PG_RETURN_POINTER(amroutine);
173}
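/*
 * Hedged example (not in the original file): core code normally reaches this
 * handler through the amapi lookup routines rather than calling it directly.
 * Assuming GetIndexAmRoutineByAmId() from amapi.h and BTREE_AM_OID from
 * pg_am_d.h, a caller could inspect the btree capability flags like this:
 *
 *	IndexAmRoutine *routine = GetIndexAmRoutineByAmId(BTREE_AM_OID, false);
 *
 *	if (routine->amcanorder && routine->amcanparallel)
 *		elog(DEBUG1, "btree supports ordered and parallel scans");
 */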
174
175/*
176 * btbuildempty() -- build an empty btree index in the initialization fork
177 */
178void
179btbuildempty(Relation index)
180{
181 bool allequalimage = _bt_allequalimage(index, false);
182 BulkWriteState *bulkstate;
183 BulkWriteBuffer metabuf;
184
185 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
186
187 /* Construct metapage. */
188 metabuf = smgr_bulk_get_buf(bulkstate);
189 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
190 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
191
192 smgr_bulk_finish(bulkstate);
193}
194
195/*
196 * btinsert() -- insert an index tuple into a btree.
197 *
198 * Descend the tree recursively, find the appropriate location for our
199 * new tuple, and put it there.
200 */
201bool
202btinsert(Relation rel, Datum *values, bool *isnull,
203 ItemPointer ht_ctid, Relation heapRel,
204 IndexUniqueCheck checkUnique,
205 bool indexUnchanged,
206 IndexInfo *indexInfo)
207{
208 bool result;
209 IndexTuple itup;
210
211 /* generate an index tuple */
212 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
213 itup->t_tid = *ht_ctid;
214
215 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
216
217 pfree(itup);
218
219 return result;
220}
221
222/*
223 * btgettuple() -- Get the next tuple in the scan.
224 */
225bool
226btgettuple(IndexScanDesc scan, ScanDirection dir)
227{
228 BTScanOpaque so = (BTScanOpaque) scan->opaque;
229 bool res;
230
231 /* btree indexes are never lossy */
232 scan->xs_recheck = false;
233
234 /* Each loop iteration performs another primitive index scan */
235 do
236 {
237 /*
238 * If we've already initialized this scan, we can just advance it in
239 * the appropriate direction. If we haven't done so yet, we call
240 * _bt_first() to get the first item in the scan.
241 */
242 if (!BTScanPosIsValid(so->currPos))
243 res = _bt_first(scan, dir);
244 else
245 {
246 /*
247 * Check to see if we should kill the previously-fetched tuple.
248 */
249 if (scan->kill_prior_tuple)
250 {
251 /*
252 * Yes, remember it for later. (We'll deal with all such
253 * tuples at once right before leaving the index page.) The
254 * test for numKilled overrun is not just paranoia: if the
255 * caller reverses direction in the indexscan then the same
256 * item might get entered multiple times. It's not worth
257 * trying to optimize that, so we don't detect it, but instead
258 * just forget any excess entries.
259 */
260 if (so->killedItems == NULL)
261 so->killedItems = (int *)
262 palloc(MaxTIDsPerBTreePage * sizeof(int));
263 if (so->numKilled < MaxTIDsPerBTreePage)
264 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
265 }
266
267 /*
268 * Now continue the scan.
269 */
270 res = _bt_next(scan, dir);
271 }
272
273 /* If we have a tuple, return it ... */
274 if (res)
275 break;
276 /* ... otherwise see if we need another primitive index scan */
277 } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
278
279 return res;
280}
281
282/*
283 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
284 */
285int64
286btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
287{
288 BTScanOpaque so = (BTScanOpaque) scan->opaque;
289 int64 ntids = 0;
290 ItemPointer heapTid;
291
292 /* Each loop iteration performs another primitive index scan */
293 do
294 {
295 /* Fetch the first page & tuple */
296 if (_bt_first(scan, ForwardScanDirection))
297 {
298 /* Save tuple ID, and continue scanning */
299 heapTid = &scan->xs_heaptid;
300 tbm_add_tuples(tbm, heapTid, 1, false);
301 ntids++;
302
303 for (;;)
304 {
305 /*
306 * Advance to next tuple within page. This is the same as the
307 * easy case in _bt_next().
308 */
309 if (++so->currPos.itemIndex > so->currPos.lastItem)
310 {
311 /* let _bt_next do the heavy lifting */
312 if (!_bt_next(scan, ForwardScanDirection))
313 break;
314 }
315
316 /* Save tuple ID, and continue scanning */
317 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
318 tbm_add_tuples(tbm, heapTid, 1, false);
319 ntids++;
320 }
321 }
322 /* Now see if we need another primitive index scan */
323 } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
324
325 return ntids;
326}
327
328/*
329 * btbeginscan() -- start a scan on a btree index
330 */
331IndexScanDesc
332btbeginscan(Relation rel, int nkeys, int norderbys)
333{
334 IndexScanDesc scan;
335 BTScanOpaque so;
336
337 /* no order by operators allowed */
338 Assert(norderbys == 0);
339
340 /* get the scan */
341 scan = RelationGetIndexScan(rel, nkeys, norderbys);
342
343 /* allocate private workspace */
344 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
345 BTScanPosInvalidate(so->currPos);
346 BTScanPosInvalidate(so->markPos);
347 if (scan->numberOfKeys > 0)
348 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
349 else
350 so->keyData = NULL;
351
352 so->skipScan = false;
353 so->needPrimScan = false;
354 so->scanBehind = false;
355 so->oppositeDirCheck = false;
356 so->arrayKeys = NULL;
357 so->orderProcs = NULL;
358 so->arrayContext = NULL;
359
360 so->killedItems = NULL; /* until needed */
361 so->numKilled = 0;
362
363 /*
364 * We don't know yet whether the scan will be index-only, so we do not
365 * allocate the tuple workspace arrays until btrescan. However, we set up
366 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
367 */
368 so->currTuples = so->markTuples = NULL;
369
370 scan->xs_itupdesc = RelationGetDescr(rel);
371
372 scan->opaque = so;
373
374 return scan;
375}
376
377/*
378 * btrescan() -- rescan an index relation
379 */
380void
381btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
382 ScanKey orderbys, int norderbys)
383{
384 BTScanOpaque so = (BTScanOpaque) scan->opaque;
385
386 /* we aren't holding any read locks, but gotta drop the pins */
387 if (BTScanPosIsValid(so->currPos))
388 {
389 /* Before leaving current page, deal with any killed items */
390 if (so->numKilled > 0)
391 _bt_killitems(scan);
392 BTScanPosUnpinIfPinned(so->currPos);
393 BTScanPosInvalidate(so->currPos);
394 }
395
396 so->markItemIndex = -1;
397 so->needPrimScan = false;
398 so->scanBehind = false;
399 so->oppositeDirCheck = false;
400 BTScanPosUnpinIfPinned(so->markPos);
401 BTScanPosInvalidate(so->markPos);
402
403 /*
404 * Allocate tuple workspace arrays, if needed for an index-only scan and
405 * not already done in a previous rescan call. To save on palloc
406 * overhead, both workspaces are allocated as one palloc block; only this
407 * function and btendscan know that.
408 *
409 * NOTE: this data structure also makes it safe to return data from a
410 * "name" column, even though btree name_ops uses an underlying storage
411 * datatype of cstring. The risk there is that "name" is supposed to be
412 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
413 * However, since we only return data out of tuples sitting in the
414 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
415 * data out of the markTuples array --- running off the end of memory for
416 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
417 * adding special-case treatment for name_ops elsewhere.
418 */
419 if (scan->xs_want_itup && so->currTuples == NULL)
420 {
421 so->currTuples = (char *) palloc(BLCKSZ * 2);
422 so->markTuples = so->currTuples + BLCKSZ;
423 }
424
425 /*
426 * Reset the scan keys
427 */
428 if (scankey && scan->numberOfKeys > 0)
429 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
430 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
431 so->numArrayKeys = 0; /* ditto */
432}
433
434/*
435 * btendscan() -- close down a scan
436 */
437void
438btendscan(IndexScanDesc scan)
439{
440 BTScanOpaque so = (BTScanOpaque) scan->opaque;
441
442 /* we aren't holding any read locks, but gotta drop the pins */
443 if (BTScanPosIsValid(so->currPos))
444 {
445 /* Before leaving current page, deal with any killed items */
446 if (so->numKilled > 0)
447 _bt_killitems(scan);
448 BTScanPosUnpinIfPinned(so->currPos);
449 }
450
451 so->markItemIndex = -1;
452 BTScanPosUnpinIfPinned(so->markPos);
453
454 /* No need to invalidate positions, the RAM is about to be freed. */
455
456 /* Release storage */
457 if (so->keyData != NULL)
458 pfree(so->keyData);
459 /* so->arrayKeys and so->orderProcs are in arrayContext */
460 if (so->arrayContext != NULL)
461 MemoryContextDelete(so->arrayContext);
462 if (so->killedItems != NULL)
463 pfree(so->killedItems);
464 if (so->currTuples != NULL)
465 pfree(so->currTuples);
466 /* so->markTuples should not be pfree'd, see btrescan */
467 pfree(so);
468}
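/*
 * Sketch (not part of the original source) of the call sequence the indexam
 * layer drives for a plain forward scan, using only the callbacks registered
 * in bthandler() above.  Relation opening, scan key construction, and heap
 * fetches are elided.
 *
 *	IndexScanDesc scan = btbeginscan(indexRel, nkeys, 0);
 *
 *	btrescan(scan, scankeys, nkeys, NULL, 0);
 *	while (btgettuple(scan, ForwardScanDirection))
 *	{
 *		// scan->xs_heaptid now identifies the next matching heap tuple
 *	}
 *	btendscan(scan);
 */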
469
470/*
471 * btmarkpos() -- save current scan position
472 */
473void
474btmarkpos(IndexScanDesc scan)
475{
476 BTScanOpaque so = (BTScanOpaque) scan->opaque;
477
478 /* There may be an old mark with a pin (but no lock). */
479 BTScanPosUnpinIfPinned(so->markPos);
480
481 /*
482 * Just record the current itemIndex. If we later step to next page
483 * before releasing the marked position, _bt_steppage makes a full copy of
484 * the currPos struct in markPos. If (as often happens) the mark is moved
485 * before we leave the page, we don't have to do that work.
486 */
487 if (BTScanPosIsValid(so->currPos))
488 so->markItemIndex = so->currPos.itemIndex;
489 else
490 {
491 BTScanPosInvalidate(so->markPos);
492 so->markItemIndex = -1;
493 }
494}
495
496/*
497 * btrestrpos() -- restore scan to last saved position
498 */
499void
500btrestrpos(IndexScanDesc scan)
501{
502 BTScanOpaque so = (BTScanOpaque) scan->opaque;
503
504 if (so->markItemIndex >= 0)
505 {
506 /*
507 * The scan has never moved to a new page since the last mark. Just
508 * restore the itemIndex.
509 *
510 * NB: In this case we can't count on anything in so->markPos to be
511 * accurate.
512 */
513 so->currPos.itemIndex = so->markItemIndex;
514 }
515 else
516 {
517 /*
518 * The scan moved to a new page after last mark or restore, and we are
519 * now restoring to the marked page. We aren't holding any read
520 * locks, but if we're still holding the pin for the current position,
521 * we must drop it.
522 */
523 if (BTScanPosIsValid(so->currPos))
524 {
525 /* Before leaving current page, deal with any killed items */
526 if (so->numKilled > 0)
527 _bt_killitems(scan);
528 BTScanPosUnpinIfPinned(so->currPos);
529 }
530
531 if (BTScanPosIsValid(so->markPos))
532 {
533 /* bump pin on mark buffer for assignment to current buffer */
534 if (BTScanPosIsPinned(so->markPos))
535 IncrBufferRefCount(so->markPos.buf);
536 memcpy(&so->currPos, &so->markPos,
537 offsetof(BTScanPosData, items[1]) +
538 so->markPos.lastItem * sizeof(BTScanPosItem));
539 if (so->currTuples)
540 memcpy(so->currTuples, so->markTuples,
541 so->markPos.nextTupleOffset);
542 /* Reset the scan's array keys (see _bt_steppage for why) */
543 if (so->numArrayKeys)
544 {
545 _bt_start_array_keys(scan, so->currPos.dir);
546 so->needPrimScan = false;
547 }
548 }
549 else
550 BTScanPosInvalidate(so->currPos);
551 }
552}
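/*
 * Sketch (not in the original source): mark/restore is used by callers such
 * as merge join to back up to a previously returned position.  Using only the
 * functions defined above:
 *
 *	btmarkpos(scan);				// remember the current item
 *	(void) btgettuple(scan, ForwardScanDirection);	// read ahead
 *	btrestrpos(scan);				// return to the marked item
 *
 * The cheap path just records so->markItemIndex; the expensive path (taken
 * once the scan has left the marked page) copies markPos back into currPos.
 */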
553
554/*
555 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
556 */
557Size
558btestimateparallelscan(Relation rel, int nkeys, int norderbys)
559{
560 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
561 Size estnbtreeshared,
562 genericattrspace;
563
564 /*
565 * Pessimistically assume that every input scan key will be output with
566 * its own SAOP array
567 */
568 estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
569 sizeof(int) * nkeys;
570
571 /* Single column indexes cannot possibly use a skip array */
572 if (nkeyatts == 1)
573 return estnbtreeshared;
574
575 /*
576 * Pessimistically assume that all attributes prior to the least
577 * significant attribute require a skip array (and an associated key)
578 */
579 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
580 sizeof(Datum));
581 for (int attnum = 1; attnum < nkeyatts; attnum++)
582 {
583 CompactAttribute *attr;
584
585 /*
586 * We make the conservative assumption that every index column will
587 * also require a skip array.
588 *
589 * Every skip array must have space to store its scan key's sk_flags.
590 */
591 estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
592
593 /* Consider space required to store a datum of opclass input type */
594 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
595 if (attr->attbyval)
596 {
597 /* This index attribute stores pass-by-value datums */
598 Size estfixed = datumEstimateSpace((Datum) 0, false,
599 true, attr->attlen);
600
601 estnbtreeshared = add_size(estnbtreeshared, estfixed);
602 continue;
603 }
604
605 /*
606 * This index attribute stores pass-by-reference datums.
607 *
608 * Assume that serializing this array will use just as much space as a
609 * pass-by-value datum, in addition to space for the largest possible
610 * whole index tuple (this is not just a per-datum portion of the
611 * largest possible tuple because that'd be almost as large anyway).
612 *
613 * This is quite conservative, but it's not clear how we could do much
614 * better. The executor requires an up-front storage request size
615 * that reliably covers the scan's high watermark memory usage. We
616 * can't be sure of the real high watermark until the scan is over.
617 */
618 estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
619 estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
620 }
621
622 return estnbtreeshared;
623}
624
625/*
626 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
627 *
628 * Caller must have exclusively locked btscan->btps_lock when called.
629 */
630static void
631_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
632 BTScanOpaque so)
633{
634 char *datumshared;
635
636 /* Space for serialized datums begins immediately after btps_arrElems[] */
637 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
638 for (int i = 0; i < so->numArrayKeys; i++)
639 {
640 BTArrayKeyInfo *array = &so->arrayKeys[i];
641 ScanKey skey = &so->keyData[array->scan_key];
642
643 if (array->num_elems != -1)
644 {
645 /* Save SAOP array's cur_elem (no need to copy key/datum) */
646 Assert(!(skey->sk_flags & SK_BT_SKIP));
647 btscan->btps_arrElems[i] = array->cur_elem;
648 continue;
649 }
650
651 /* Save all mutable state associated with skip array's key */
652 Assert(skey->sk_flags & SK_BT_SKIP);
653 memcpy(datumshared, &skey->sk_flags, sizeof(int));
654 datumshared += sizeof(int);
655
656 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
657 {
658 /* No sk_argument datum to serialize */
659 Assert(skey->sk_argument == 0);
660 continue;
661 }
662
663 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
664 array->attbyval, array->attlen, &datumshared);
665 }
666}
667
668/*
669 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
670 *
671 * Caller must have exclusively locked btscan->btps_lock when called.
672 */
673static void
674_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
675 BTScanOpaque so)
676{
677 char *datumshared;
678
679 /* Space for serialized datums begins immediately after btps_arrElems[] */
680 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
681 for (int i = 0; i < so->numArrayKeys; i++)
682 {
683 BTArrayKeyInfo *array = &so->arrayKeys[i];
684 ScanKey skey = &so->keyData[array->scan_key];
685 bool isnull;
686
687 if (array->num_elems != -1)
688 {
689 /* Restore SAOP array using its saved cur_elem */
690 Assert(!(skey->sk_flags & SK_BT_SKIP));
691 array->cur_elem = btscan->btps_arrElems[i];
692 skey->sk_argument = array->elem_values[array->cur_elem];
693 continue;
694 }
695
696 /* Restore skip array by restoring its key directly */
697 if (!array->attbyval && skey->sk_argument)
698 pfree(DatumGetPointer(skey->sk_argument));
699 skey->sk_argument = (Datum) 0;
700 memcpy(&skey->sk_flags, datumshared, sizeof(int));
701 datumshared += sizeof(int);
702
703 Assert(skey->sk_flags & SK_BT_SKIP);
704
705 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
706 {
707 /* No sk_argument datum to restore */
708 continue;
709 }
710
711 skey->sk_argument = datumRestore(&datumshared, &isnull);
712 if (isnull)
713 {
714 Assert(skey->sk_argument == 0);
716 Assert(skey->sk_flags & SK_ISNULL);
717 }
718 }
719}
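/*
 * Minimal sketch (not part of the original file) of the datum.c primitives
 * used above, showing a serialize/restore round trip through a caller-owned
 * buffer.  The value, typbyval and typlen choices are made-up stand-ins.
 *
 *	Datum		val = Int64GetDatum(42);
 *	Size		len = datumEstimateSpace(val, false, true, sizeof(int64));
 *	char	   *buf = palloc(len);
 *	char	   *cursor = buf;
 *	bool		isnull;
 *
 *	datumSerialize(val, false, true, sizeof(int64), &cursor);
 *	cursor = buf;
 *	val = datumRestore(&cursor, &isnull);
 */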
720
721/*
722 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
723 */
724void
725btinitparallelscan(void *target)
726{
727 BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
728
729 LWLockInitialize(&bt_target->btps_lock,
730 LWTRANCHE_PARALLEL_BTREE_SCAN);
731 bt_target->btps_nextScanPage = InvalidBlockNumber;
732 bt_target->btps_lastCurrPage = InvalidBlockNumber;
733 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
734 ConditionVariableInit(&bt_target->btps_cv);
735}
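/*
 * Sketch (not from the original source) of how the btree-specific area is
 * found inside the generic parallel index scan descriptor, mirroring the
 * OffsetToPointer() calls used throughout this file:
 *
 *	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 *	BTParallelScanDesc btscan;
 *
 *	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
 *												  parallel_scan->ps_offset_am);
 */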
736
737/*
738 * btparallelrescan() -- reset parallel scan
739 */
740void
741btparallelrescan(IndexScanDesc scan)
742{
743 BTParallelScanDesc btscan;
744 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
745
746 Assert(parallel_scan);
747
748 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
749 parallel_scan->ps_offset_am);
750
751 /*
752 * In theory, we don't need to acquire the LWLock here, because there
753 * shouldn't be any other workers running at this point, but we do so for
754 * consistency.
755 */
756 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
757 btscan->btps_nextScanPage = InvalidBlockNumber;
758 btscan->btps_lastCurrPage = InvalidBlockNumber;
759 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
760 LWLockRelease(&btscan->btps_lock);
761}
762
763/*
764 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
765 * page. Other scans must wait until we call _bt_parallel_release()
766 * or _bt_parallel_done().
767 *
768 * The return value is true if we successfully seized the scan and false
769 * if we did not. The latter case occurs when no pages remain, or when
770 * another primitive index scan is scheduled that caller's backend cannot
771 * start just yet (only backends that call from _bt_first are capable of
772 * starting primitive index scans, which they indicate by passing first=true).
773 *
774 * If the return value is true, *next_scan_page returns the next page of the
775 * scan, and *last_curr_page returns the page that *next_scan_page came from.
776 * An invalid *next_scan_page means the scan hasn't yet started, or that
777 * caller needs to start the next primitive index scan (if it's the latter
778 * case we'll set so.needPrimScan).
779 *
780 * Callers should ignore the value of *next_scan_page and *last_curr_page if
781 * the return value is false.
782 */
783bool
784_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
785 BlockNumber *last_curr_page, bool first)
786{
787 Relation rel = scan->indexRelation;
788 BTScanOpaque so = (BTScanOpaque) scan->opaque;
789 bool exit_loop = false,
790 status = true,
791 endscan = false;
792 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
793 BTParallelScanDesc btscan;
794
795 *next_scan_page = InvalidBlockNumber;
796 *last_curr_page = InvalidBlockNumber;
797
798 /*
799 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
800 * call to _bt_readnextpage treats this backend similarly to a serial
801 * backend that steps from *last_curr_page to *next_scan_page (unless this
802 * backend's so->currPos is initialized by _bt_readfirstpage before then).
803 */
804 BTScanPosInvalidate(so->currPos);
805 so->currPos.moreLeft = so->currPos.moreRight = true;
806
807 if (first)
808 {
809 /*
810 * Initialize array related state when called from _bt_first, assuming
811 * that this will be the first primitive index scan for the scan
812 */
813 so->needPrimScan = false;
814 so->scanBehind = false;
815 so->oppositeDirCheck = false;
816 }
817 else
818 {
819 /*
820 * Don't attempt to seize the scan when it requires another primitive
821 * index scan, since caller's backend cannot start it right now
822 */
823 if (so->needPrimScan)
824 return false;
825 }
826
827 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
828 parallel_scan->ps_offset_am);
829
830 while (1)
831 {
832 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
833
834 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
835 {
836 /* We're done with this parallel index scan */
837 status = false;
838 }
839 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
840 btscan->btps_nextScanPage == P_NONE)
841 {
842 /* End this parallel index scan */
843 status = false;
844 endscan = true;
845 }
846 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
847 {
848 Assert(so->numArrayKeys);
849
850 if (first)
851 {
852 /* Can start scheduled primitive scan right away, so do so */
853 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
854
855 /* Restore scan's array keys from serialized values */
856 _bt_parallel_restore_arrays(rel, btscan, so);
857 exit_loop = true;
858 }
859 else
860 {
861 /*
862 * Don't attempt to seize the scan when it requires another
863 * primitive index scan, since caller's backend cannot start
864 * it right now
865 */
866 status = false;
867 }
868
869 /*
870 * Either way, update backend local state to indicate that a
871 * pending primitive scan is required
872 */
873 so->needPrimScan = true;
874 so->scanBehind = false;
875 so->oppositeDirCheck = false;
876 }
877 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
878 {
879 /*
880 * We have successfully seized control of the scan for the purpose
881 * of advancing it to a new page!
882 */
883 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
884 Assert(btscan->btps_nextScanPage != P_NONE);
885 *next_scan_page = btscan->btps_nextScanPage;
886 *last_curr_page = btscan->btps_lastCurrPage;
887 exit_loop = true;
888 }
889 LWLockRelease(&btscan->btps_lock);
890 if (exit_loop || !status)
891 break;
892 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
893 }
894 ConditionVariableCancelSleep();
895
896 /* When the scan has reached the rightmost (or leftmost) page, end it */
897 if (endscan)
898 _bt_parallel_done(scan);
899
900 return status;
901}
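/*
 * Sketch (not in the original file) of the seize/release cycle a worker goes
 * through while consuming pages of a parallel scan; _bt_readnextpage() is the
 * real consumer of this protocol.  next_link stands for the sibling pointer
 * read from the page just processed; error handling is omitted.
 *
 *	BlockNumber next_scan_page;
 *	BlockNumber last_curr_page;
 *
 *	while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
 *	{
 *		// read next_scan_page, process its items, note its sibling link
 *		_bt_parallel_release(scan, next_link, next_scan_page);
 *	}
 */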
902
903/*
904 * _bt_parallel_release() -- Complete the process of advancing the scan to a
905 * new page. We now have the new value btps_nextScanPage; another backend
906 * can now begin advancing the scan.
907 *
908 * Callers whose scan uses array keys must save their curr_page argument so
909 * that it can be passed to _bt_parallel_primscan_schedule, should caller
910 * determine that another primitive index scan is required.
911 *
912 * If caller's next_scan_page is P_NONE, the scan has reached the index's
913 * rightmost/leftmost page. This is treated as reaching the end of the scan
914 * within _bt_parallel_seize.
915 *
916 * Note: unlike the serial case, parallel scans don't need to remember both
917 * sibling links. next_scan_page is whichever link is next given the scan's
918 * direction. That's all we'll ever need, since the direction of a parallel
919 * scan can never change.
920 */
921void
922_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
923 BlockNumber curr_page)
924{
925 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
926 BTParallelScanDesc btscan;
927
928 Assert(BlockNumberIsValid(next_scan_page));
929
930 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
931 parallel_scan->ps_offset_am);
932
933 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
934 btscan->btps_nextScanPage = next_scan_page;
935 btscan->btps_lastCurrPage = curr_page;
936 btscan->btps_pageStatus = BTPARALLEL_IDLE;
937 LWLockRelease(&btscan->btps_lock);
938 ConditionVariableSignal(&btscan->btps_cv);
939}
940
941/*
942 * _bt_parallel_done() -- Mark the parallel scan as complete.
943 *
944 * When there are no pages left to scan, this function should be called to
945 * notify other workers. Otherwise, they might wait forever for the scan to
946 * advance to the next page.
947 */
948void
949_bt_parallel_done(IndexScanDesc scan)
950{
951 BTScanOpaque so = (BTScanOpaque) scan->opaque;
952 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
953 BTParallelScanDesc btscan;
954 bool status_changed = false;
955
957
958 /* Do nothing, for non-parallel scans */
959 if (parallel_scan == NULL)
960 return;
961
962 /*
963 * Should not mark parallel scan done when there's still a pending
964 * primitive index scan
965 */
966 if (so->needPrimScan)
967 return;
968
969 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
970 parallel_scan->ps_offset_am);
971
972 /*
973 * Mark the parallel scan as done, unless some other process did so
974 * already
975 */
976 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
977 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
978 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
979 {
980 btscan->btps_pageStatus = BTPARALLEL_DONE;
981 status_changed = true;
982 }
983 LWLockRelease(&btscan->btps_lock);
984
985 /* wake up all the workers associated with this parallel scan */
986 if (status_changed)
987 ConditionVariableBroadcast(&btscan->btps_cv);
988}
989
990/*
991 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
992 *
993 * Caller passes the curr_page most recently passed to _bt_parallel_release
994 * by its backend. Caller successfully schedules the next primitive index scan
995 * if the shared parallel state hasn't been seized since caller's backend last
996 * advanced the scan.
997 */
998void
999_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1000{
1001 Relation rel = scan->indexRelation;
1002 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1003 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1004 BTParallelScanDesc btscan;
1005
1006 Assert(so->numArrayKeys);
1007
1008 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1009 parallel_scan->ps_offset_am);
1010
1011 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1012 if (btscan->btps_lastCurrPage == curr_page &&
1013 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1014 {
1015 btscan->btps_nextScanPage = InvalidBlockNumber;
1016 btscan->btps_lastCurrPage = InvalidBlockNumber;
1017 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1018
1019 /* Serialize scan's current array keys */
1020 _bt_parallel_serialize_arrays(rel, btscan, so);
1021 }
1022 LWLockRelease(&btscan->btps_lock);
1023}
1024
1025/*
1026 * Bulk deletion of all index entries pointing to a set of heap tuples.
1027 * The set of target tuples is specified via a callback routine that tells
1028 * whether any given heap tuple (identified by ItemPointer) is being deleted.
1029 *
1030 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1031 */
1032IndexBulkDeleteResult *
1033btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1034 IndexBulkDeleteCallback callback, void *callback_state)
1035{
1036 Relation rel = info->index;
1037 BTCycleId cycleid;
1038
1039 /* allocate stats if first time through, else re-use existing struct */
1040 if (stats == NULL)
1041 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1042
1043 /* Establish the vacuum cycle ID to use for this scan */
1044 /* The ENSURE stuff ensures we clean up shared memory on failure */
1045 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1046 {
1047 cycleid = _bt_start_vacuum(rel);
1048
1049 btvacuumscan(info, stats, callback, callback_state, cycleid);
1050 }
1051 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1052 _bt_end_vacuum(rel);
1053
1054 return stats;
1055}
1056
1057/*
1058 * Post-VACUUM cleanup.
1059 *
1060 * Result: a palloc'd struct containing statistical info for VACUUM displays.
1061 */
1062IndexBulkDeleteResult *
1063btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1064{
1065 BlockNumber num_delpages;
1066
1067 /* No-op in ANALYZE ONLY mode */
1068 if (info->analyze_only)
1069 return stats;
1070
1071 /*
1072 * If btbulkdelete was called, we need not do anything (we just maintain
1073 * the information used within _bt_vacuum_needs_cleanup() by calling
1074 * _bt_set_cleanup_info() below).
1075 *
1076 * If btbulkdelete was _not_ called, then we have a choice to make: we
1077 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1078 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1079 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1080 * now.
1081 */
1082 if (stats == NULL)
1083 {
1084 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1085 if (!_bt_vacuum_needs_cleanup(info->index))
1086 return NULL;
1087
1088 /*
1089 * Since we aren't going to actually delete any leaf items, there's no
1090 * need to go through all the vacuum-cycle-ID pushups here.
1091 *
1092 * Posting list tuples are a source of inaccuracy for cleanup-only
1093 * scans. btvacuumscan() will assume that the number of index tuples
1094 * from each page can be used as num_index_tuples, even though
1095 * num_index_tuples is supposed to represent the number of TIDs in the
1096 * index. This naive approach can underestimate the number of tuples
1097 * in the index significantly.
1098 *
1099 * We handle the problem by making num_index_tuples an estimate in
1100 * cleanup-only case.
1101 */
1102 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1103 btvacuumscan(info, stats, NULL, NULL, 0);
1104 stats->estimated_count = true;
1105 }
1106
1107 /*
1108 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1109 *
1110 * num_delpages is the number of deleted pages now in the index that were
1111 * not safe to place in the FSM to be recycled just yet. num_delpages is
1112 * greater than 0 only when _bt_pagedel() actually deleted pages during
1113 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1114 * have failed to place any newly deleted pages in the FSM just moments
1115 * ago. (Actually, there are edge cases where recycling of the current
1116 * VACUUM's newly deleted pages does not even become safe by the time the
1117 * next VACUUM comes around. See nbtree/README.)
1118 */
1119 Assert(stats->pages_deleted >= stats->pages_free);
1120 num_delpages = stats->pages_deleted - stats->pages_free;
1121 _bt_set_cleanup_info(info->index, num_delpages);
1122
1123 /*
1124 * It's quite possible for us to be fooled by concurrent page splits into
1125 * double-counting some index tuples, so disbelieve any total that exceeds
1126 * the underlying heap's count ... if we know that accurately. Otherwise
1127 * this might just make matters worse.
1128 */
1129 if (!info->estimated_count)
1130 {
1131 if (stats->num_index_tuples > info->num_heap_tuples)
1132 stats->num_index_tuples = info->num_heap_tuples;
1133 }
1134
1135 return stats;
1136}
1137
1138/*
1139 * btvacuumscan --- scan the index for VACUUMing purposes
1140 *
1141 * This combines the functions of looking for leaf tuples that are deletable
1142 * according to the vacuum callback, looking for empty pages that can be
1143 * deleted, and looking for old deleted pages that can be recycled. Both
1144 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1145 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1146 *
1147 * The caller is responsible for initially allocating/zeroing a stats struct
1148 * and for obtaining a vacuum cycle ID if necessary.
1149 */
1150static void
1151btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1152 IndexBulkDeleteCallback callback, void *callback_state,
1153 BTCycleId cycleid)
1154{
1155 Relation rel = info->index;
1156 BTVacState vstate;
1157 BlockNumber num_pages;
1158 bool needLock;
1159 BlockRangeReadStreamPrivate p;
1160 ReadStream *stream = NULL;
1161
1162 /*
1163 * Reset fields that track information about the entire index now. This
1164 * avoids double-counting in the case where a single VACUUM command
1165 * requires multiple scans of the index.
1166 *
1167 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1168 * since they track information about the VACUUM command, and so must last
1169 * across each call to btvacuumscan().
1170 *
1171 * (Note that pages_free is treated as state about the whole index, not
1172 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1173 * calls are idempotent, and get repeated for the same deleted pages in
1174 * some scenarios. The point for us is to track the number of recyclable
1175 * pages in the index at the end of the VACUUM command.)
1176 */
1177 stats->num_pages = 0;
1178 stats->num_index_tuples = 0;
1179 stats->pages_deleted = 0;
1180 stats->pages_free = 0;
1181
1182 /* Set up info to pass down to btvacuumpage */
1183 vstate.info = info;
1184 vstate.stats = stats;
1185 vstate.callback = callback;
1186 vstate.callback_state = callback_state;
1187 vstate.cycleid = cycleid;
1188
1189 /* Create a temporary memory context to run _bt_pagedel in */
1190 vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1191 "_bt_pagedel",
1192 ALLOCSET_DEFAULT_SIZES);
1193
1194 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1195 vstate.bufsize = 0;
1196 vstate.maxbufsize = 0;
1197 vstate.pendingpages = NULL;
1198 vstate.npendingpages = 0;
1199 /* Consider applying _bt_pendingfsm_finalize optimization */
1200 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1201
1202 /*
1203 * The outer loop iterates over all index pages except the metapage, in
1204 * physical order (we hope the kernel will cooperate in providing
1205 * read-ahead for speed). It is critical that we visit all leaf pages,
1206 * including ones added after we start the scan, else we might fail to
1207 * delete some deletable tuples. Hence, we must repeatedly check the
1208 * relation length. We must acquire the relation-extension lock while
1209 * doing so to avoid a race condition: if someone else is extending the
1210 * relation, there is a window where bufmgr/smgr have created a new
1211 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1212 * we manage to scan such a page here, we'll improperly assume it can be
1213 * recycled. Taking the lock synchronizes things enough to prevent a
1214 * problem: either num_pages won't include the new page, or _bt_getbuf
1215 * already has write lock on the buffer and it will be fully initialized
1216 * before we can examine it. Also, we need not worry if a page is added
1217 * immediately after we look; the page splitting code already has
1218 * write-lock on the left page before it adds a right page, so we must
1219 * already have processed any tuples due to be moved into such a page.
1220 *
1221 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1222 * think the use of the extension lock is still required.
1223 *
1224 * We can skip locking for new or temp relations, however, since no one
1225 * else could be accessing them.
1226 */
1227 needLock = !RELATION_IS_LOCAL(rel);
1228
1229 p.current_blocknum = BTREE_METAPAGE + 1;
1230
1231 /*
1232 * It is safe to use batchmode as block_range_read_stream_cb takes no
1233 * locks.
1234 */
1235 stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1236 READ_STREAM_FULL | READ_STREAM_USE_BATCHING,
1237 info->strategy,
1238 rel,
1239 MAIN_FORKNUM,
1240 block_range_read_stream_cb,
1241 &p,
1242 0);
1243 for (;;)
1244 {
1245 /* Get the current relation length */
1246 if (needLock)
1247 LockRelationForExtension(rel, ExclusiveLock);
1248 num_pages = RelationGetNumberOfBlocks(rel);
1249 if (needLock)
1250 UnlockRelationForExtension(rel, ExclusiveLock);
1251
1252 if (info->report_progress)
1253 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1254 num_pages);
1255
1256 /* Quit if we've scanned the whole relation */
1257 if (p.current_blocknum >= num_pages)
1258 break;
1259
1260 p.last_exclusive = num_pages;
1261
1262 /* Iterate over pages, then loop back to recheck relation length */
1263 while (true)
1264 {
1265 BlockNumber current_block;
1266 Buffer buf;
1267
1268 /* call vacuum_delay_point while not holding any buffer lock */
1269 vacuum_delay_point(false);
1270
1271 buf = read_stream_next_buffer(stream, NULL);
1272
1273 if (!BufferIsValid(buf))
1274 break;
1275
1276 current_block = btvacuumpage(&vstate, buf);
1277
1278 if (info->report_progress)
1279 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1280 current_block);
1281 }
1282
1283 /*
1284 * We have to reset the read stream to use it again. After returning
1285 * InvalidBuffer, the read stream API won't invoke our callback again
1286 * until the stream has been reset.
1287 */
1288 read_stream_reset(stream);
1289 }
1290
1291 read_stream_end(stream);
1292
1293 /* Set statistics num_pages field to final size of index */
1294 stats->num_pages = num_pages;
1295
1296 MemoryContextDelete(vstate.pagedelcontext);
1297
1298 /*
1299 * If there were any calls to _bt_pagedel() during scan of the index then
1300 * see if any of the resulting pages can be placed in the FSM now. When
1301 * it's not safe we'll have to leave it up to a future VACUUM operation.
1302 *
1303 * Finally, if we placed any pages in the FSM (either just now or during
1304 * the scan), forcibly update the upper-level FSM pages to ensure that
1305 * searchers can find them.
1306 */
1307 _bt_pendingfsm_finalize(rel, &vstate);
1308 if (stats->pages_free > 0)
1309 IndexFreeSpaceMapVacuum(rel);
1310}
1311
1312/*
1313 * btvacuumpage --- VACUUM one page
1314 *
1315 * This processes a single page for btvacuumscan(). In some cases we must
1316 * backtrack to re-examine and VACUUM pages that were on buf's page during
1317 * a previous call here. This is how we handle page splits (that happened
1318 * after our cycleid was acquired) whose right half page happened to reuse
1319 * a block that we might have processed at some point before it was
1320 * recycled (i.e. before the page split).
1321 *
1322 * Returns BlockNumber of a scanned page (not backtracked).
1323 */
1324static BlockNumber
1325btvacuumpage(BTVacState *vstate, Buffer buf)
1326{
1327 IndexVacuumInfo *info = vstate->info;
1328 IndexBulkDeleteResult *stats = vstate->stats;
1329 IndexBulkDeleteCallback callback = vstate->callback;
1330 void *callback_state = vstate->callback_state;
1331 Relation rel = info->index;
1332 Relation heaprel = info->heaprel;
1333 bool attempt_pagedel;
1334 BlockNumber blkno,
1335 backtrack_to;
1336 BlockNumber scanblkno = BufferGetBlockNumber(buf);
1337 Page page;
1338 BTPageOpaque opaque;
1339
1340 blkno = scanblkno;
1341
1342backtrack:
1343
1344 attempt_pagedel = false;
1345 backtrack_to = P_NONE;
1346
1347 _bt_lockbuf(rel, buf, BT_READ);
1348 page = BufferGetPage(buf);
1349 opaque = NULL;
1350 if (!PageIsNew(page))
1351 {
1352 _bt_checkpage(rel, buf);
1353 opaque = BTPageGetOpaque(page);
1354 }
1355
1356 Assert(blkno <= scanblkno);
1357 if (blkno != scanblkno)
1358 {
1359 /*
1360 * We're backtracking.
1361 *
1362 * We followed a right link to a sibling leaf page (a page that
1363 * happens to be from a block located before scanblkno). The only
1364 * case we want to do anything with is a live leaf page having the
1365 * current vacuum cycle ID.
1366 *
1367 * The page had better be in a state that's consistent with what we
1368 * expect. Check for conditions that imply corruption in passing. It
1369 * can't be half-dead because only an interrupted VACUUM process can
1370 * leave pages in that state, so we'd definitely have dealt with it
1371 * back when the page was the scanblkno page (half-dead pages are
1372 * always marked fully deleted by _bt_pagedel(), barring corruption).
1373 */
1374 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1375 {
1376 Assert(false);
1377 ereport(LOG,
1378 (errcode(ERRCODE_INDEX_CORRUPTED),
1379 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1380 blkno, scanblkno, RelationGetRelationName(rel))));
1381 _bt_relbuf(rel, buf);
1382 return scanblkno;
1383 }
1384
1385 /*
1386 * We may have already processed the page in an earlier call, when the
1387 * page was scanblkno. This happens when the leaf page split occurred
1388 * after the scan began, but before the right sibling page became the
1389 * scanblkno.
1390 *
1391 * Page may also have been deleted by current btvacuumpage() call,
1392 * since _bt_pagedel() sometimes deletes the right sibling page of
1393 * scanblkno in passing (it does so after we decided where to
1394 * backtrack to). We don't need to process this page as a deleted
1395 * page a second time now (in fact, it would be wrong to count it as a
1396 * deleted page in the bulk delete statistics a second time).
1397 */
1398 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1399 {
1400 /* Done with current scanblkno (and all lower split pages) */
1401 _bt_relbuf(rel, buf);
1402 return scanblkno;
1403 }
1404 }
1405
1406 if (!opaque || BTPageIsRecyclable(page, heaprel))
1407 {
1408 /* Okay to recycle this page (which could be leaf or internal) */
1409 RecordFreeIndexPage(rel, blkno);
1410 stats->pages_deleted++;
1411 stats->pages_free++;
1412 }
1413 else if (P_ISDELETED(opaque))
1414 {
1415 /*
1416 * Already deleted page (which could be leaf or internal). Can't
1417 * recycle yet.
1418 */
1419 stats->pages_deleted++;
1420 }
1421 else if (P_ISHALFDEAD(opaque))
1422 {
1423 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1424 attempt_pagedel = true;
1425
1426 /*
1427 * _bt_pagedel() will increment both pages_newly_deleted and
1428 * pages_deleted stats in all cases (barring corruption)
1429 */
1430 }
1431 else if (P_ISLEAF(opaque))
1432 {
1433 OffsetNumber deletable[MaxIndexTuplesPerPage];
1434 int ndeletable;
1435 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1436 int nupdatable;
1437 OffsetNumber offnum,
1438 minoff,
1439 maxoff;
1440 int nhtidsdead,
1441 nhtidslive;
1442
1443 /*
1444 * Trade in the initial read lock for a full cleanup lock on this
1445 * page. We must get such a lock on every leaf page over the course
1446 * of the vacuum scan, whether or not it actually contains any
1447 * deletable tuples --- see nbtree/README.
1448 */
1449 _bt_upgradelockbufcleanup(rel, buf);
1450
1451 /*
1452 * Check whether we need to backtrack to earlier pages. What we are
1453 * concerned about is a page split that happened since we started the
1454 * vacuum scan. If the split moved tuples on the right half of the
1455 * split (i.e. the tuples that sort high) to a block that we already
1456 * passed over, then we might have missed the tuples. We need to
1457 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1458 * or deleting scanblkno page below!)
1459 */
1460 if (vstate->cycleid != 0 &&
1461 opaque->btpo_cycleid == vstate->cycleid &&
1462 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1463 !P_RIGHTMOST(opaque) &&
1464 opaque->btpo_next < scanblkno)
1465 backtrack_to = opaque->btpo_next;
1466
1467 ndeletable = 0;
1468 nupdatable = 0;
1469 minoff = P_FIRSTDATAKEY(opaque);
1470 maxoff = PageGetMaxOffsetNumber(page);
1471 nhtidsdead = 0;
1472 nhtidslive = 0;
1473 if (callback)
1474 {
1475 /* btbulkdelete callback tells us what to delete (or update) */
1476 for (offnum = minoff;
1477 offnum <= maxoff;
1478 offnum = OffsetNumberNext(offnum))
1479 {
1480 IndexTuple itup;
1481
1482 itup = (IndexTuple) PageGetItem(page,
1483 PageGetItemId(page, offnum));
1484
1485 Assert(!BTreeTupleIsPivot(itup));
1486 if (!BTreeTupleIsPosting(itup))
1487 {
1488 /* Regular tuple, standard table TID representation */
1489 if (callback(&itup->t_tid, callback_state))
1490 {
1491 deletable[ndeletable++] = offnum;
1492 nhtidsdead++;
1493 }
1494 else
1495 nhtidslive++;
1496 }
1497 else
1498 {
1499 BTVacuumPosting vacposting;
1500 int nremaining;
1501
1502 /* Posting list tuple */
1503 vacposting = btreevacuumposting(vstate, itup, offnum,
1504 &nremaining);
1505 if (vacposting == NULL)
1506 {
1507 /*
1508 * All table TIDs from the posting tuple remain, so no
1509 * delete or update required
1510 */
1511 Assert(nremaining == BTreeTupleGetNPosting(itup));
1512 }
1513 else if (nremaining > 0)
1514 {
1515
1516 /*
1517 * Store metadata about posting list tuple in
1518 * updatable array for entire page. Existing tuple
1519 * will be updated during the later call to
1520 * _bt_delitems_vacuum().
1521 */
1522 Assert(nremaining < BTreeTupleGetNPosting(itup));
1523 updatable[nupdatable++] = vacposting;
1524 nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1525 }
1526 else
1527 {
1528 /*
1529 * All table TIDs from the posting list must be
1530 * deleted. We'll delete the index tuple completely
1531 * (no update required).
1532 */
1533 Assert(nremaining == 0);
1534 deletable[ndeletable++] = offnum;
1535 nhtidsdead += BTreeTupleGetNPosting(itup);
1536 pfree(vacposting);
1537 }
1538
1539 nhtidslive += nremaining;
1540 }
1541 }
1542 }
1543
1544 /*
1545 * Apply any needed deletes or updates. We issue just one
1546 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1547 */
1548 if (ndeletable > 0 || nupdatable > 0)
1549 {
1550 Assert(nhtidsdead >= ndeletable + nupdatable);
1551 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1552 nupdatable);
1553
1554 stats->tuples_removed += nhtidsdead;
1555 /* must recompute maxoff */
1556 maxoff = PageGetMaxOffsetNumber(page);
1557
1558 /* can't leak memory here */
1559 for (int i = 0; i < nupdatable; i++)
1560 pfree(updatable[i]);
1561 }
1562 else
1563 {
1564 /*
1565 * If the leaf page has been split during this vacuum cycle, it
1566 * seems worth expending a write to clear btpo_cycleid even if we
1567 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1568 * takes care of this.) This ensures we won't process the page
1569 * again.
1570 *
1571 * We treat this like a hint-bit update because there's no need to
1572 * WAL-log it.
1573 */
1574 Assert(nhtidsdead == 0);
1575 if (vstate->cycleid != 0 &&
1576 opaque->btpo_cycleid == vstate->cycleid)
1577 {
1578 opaque->btpo_cycleid = 0;
1579 MarkBufferDirtyHint(buf, true);
1580 }
1581 }
1582
1583 /*
1584 * If the leaf page is now empty, try to delete it; else count the
1585 * live tuples (live table TIDs in posting lists are counted as
1586 * separate live tuples). We don't delete when backtracking, though,
1587 * since that would require teaching _bt_pagedel() about backtracking
1588 * (doesn't seem worth adding more complexity to deal with that).
1589 *
1590 * We don't count the number of live TIDs during cleanup-only calls to
1591 * btvacuumscan (i.e. when callback is not set). We count the number
1592 * of index tuples directly instead. This avoids the expense of
1593 * directly examining all of the tuples on each page. VACUUM will
1594 * treat num_index_tuples as an estimate in cleanup-only case, so it
1595 * doesn't matter that this underestimates num_index_tuples
1596 * significantly in some cases.
1597 */
1598 if (minoff > maxoff)
1599 attempt_pagedel = (blkno == scanblkno);
1600 else if (callback)
1601 stats->num_index_tuples += nhtidslive;
1602 else
1603 stats->num_index_tuples += maxoff - minoff + 1;
1604
1605 Assert(!attempt_pagedel || nhtidslive == 0);
1606 }
1607
1608 if (attempt_pagedel)
1609 {
1610 MemoryContext oldcontext;
1611
1612 /* Run pagedel in a temp context to avoid memory leakage */
1613 MemoryContextReset(vstate->pagedelcontext);
1614 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1615
1616 /*
1617 * _bt_pagedel maintains the bulk delete stats on our behalf;
1618 * pages_newly_deleted and pages_deleted are likely to be incremented
1619 * during call
1620 */
1621 Assert(blkno == scanblkno);
1622 _bt_pagedel(rel, buf, vstate);
1623
1624 MemoryContextSwitchTo(oldcontext);
1625 /* pagedel released buffer, so we shouldn't */
1626 }
1627 else
1628 _bt_relbuf(rel, buf);
1629
1630 if (backtrack_to != P_NONE)
1631 {
1632 blkno = backtrack_to;
1633
1634 /* check for vacuum delay while not holding any buffer lock */
1635 vacuum_delay_point(false);
1636
1637 /*
1638 * We can't use _bt_getbuf() here because it always applies
1639 * _bt_checkpage(), which will barf on an all-zero page. We want to
1640 * recycle all-zero pages, not fail. Also, we want to use a
1641 * nondefault buffer access strategy.
1642 */
1643 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1644 info->strategy);
1645 goto backtrack;
1646 }
1647
1648 return scanblkno;
1649}
1650
1651/*
1652 * btreevacuumposting --- determine TIDs still needed in posting list
1653 *
1654 * Returns metadata describing how to build replacement tuple without the TIDs
1655 * that VACUUM needs to delete. Returned value is NULL in the common case
1656 * where no changes are needed to caller's posting list tuple (we avoid
1657 * allocating memory here as an optimization).
1658 *
1659 * The number of TIDs that should remain in the posting list tuple is set for
1660 * caller in *nremaining.
1661 */
1662static BTVacuumPosting
1663btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1664 OffsetNumber updatedoffset, int *nremaining)
1665{
1666 int live = 0;
1667 int nitem = BTreeTupleGetNPosting(posting);
1668 ItemPointer items = BTreeTupleGetPosting(posting);
1669 BTVacuumPosting vacposting = NULL;
1670
1671 for (int i = 0; i < nitem; i++)
1672 {
1673 if (!vstate->callback(items + i, vstate->callback_state))
1674 {
1675 /* Live table TID */
1676 live++;
1677 }
1678 else if (vacposting == NULL)
1679 {
1680 /*
1681 * First dead table TID encountered.
1682 *
1683 * It's now clear that we need to delete one or more dead table
1684 * TIDs, so start maintaining metadata describing how to update
1685 * existing posting list tuple.
1686 */
1687 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1688 nitem * sizeof(uint16));
1689
1690 vacposting->itup = posting;
1691 vacposting->updatedoffset = updatedoffset;
1692 vacposting->ndeletedtids = 0;
1693 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1694 }
1695 else
1696 {
1697 /* Second or subsequent dead table TID */
1698 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1699 }
1700 }
1701
1702 *nremaining = live;
1703 return vacposting;
1704}
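/*
 * Sketch (not part of the original source) of how a caller consumes the
 * result: the deletetids[] offsets index into the original posting list, and
 * _bt_delitems_vacuum() uses them to build the replacement tuple.  A debug
 * loop over the returned metadata might look like this:
 *
 *	if (vacposting != NULL)
 *	{
 *		for (uint16 j = 0; j < vacposting->ndeletedtids; j++)
 *			elog(DEBUG2, "posting at offset %u: delete TID at position %u",
 *				 vacposting->updatedoffset, vacposting->deletetids[j]);
 *	}
 */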
1705
1706/*
1707 * btcanreturn() -- Check whether btree indexes support index-only scans.
1708 *
1709 * btrees always do, so this is trivial.
1710 */
1711bool
1712btcanreturn(Relation index, int attno)
1713{
1714 return true;
1715}
1716
1717/*
1718 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1719 */
1720int
1721btgettreeheight(Relation rel)
1722{
1723 return _bt_getrootheight(rel);
1724}
1725
1726CompareType
1727bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1728{
1729 switch (strategy)
1730 {
1731 case BTLessStrategyNumber:
1732 return COMPARE_LT;
1733 case BTLessEqualStrategyNumber:
1734 return COMPARE_LE;
1735 case BTEqualStrategyNumber:
1736 return COMPARE_EQ;
1737 case BTGreaterEqualStrategyNumber:
1738 return COMPARE_GE;
1739 case BTGreaterStrategyNumber:
1740 return COMPARE_GT;
1741 default:
1742 return COMPARE_INVALID;
1743 }
1744}
1745
1746StrategyNumber
1747bttranslatecmptype(CompareType cmptype, Oid opfamily)
1748{
1749 switch (cmptype)
1750 {
1751 case COMPARE_LT:
1752 return BTLessStrategyNumber;
1753 case COMPARE_LE:
1754 return BTLessEqualStrategyNumber;
1755 case COMPARE_EQ:
1756 return BTEqualStrategyNumber;
1757 case COMPARE_GE:
1758 return BTGreaterEqualStrategyNumber;
1759 case COMPARE_GT:
1760 return BTGreaterStrategyNumber;
1761 default:
1762 return InvalidStrategy;
1763 }
1764}
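/*
 * Sketch (not in the original file): the two translation functions above are
 * inverses for the strategies btree actually implements, e.g.:
 *
 *	Assert(bttranslatecmptype(bttranslatestrategy(BTLessStrategyNumber,
 *												  opfamily),
 *							  opfamily) == BTLessStrategyNumber);
 *
 * (The opfamily argument is unused by both functions; any valid btree
 * opfamily OID works here.)
 */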
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static Datum values[MAXATTR]
Definition: bootstrap.c:151
int Buffer
Definition: buf.h:23
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5335
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4161
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5367
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:798
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:280
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:414
@ RBM_NORMAL
Definition: bufmgr.h:46
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:365
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
PageData * Page
Definition: bufpage.h:82
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
#define OffsetToPointer(base, offset)
Definition: c.h:743
int64_t int64
Definition: c.h:499
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:434
int16_t int16
Definition: c.h:497
uint16_t uint16
Definition: c.h:501
size_t Size
Definition: c.h:576
CompareType
Definition: cmptype.h:32
@ COMPARE_LE
Definition: cmptype.h:35
@ COMPARE_INVALID
Definition: cmptype.h:33
@ COMPARE_GT
Definition: cmptype.h:38
@ COMPARE_EQ
Definition: cmptype.h:36
@ COMPARE_GE
Definition: cmptype.h:37
@ COMPARE_LT
Definition: cmptype.h:34
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
Datum datumRestore(char **start_address, bool *isnull)
Definition: datum.c:521
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition: datum.c:459
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition: datum.c:412
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errcode(int sqlerrcode)
Definition: elog.c:854
#define LOG
Definition: elog.h:31
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:110
IndexUniqueCheck
Definition: genam.h:139
Assert(PointerIsAligned(start, uint64))
void IndexFreeSpaceMapVacuum(Relation rel)
Definition: indexfsm.c:71
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
IndexTupleData * IndexTuple
Definition: itup.h:53
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:721
@ LWTRANCHE_PARALLEL_BTREE_SCAN
Definition: lwlock.h:197
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:414
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc0(Size size)
Definition: mcxt.c:1970
void * palloc(Size size)
Definition: mcxt.c:1940
MemoryContext CurrentMemoryContext
Definition: mcxt.c:159
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:485
#define AllocSetContextCreate
Definition: memutils.h:149
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:180
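
Page deletion during vacuum runs in a throwaway memory context that is reset after each page and deleted once the scan is over; the same allocation primitives back the scan opaque's array context. A generic sketch of that reset-per-iteration pattern (the context name is illustrative):

MemoryContext pagedelcontext;
MemoryContext oldcontext;

pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                       "_bt_pagedel",
                                       ALLOCSET_DEFAULT_SIZES);

/* for each page that needs deletion */
oldcontext = MemoryContextSwitchTo(pagedelcontext);
/* ... _bt_pagedel() and friends allocate freely here ... */
MemoryContextSwitchTo(oldcontext);
MemoryContextReset(pagedelcontext);

/* once the scan is over */
MemoryContextDelete(pagedelcontext);
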
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:102
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1802
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1154
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1109
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:2996
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2954
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:999
bool btcanreturn(Relation index, int attno)
Definition: nbtree.c:1712
BTPS_State
Definition: nbtree.c:55
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:58
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:57
@ BTPARALLEL_NOT_INITIALIZED
Definition: nbtree.c:56
@ BTPARALLEL_IDLE
Definition: nbtree.c:59
@ BTPARALLEL_DONE
Definition: nbtree.c:60
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:784
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition: nbtree.c:1747
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:332
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition: nbtree.c:1325
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:558
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:949
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: nbtree.c:1063
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition: nbtree.c:1663
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition: nbtree.c:1727
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: nbtree.c:226
void btparallelrescan(IndexScanDesc scan)
Definition: nbtree.c:741
struct BTParallelScanDescData BTParallelScanDescData
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: nbtree.c:202
void btbuildempty(Relation index)
Definition: nbtree.c:179
int btgettreeheight(Relation rel)
Definition: nbtree.c:1721
void btinitparallelscan(void *target)
Definition: nbtree.c:725
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: nbtree.c:1033
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:1151
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:631
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: nbtree.c:286
void btmarkpos(IndexScanDesc scan)
Definition: nbtree.c:474
void btendscan(IndexScanDesc scan)
Definition: nbtree.c:438
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: nbtree.c:381
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:93
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:922
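
Seize, release, and done form the worker-facing protocol for advancing a parallel scan: one backend seizes the scan, reads the page named by next_scan_page, publishes that page's right sibling via release, and eventually marks the whole scan done. A condensed, hedged sketch of a single forward reader step; rel and scan are assumed in scope, error handling and the primitive-scan cases are omitted, and _bt_getbuf()/BufferGetPage() are assumed from nbtpage.c/bufmgr.h rather than the list above:

Buffer       buf;
Page         page;
BTPageOpaque opaque;
BlockNumber  next_scan_page;
BlockNumber  last_curr_page;

if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
    return false;                       /* some backend already ended the scan */

buf = _bt_getbuf(rel, next_scan_page, BT_READ);
page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);

/* publish the right sibling so other workers can move on immediately */
_bt_parallel_release(scan, opaque->btpo_next, next_scan_page);

/* ... process the page's items ... */

if (P_RIGHTMOST(opaque))
    _bt_parallel_done(scan);            /* no further pages: end the shared scan */
_bt_relbuf(rel, buf);
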
Datum bthandler(PG_FUNCTION_ARGS)
Definition: nbtree.c:115
void btrestrpos(IndexScanDesc scan)
Definition: nbtree.c:500
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:674
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define SK_BT_SKIP
Definition: nbtree.h:1136
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538
#define BTNProcs
Definition: nbtree.h:723
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
uint16 BTCycleId
Definition: nbtree.h:30
#define P_NONE
Definition: nbtree.h:213
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define SK_BT_MAXVAL
Definition: nbtree.h:1140
#define BT_READ
Definition: nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
#define BTMaxItemSize
Definition: nbtree.h:165
#define BTP_SPLIT_END
Definition: nbtree.h:82
#define BTOPTIONS_PROC
Definition: nbtree.h:721
#define SK_BT_MINVAL
Definition: nbtree.h:1139
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1096
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1541
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:295
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:3618
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:3646
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:3310
bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1339
char * btbuildphasename(int64 phasenum)
Definition: nbtutils.c:3742
bytea * btoptions(Datum reloptions, bool validate)
Definition: nbtutils.c:3696
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition: nbtutils.c:3719
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:4273
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:611
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:3561
bool btvalidate(Oid opclassoid)
Definition: nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: nbtvalidate.c:288
#define makeNode(_type_)
Definition: nodes.h:161
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
int16 attnum
Definition: pg_attribute.h:74
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:125
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:124
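
Vacuum progress is reported in blocks: the total is set once the index size is known and the done counter advances as pages are processed. A sketch, assuming pgstat_progress_update_param() from backend_progress.h (not listed above) and an info/num_pages/blocks_done already in scope:

/* once the number of index blocks is known */
if (info->report_progress)
    pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, num_pages);

/* after each page has been processed */
if (info->report_progress)
    pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, blocks_done);
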
void read_stream_reset(ReadStream *stream)
Definition: read_stream.c:1010
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:716
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
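
btvacuumscan() drives its physical-order walk of the index through a read stream over a block range. A hedged sketch of that loop, assuming the BlockRangeReadStreamPrivate struct (current_blocknum / last_exclusive) that block_range_read_stream_cb() expects, with rel, info, vstate, and num_pages already in scope:

BlockRangeReadStreamPrivate p;
ReadStream *stream;
Buffer      buf;

p.current_blocknum = BTREE_METAPAGE + 1;    /* skip the metapage */
p.last_exclusive = num_pages;

stream = read_stream_begin_relation(READ_STREAM_FULL | READ_STREAM_USE_BATCHING,
                                    info->strategy,
                                    rel,
                                    MAIN_FORKNUM,
                                    block_range_read_stream_cb,
                                    &p,
                                    0);

while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
    (void) btvacuumpage(vstate, buf);       /* returns the block it actually scanned */

read_stream_end(stream);
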
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:659
#define RelationGetDescr(relation)
Definition: rel.h:542
#define RelationGetRelationName(relation)
Definition: rel.h:550
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:535
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
ScanDirection
Definition: sdir.h:25
@ ForwardScanDirection
Definition: sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:7161
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
#define SK_SEARCHNULL
Definition: skey.h:121
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTMaxStrategyNumber
Definition: stratnum.h:35
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
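
For btree, strategy numbers and CompareType values correspond one-to-one, which is the mapping bttranslatestrategy() and bttranslatecmptype() encode (the opfamily argument is not needed for the core mapping). A sketch of the forward direction; the function name is illustrative, not the real bttranslatestrategy():

static CompareType
bt_strategy_to_cmptype(StrategyNumber strategy)
{
    switch (strategy)
    {
        case BTLessStrategyNumber:
            return COMPARE_LT;
        case BTLessEqualStrategyNumber:
            return COMPARE_LE;
        case BTEqualStrategyNumber:
            return COMPARE_EQ;
        case BTGreaterEqualStrategyNumber:
            return COMPARE_GE;
        case BTGreaterStrategyNumber:
            return COMPARE_GT;
        default:
            return COMPARE_INVALID;
    }
}
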
bool attbyval
Definition: nbtree.h:1046
Datum * elem_values
Definition: nbtree.h:1041
int16 attlen
Definition: nbtree.h:1045
BlockNumber btpo_next
Definition: nbtree.h:66
uint16 btpo_flags
Definition: nbtree.h:68
BTCycleId btpo_cycleid
Definition: nbtree.h:69
BTPS_State btps_pageStatus
Definition: nbtree.c:72
BlockNumber btps_lastCurrPage
Definition: nbtree.c:70
ConditionVariable btps_cv
Definition: nbtree.c:76
BlockNumber btps_nextScanPage
Definition: nbtree.c:69
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.c:83
bool needPrimScan
Definition: nbtree.h:1063
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
char * markTuples
Definition: nbtree.h:1080
FmgrInfo * orderProcs
Definition: nbtree.h:1067
BTScanPosData currPos
Definition: nbtree.h:1092
int * killedItems
Definition: nbtree.h:1071
char * currTuples
Definition: nbtree.h:1079
bool oppositeDirCheck
Definition: nbtree.h:1065
BTScanPosData markPos
Definition: nbtree.h:1093
ScanKey keyData
Definition: nbtree.h:1058
MemoryContext arrayContext
Definition: nbtree.h:1068
bool moreRight
Definition: nbtree.h:986
Buffer buf
Definition: nbtree.h:964
int nextTupleOffset
Definition: nbtree.h:979
bool moreLeft
Definition: nbtree.h:985
int lastItem
Definition: nbtree.h:996
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
int itemIndex
Definition: nbtree.h:997
ScanDirection dir
Definition: nbtree.h:973
ItemPointerData heapTid
Definition: nbtree.h:957
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTCycleId cycleid
Definition: nbtree.h:337
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
IndexBulkDeleteCallback callback
Definition: nbtree.h:335
MemoryContext pagedelcontext
Definition: nbtree.h:338
IndexVacuumInfo * info
Definition: nbtree.h:333
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
void * callback_state
Definition: nbtree.h:336
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918
int16 attlen
Definition: tupdesc.h:71
ambuildphasename_function ambuildphasename
Definition: amapi.h:304
ambuildempty_function ambuildempty
Definition: amapi.h:294
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:298
bool amclusterable
Definition: amapi.h:268
amoptions_function amoptions
Definition: amapi.h:302
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:316
amrestrpos_function amrestrpos
Definition: amapi.h:313
aminsert_function aminsert
Definition: amapi.h:295
amendscan_function amendscan
Definition: amapi.h:311
amtranslate_strategy_function amtranslatestrategy
Definition: amapi.h:321
uint16 amoptsprocnum
Definition: amapi.h:242
amparallelrescan_function amparallelrescan
Definition: amapi.h:318
Oid amkeytype
Definition: amapi.h:284
bool amconsistentordering
Definition: amapi.h:252
bool ampredlocks
Definition: amapi.h:270
uint16 amsupport
Definition: amapi.h:240
amtranslate_cmptype_function amtranslatecmptype
Definition: amapi.h:322
amcostestimate_function amcostestimate
Definition: amapi.h:300
bool amcanorderbyop
Definition: amapi.h:246
amadjustmembers_function amadjustmembers
Definition: amapi.h:306
ambuild_function ambuild
Definition: amapi.h:293
bool amstorage
Definition: amapi.h:266
uint16 amstrategies
Definition: amapi.h:238
bool amoptionalkey
Definition: amapi.h:260
amgettuple_function amgettuple
Definition: amapi.h:309
amcanreturn_function amcanreturn
Definition: amapi.h:299
bool amcanunique
Definition: amapi.h:256
amgetbitmap_function amgetbitmap
Definition: amapi.h:310
amproperty_function amproperty
Definition: amapi.h:303
ambulkdelete_function ambulkdelete
Definition: amapi.h:297
bool amsearcharray
Definition: amapi.h:262
bool amsummarizing
Definition: amapi.h:280
amvalidate_function amvalidate
Definition: amapi.h:305
ammarkpos_function ammarkpos
Definition: amapi.h:312
bool amcanmulticol
Definition: amapi.h:258
bool amusemaintenanceworkmem
Definition: amapi.h:278
ambeginscan_function ambeginscan
Definition: amapi.h:307
bool amcanparallel
Definition: amapi.h:272
amrescan_function amrescan
Definition: amapi.h:308
bool amcanorder
Definition: amapi.h:244
bool amcanbuildparallel
Definition: amapi.h:274
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:317
uint8 amparallelvacuumoptions
Definition: amapi.h:282
aminsertcleanup_function aminsertcleanup
Definition: amapi.h:296
bool amcanbackward
Definition: amapi.h:254
amgettreeheight_function amgettreeheight
Definition: amapi.h:301
bool amcaninclude
Definition: amapi.h:276
bool amsearchnulls
Definition: amapi.h:264
bool amconsistentequality
Definition: amapi.h:250
bool amcanhash
Definition: amapi.h:248
BlockNumber pages_deleted
Definition: genam.h:105
BlockNumber pages_free
Definition: genam.h:106
BlockNumber num_pages
Definition: genam.h:100
double tuples_removed
Definition: genam.h:103
double num_index_tuples
Definition: genam.h:102
struct ScanKeyData * keyData
Definition: relscan.h:141
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:191
bool kill_prior_tuple
Definition: relscan.h:147
struct TupleDescData * xs_itupdesc
Definition: relscan.h:168
Relation indexRelation
Definition: relscan.h:137
ItemPointerData xs_heaptid
Definition: relscan.h:172
ItemPointerData t_tid
Definition: itup.h:37
Relation index
Definition: genam.h:69
double num_heap_tuples
Definition: genam.h:75
bool analyze_only
Definition: genam.h:71
BufferAccessStrategy strategy
Definition: genam.h:76
Relation heaprel
Definition: genam.h:70
bool report_progress
Definition: genam.h:72
bool estimated_count
Definition: genam.h:73
Definition: lwlock.h:42
TupleDesc rd_att
Definition: rel.h:112
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
Definition: type.h:96
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:366
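
btgetbitmap() funnels every matching heap TID into the caller's TIDBitmap with tbm_add_tuples() and returns the count. A deliberately simplified sketch (the real function batches currPos.items a page at a time rather than looping tuple by tuple):

int64 ntids = 0;

if (_bt_first(scan, ForwardScanDirection))
{
    do
    {
        /* scan->xs_heaptid was set by _bt_first()/_bt_next() */
        tbm_add_tuples(tbm, &scan->xs_heaptid, 1, false);
        ntids++;
    } while (_bt_next(scan, ForwardScanDirection));
}
/* ntids becomes the function's return value */
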
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:175
void vacuum_delay_point(bool is_analyze)
Definition: vacuum.c:2402
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:48
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition: vacuum.h:55