PostgreSQL Source Code (git master)
src/backend/access/nbtree/nbtree.c
1/*-------------------------------------------------------------------------
2 *
3 * nbtree.c
4 * Implementation of Lehman and Yao's btree management algorithm for
5 * Postgres.
6 *
7 * NOTES
8 * This file contains only the public interface routines.
9 *
10 *
11 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 * Portions Copyright (c) 1994, Regents of the University of California
13 *
14 * IDENTIFICATION
15 * src/backend/access/nbtree/nbtree.c
16 *
17 *-------------------------------------------------------------------------
18 */
19#include "postgres.h"
20
21#include "access/nbtree.h"
22#include "access/relscan.h"
23#include "access/stratnum.h"
24#include "commands/progress.h"
25#include "commands/vacuum.h"
26#include "nodes/execnodes.h"
27#include "pgstat.h"
28#include "storage/bulk_write.h"
29#include "storage/condition_variable.h"
30#include "storage/indexfsm.h"
31#include "storage/ipc.h"
32#include "storage/lmgr.h"
33#include "utils/fmgrprotos.h"
34#include "utils/index_selfuncs.h"
35#include "utils/memutils.h"
36
37
38/*
39 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
40 *
41 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
42 * scan to advance it via another call to _bt_first.
43 *
44 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
45 * a new page; others must wait.
46 *
47 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
48 * to a new page; some process can start doing that.
49 *
50 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
51 */
52typedef enum
53{
54 BTPARALLEL_NOT_INITIALIZED,
55 BTPARALLEL_NEED_PRIMSCAN,
56 BTPARALLEL_ADVANCING,
57 BTPARALLEL_IDLE,
58 BTPARALLEL_DONE,
59} BTPS_State;
60
61/*
62 * BTParallelScanDescData contains btree specific shared information required
63 * for parallel scan.
64 */
65typedef struct BTParallelScanDescData
66{
67 BlockNumber btps_nextScanPage; /* next page to be scanned */
68 BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
69 * btps_nextScanPage */
70 BTPS_State btps_pageStatus; /* indicates whether next page is
71 * available for scan. see above for
72 * possible states of parallel scan. */
73 slock_t btps_mutex; /* protects above variables, btps_arrElems */
74 ConditionVariable btps_cv; /* used to synchronize parallel scan */
75
76 /*
77 * btps_arrElems is used when scans need to schedule another primitive
78 * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79 */
80 int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81} BTParallelScanDescData;
82
83typedef struct BTParallelScanDescData *BTParallelScanDesc;
84
85
86static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87 IndexBulkDeleteCallback callback, void *callback_state,
88 BTCycleId cycleid);
89static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91 IndexTuple posting,
92 OffsetNumber updatedoffset,
93 int *nremaining);
94
95
96/*
97 * Btree handler function: return IndexAmRoutine with access method parameters
98 * and callbacks.
99 */
100Datum
101bthandler(PG_FUNCTION_ARGS)
102{
103 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104
105 amroutine->amstrategies = BTMaxStrategyNumber;
106 amroutine->amsupport = BTNProcs;
107 amroutine->amoptsprocnum = BTOPTIONS_PROC;
108 amroutine->amcanorder = true;
109 amroutine->amcanorderbyop = false;
110 amroutine->amcanbackward = true;
111 amroutine->amcanunique = true;
112 amroutine->amcanmulticol = true;
113 amroutine->amoptionalkey = true;
114 amroutine->amsearcharray = true;
115 amroutine->amsearchnulls = true;
116 amroutine->amstorage = false;
117 amroutine->amclusterable = true;
118 amroutine->ampredlocks = true;
119 amroutine->amcanparallel = true;
120 amroutine->amcanbuildparallel = true;
121 amroutine->amcaninclude = true;
122 amroutine->amusemaintenanceworkmem = false;
123 amroutine->amsummarizing = false;
124 amroutine->amparallelvacuumoptions =
125 VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
126 amroutine->amkeytype = InvalidOid;
127
128 amroutine->ambuild = btbuild;
129 amroutine->ambuildempty = btbuildempty;
130 amroutine->aminsert = btinsert;
131 amroutine->aminsertcleanup = NULL;
132 amroutine->ambulkdelete = btbulkdelete;
133 amroutine->amvacuumcleanup = btvacuumcleanup;
134 amroutine->amcanreturn = btcanreturn;
135 amroutine->amcostestimate = btcostestimate;
136 amroutine->amgettreeheight = btgettreeheight;
137 amroutine->amoptions = btoptions;
138 amroutine->amproperty = btproperty;
139 amroutine->ambuildphasename = btbuildphasename;
140 amroutine->amvalidate = btvalidate;
141 amroutine->amadjustmembers = btadjustmembers;
142 amroutine->ambeginscan = btbeginscan;
143 amroutine->amrescan = btrescan;
144 amroutine->amgettuple = btgettuple;
145 amroutine->amgetbitmap = btgetbitmap;
146 amroutine->amendscan = btendscan;
147 amroutine->ammarkpos = btmarkpos;
148 amroutine->amrestrpos = btrestrpos;
149 amroutine->amestimateparallelscan = btestimateparallelscan;
150 amroutine->aminitparallelscan = btinitparallelscan;
151 amroutine->amparallelrescan = btparallelrescan;
152 amroutine->amtranslatestrategy = bttranslatestrategy;
153 amroutine->amtranslatecmptype = bttranslatecmptype;
154
155 PG_RETURN_POINTER(amroutine);
156}
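bthandler() is never called directly; the core system looks the handler function up by OID, invokes it through the function manager, and then works with the IndexAmRoutine it returns. A minimal sketch of consuming that routine, assuming amapi.c's GetIndexAmRoutine() helper (illustrative only, not part of nbtree.c):

#include "postgres.h"
#include "access/amapi.h"

/* Does the AM whose handler function is 'amhandler' support backward scans? */
static bool
am_supports_backward_scan(Oid amhandler)
{
	IndexAmRoutine *amroutine = GetIndexAmRoutine(amhandler);
	bool		result = amroutine->amcanbackward;

	pfree(amroutine);
	return result;
}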
157
158/*
159 * btbuildempty() -- build an empty btree index in the initialization fork
160 */
161void
162btbuildempty(Relation index)
163{
164 bool allequalimage = _bt_allequalimage(index, false);
165 BulkWriteState *bulkstate;
166 BulkWriteBuffer metabuf;
167
168 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
169
170 /* Construct metapage. */
171 metabuf = smgr_bulk_get_buf(bulkstate);
172 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
173 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
174
175 smgr_bulk_finish(bulkstate);
176}
177
178/*
179 * btinsert() -- insert an index tuple into a btree.
180 *
181 * Descend the tree recursively, find the appropriate location for our
182 * new tuple, and put it there.
183 */
184bool
185btinsert(Relation rel, Datum *values, bool *isnull,
186 ItemPointer ht_ctid, Relation heapRel,
187 IndexUniqueCheck checkUnique,
188 bool indexUnchanged,
189 IndexInfo *indexInfo)
190{
191 bool result;
192 IndexTuple itup;
193
194 /* generate an index tuple */
195 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
196 itup->t_tid = *ht_ctid;
197
198 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
199
200 pfree(itup);
201
202 return result;
203}
204
205/*
206 * btgettuple() -- Get the next tuple in the scan.
207 */
208bool
209btgettuple(IndexScanDesc scan, ScanDirection dir)
210{
211 BTScanOpaque so = (BTScanOpaque) scan->opaque;
212 bool res;
213
214 /* btree indexes are never lossy */
215 scan->xs_recheck = false;
216
217 /* Each loop iteration performs another primitive index scan */
218 do
219 {
220 /*
221 * If we've already initialized this scan, we can just advance it in
222 * the appropriate direction. If we haven't done so yet, we call
223 * _bt_first() to get the first item in the scan.
224 */
225 if (!BTScanPosIsValid(so->currPos))
226 res = _bt_first(scan, dir);
227 else
228 {
229 /*
230 * Check to see if we should kill the previously-fetched tuple.
231 */
232 if (scan->kill_prior_tuple)
233 {
234 /*
235 * Yes, remember it for later. (We'll deal with all such
236 * tuples at once right before leaving the index page.) The
237 * test for numKilled overrun is not just paranoia: if the
238 * caller reverses direction in the indexscan then the same
239 * item might get entered multiple times. It's not worth
240 * trying to optimize that, so we don't detect it, but instead
241 * just forget any excess entries.
242 */
243 if (so->killedItems == NULL)
244 so->killedItems = (int *)
245 palloc(MaxTIDsPerBTreePage * sizeof(int));
246 if (so->numKilled < MaxTIDsPerBTreePage)
247 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
248 }
249
250 /*
251 * Now continue the scan.
252 */
253 res = _bt_next(scan, dir);
254 }
255
256 /* If we have a tuple, return it ... */
257 if (res)
258 break;
259 /* ... otherwise see if we need another primitive index scan */
260 } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
261
262 return res;
263}
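The kill_prior_tuple branch above only records item indexes; _bt_killitems() applies them in bulk when the scan leaves the page, and overflow entries caused by direction changes are simply dropped. A standalone sketch of that defer-and-clamp pattern (illustrative, not PostgreSQL code):

#include <stdio.h>
#include <stdlib.h>

#define MAX_KILLED 16			/* stand-in for MaxTIDsPerBTreePage */

typedef struct ScanState
{
	int		   *killedItems;	/* allocated lazily, as in btgettuple() */
	int			numKilled;
} ScanState;

/* Remember an item to kill when we leave the page; silently drop any excess. */
static void
remember_killed(ScanState *st, int itemIndex)
{
	if (st->killedItems == NULL)
		st->killedItems = malloc(MAX_KILLED * sizeof(int));
	if (st->numKilled < MAX_KILLED)
		st->killedItems[st->numKilled++] = itemIndex;
}

int
main(void)
{
	ScanState	st = {NULL, 0};

	for (int i = 0; i < 20; i++)	/* more kills than slots */
		remember_killed(&st, i);
	printf("remembered %d of 20 kills\n", st.numKilled);
	free(st.killedItems);
	return 0;
}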
264
265/*
266 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
267 */
268int64
269btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
270{
271 BTScanOpaque so = (BTScanOpaque) scan->opaque;
272 int64 ntids = 0;
273 ItemPointer heapTid;
274
275 /* Each loop iteration performs another primitive index scan */
276 do
277 {
278 /* Fetch the first page & tuple */
279 if (_bt_first(scan, ForwardScanDirection))
280 {
281 /* Save tuple ID, and continue scanning */
282 heapTid = &scan->xs_heaptid;
283 tbm_add_tuples(tbm, heapTid, 1, false);
284 ntids++;
285
286 for (;;)
287 {
288 /*
289 * Advance to next tuple within page. This is the same as the
290 * easy case in _bt_next().
291 */
292 if (++so->currPos.itemIndex > so->currPos.lastItem)
293 {
294 /* let _bt_next do the heavy lifting */
295 if (!_bt_next(scan, ForwardScanDirection))
296 break;
297 }
298
299 /* Save tuple ID, and continue scanning */
300 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
301 tbm_add_tuples(tbm, heapTid, 1, false);
302 ntids++;
303 }
304 }
305 /* Now see if we need another primitive index scan */
306 } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
307
308 return ntids;
309}
310
311/*
312 * btbeginscan() -- start a scan on a btree index
313 */
314IndexScanDesc
315btbeginscan(Relation rel, int nkeys, int norderbys)
316{
317 IndexScanDesc scan;
318 BTScanOpaque so;
319
320 /* no order by operators allowed */
321 Assert(norderbys == 0);
322
323 /* get the scan */
324 scan = RelationGetIndexScan(rel, nkeys, norderbys);
325
326 /* allocate private workspace */
327 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
328 BTScanPosInvalidate(so->currPos);
329 BTScanPosInvalidate(so->markPos);
330 if (scan->numberOfKeys > 0)
331 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
332 else
333 so->keyData = NULL;
334
335 so->needPrimScan = false;
336 so->scanBehind = false;
337 so->oppositeDirCheck = false;
338 so->arrayKeys = NULL;
339 so->orderProcs = NULL;
340 so->arrayContext = NULL;
341
342 so->killedItems = NULL; /* until needed */
343 so->numKilled = 0;
344
345 /*
346 * We don't know yet whether the scan will be index-only, so we do not
347 * allocate the tuple workspace arrays until btrescan. However, we set up
348 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
349 */
350 so->currTuples = so->markTuples = NULL;
351
352 scan->xs_itupdesc = RelationGetDescr(rel);
353
354 scan->opaque = so;
355
356 return scan;
357}
358
359/*
360 * btrescan() -- rescan an index relation
361 */
362void
363btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
364 ScanKey orderbys, int norderbys)
365{
366 BTScanOpaque so = (BTScanOpaque) scan->opaque;
367
368 /* we aren't holding any read locks, but gotta drop the pins */
369 if (BTScanPosIsValid(so->currPos))
370 {
371 /* Before leaving current page, deal with any killed items */
372 if (so->numKilled > 0)
373 _bt_killitems(scan);
374 BTScanPosUnpinIfPinned(so->currPos);
375 BTScanPosInvalidate(so->currPos);
376 }
377
378 so->markItemIndex = -1;
379 so->needPrimScan = false;
380 so->scanBehind = false;
381 so->oppositeDirCheck = false;
382 BTScanPosUnpinIfPinned(so->markPos);
383 BTScanPosInvalidate(so->markPos);
384
385 /*
386 * Allocate tuple workspace arrays, if needed for an index-only scan and
387 * not already done in a previous rescan call. To save on palloc
388 * overhead, both workspaces are allocated as one palloc block; only this
389 * function and btendscan know that.
390 *
391 * NOTE: this data structure also makes it safe to return data from a
392 * "name" column, even though btree name_ops uses an underlying storage
393 * datatype of cstring. The risk there is that "name" is supposed to be
394 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
395 * However, since we only return data out of tuples sitting in the
396 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
397 * data out of the markTuples array --- running off the end of memory for
398 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
399 * adding special-case treatment for name_ops elsewhere.
400 */
401 if (scan->xs_want_itup && so->currTuples == NULL)
402 {
403 so->currTuples = (char *) palloc(BLCKSZ * 2);
404 so->markTuples = so->currTuples + BLCKSZ;
405 }
406
407 /*
408 * Reset the scan keys
409 */
410 if (scankey && scan->numberOfKeys > 0)
411 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
412 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
413 so->numArrayKeys = 0; /* ditto */
414}
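The comment in btrescan() argues that a NAMEDATALEN-sized fetch of a short "name" value cannot run off the end of memory, because currTuples and markTuples come from a single 2*BLCKSZ allocation. A standalone sketch of that argument (illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLCKSZ      8192
#define NAMEDATALEN 64

int
main(void)
{
	/* One allocation provides both workspaces, as btrescan() does. */
	char	   *currTuples = malloc(BLCKSZ * 2);
	char	   *markTuples = currTuples + BLCKSZ;

	/*
	 * A short cstring stored near the end of currTuples: reading a full
	 * NAMEDATALEN bytes may run past the string, but at worst it reads the
	 * start of markTuples, never past the single allocation.
	 */
	char	   *name = currTuples + BLCKSZ - 8;
	char		padded[NAMEDATALEN];

	strcpy(name, "abc");
	memcpy(padded, name, NAMEDATALEN);	/* stays inside the 2*BLCKSZ block */
	printf("%s\n", padded);

	free(currTuples);
	return 0;
}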
415
416/*
417 * btendscan() -- close down a scan
418 */
419void
420btendscan(IndexScanDesc scan)
421{
422 BTScanOpaque so = (BTScanOpaque) scan->opaque;
423
424 /* we aren't holding any read locks, but gotta drop the pins */
425 if (BTScanPosIsValid(so->currPos))
426 {
427 /* Before leaving current page, deal with any killed items */
428 if (so->numKilled > 0)
429 _bt_killitems(scan);
430 BTScanPosUnpinIfPinned(so->currPos);
431 }
432
433 so->markItemIndex = -1;
434 BTScanPosUnpinIfPinned(so->markPos);
435
436 /* No need to invalidate positions, the RAM is about to be freed. */
437
438 /* Release storage */
439 if (so->keyData != NULL)
440 pfree(so->keyData);
441 /* so->arrayKeys and so->orderProcs are in arrayContext */
442 if (so->arrayContext != NULL)
443 MemoryContextDelete(so->arrayContext);
444 if (so->killedItems != NULL)
445 pfree(so->killedItems);
446 if (so->currTuples != NULL)
447 pfree(so->currTuples);
448 /* so->markTuples should not be pfree'd, see btrescan */
449 pfree(so);
450}
451
452/*
453 * btmarkpos() -- save current scan position
454 */
455void
456btmarkpos(IndexScanDesc scan)
457{
458 BTScanOpaque so = (BTScanOpaque) scan->opaque;
459
460 /* There may be an old mark with a pin (but no lock). */
461 BTScanPosUnpinIfPinned(so->markPos);
462
463 /*
464 * Just record the current itemIndex. If we later step to next page
465 * before releasing the marked position, _bt_steppage makes a full copy of
466 * the currPos struct in markPos. If (as often happens) the mark is moved
467 * before we leave the page, we don't have to do that work.
468 */
469 if (BTScanPosIsValid(so->currPos))
470 so->markItemIndex = so->currPos.itemIndex;
471 else
472 {
473 BTScanPosInvalidate(so->markPos);
474 so->markItemIndex = -1;
475 }
476}
477
478/*
479 * btrestrpos() -- restore scan to last saved position
480 */
481void
482btrestrpos(IndexScanDesc scan)
483{
484 BTScanOpaque so = (BTScanOpaque) scan->opaque;
485
486 if (so->markItemIndex >= 0)
487 {
488 /*
489 * The scan has never moved to a new page since the last mark. Just
490 * restore the itemIndex.
491 *
492 * NB: In this case we can't count on anything in so->markPos to be
493 * accurate.
494 */
495 so->currPos.itemIndex = so->markItemIndex;
496 }
497 else
498 {
499 /*
500 * The scan moved to a new page after last mark or restore, and we are
501 * now restoring to the marked page. We aren't holding any read
502 * locks, but if we're still holding the pin for the current position,
503 * we must drop it.
504 */
505 if (BTScanPosIsValid(so->currPos))
506 {
507 /* Before leaving current page, deal with any killed items */
508 if (so->numKilled > 0)
509 _bt_killitems(scan);
510 BTScanPosUnpinIfPinned(so->currPos);
511 }
512
513 if (BTScanPosIsValid(so->markPos))
514 {
515 /* bump pin on mark buffer for assignment to current buffer */
516 if (BTScanPosIsPinned(so->markPos))
517 IncrBufferRefCount(so->markPos.buf);
518 memcpy(&so->currPos, &so->markPos,
519 offsetof(BTScanPosData, items[1]) +
520 so->markPos.lastItem * sizeof(BTScanPosItem));
521 if (so->currTuples)
522 memcpy(so->currTuples, so->markTuples,
523 so->markPos.nextTupleOffset);
524 /* Reset the scan's array keys (see _bt_steppage for why) */
525 if (so->numArrayKeys)
526 {
527 _bt_start_array_keys(scan, so->currPos.dir);
528 so->needPrimScan = false;
529 }
530 }
531 else
532 BTScanPosInvalidate(so->currPos);
533 }
534}
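btrestrpos() copies only the live prefix of the marked position: the struct header plus items[0..lastItem], computed as offsetof(BTScanPosData, items[1]) plus lastItem further entries. A standalone sketch of that partial-struct copy (illustrative, with made-up Pos/Item types):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct Item
{
	int			heapTid;
} Item;

typedef struct Pos
{
	int			lastItem;		/* index of last valid entry in items[] */
	int			itemIndex;
	Item		items[256];
} Pos;

/* Copy the header plus only the valid prefix of items[], as btrestrpos() does. */
static void
copy_pos(Pos *dst, const Pos *src)
{
	memcpy(dst, src,
		   offsetof(Pos, items[1]) + src->lastItem * sizeof(Item));
}

int
main(void)
{
	Pos			mark = {.lastItem = 2, .itemIndex = 0,
						.items = {{10}, {20}, {30}}};
	Pos			curr;

	copy_pos(&curr, &mark);
	printf("%d %d %d\n", curr.items[0].heapTid,
		   curr.items[1].heapTid, curr.items[2].heapTid);
	return 0;
}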
535
536/*
537 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
538 */
539Size
540btestimateparallelscan(int nkeys, int norderbys)
541{
542 /* Pessimistically assume all input scankeys will be output with arrays */
543 return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
544}
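The size computed above is just the fixed-size header of BTParallelScanDescData plus one int of btps_arrElems[] per scan key, the usual offsetof idiom for a struct that ends in a flexible array member. A standalone sketch of the same arithmetic (illustrative only):

#include <stddef.h>
#include <stdio.h>

typedef struct Shared
{
	int			lock;			/* stand-ins for the fixed-size fields */
	int			pageStatus;
	int			arrElems[];		/* FLEXIBLE_ARRAY_MEMBER */
} Shared;

int
main(void)
{
	int			nkeys = 3;

	/* Same shape as btestimateparallelscan(): header + one int per key. */
	size_t		sz = offsetof(Shared, arrElems) + sizeof(int) * nkeys;

	printf("%zu bytes\n", sz);
	return 0;
}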
545
546/*
547 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
548 */
549void
550btinitparallelscan(void *target)
551{
552 BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
553
554 SpinLockInit(&bt_target->btps_mutex);
555 bt_target->btps_nextScanPage = InvalidBlockNumber;
556 bt_target->btps_lastCurrPage = InvalidBlockNumber;
557 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
558 ConditionVariableInit(&bt_target->btps_cv);
559}
560
561/*
562 * btparallelrescan() -- reset parallel scan
563 */
564void
565btparallelrescan(IndexScanDesc scan)
566{
567 BTParallelScanDesc btscan;
568 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
569
570 Assert(parallel_scan);
571
572 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
573 parallel_scan->ps_offset);
574
575 /*
576 * In theory, we don't need to acquire the spinlock here, because there
577 * shouldn't be any other workers running at this point, but we do so for
578 * consistency.
579 */
580 SpinLockAcquire(&btscan->btps_mutex);
581 btscan->btps_nextScanPage = InvalidBlockNumber;
582 btscan->btps_lastCurrPage = InvalidBlockNumber;
583 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
584 SpinLockRelease(&btscan->btps_mutex);
585}
586
587/*
588 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
589 * page. Other scans must wait until we call _bt_parallel_release()
590 * or _bt_parallel_done().
591 *
592 * The return value is true if we successfully seized the scan and false
593 * if we did not. The latter case occurs when no pages remain, or when
594 * another primitive index scan is scheduled that caller's backend cannot
595 * start just yet (only backends that call from _bt_first are capable of
596 * starting primitive index scans, which they indicate by passing first=true).
597 *
598 * If the return value is true, *next_scan_page returns the next page of the
599 * scan, and *last_curr_page returns the page that *next_scan_page came from.
600 * An invalid *next_scan_page means the scan hasn't yet started, or that
601 * caller needs to start the next primitive index scan (if it's the latter
602 * case we'll set so.needPrimScan).
603 *
604 * Callers should ignore the value of *next_scan_page and *last_curr_page if
605 * the return value is false.
606 */
607bool
608_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
609 BlockNumber *last_curr_page, bool first)
610{
611 BTScanOpaque so = (BTScanOpaque) scan->opaque;
612 bool exit_loop = false,
613 status = true,
614 endscan = false;
615 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
616 BTParallelScanDesc btscan;
617
618 *next_scan_page = InvalidBlockNumber;
619 *last_curr_page = InvalidBlockNumber;
620
621 /*
622 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
623 * call to _bt_readnextpage treats this backend similarly to a serial
624 * backend that steps from *last_curr_page to *next_scan_page (unless this
625 * backend's so->currPos is initialized by _bt_readfirstpage before then).
626 */
627 BTScanPosInvalidate(so->currPos);
628 so->currPos.moreLeft = so->currPos.moreRight = true;
629
630 if (first)
631 {
632 /*
633 * Initialize array related state when called from _bt_first, assuming
634 * that this will be the first primitive index scan for the scan
635 */
636 so->needPrimScan = false;
637 so->scanBehind = false;
638 so->oppositeDirCheck = false;
639 }
640 else
641 {
642 /*
643 * Don't attempt to seize the scan when it requires another primitive
644 * index scan, since caller's backend cannot start it right now
645 */
646 if (so->needPrimScan)
647 return false;
648 }
649
650 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
651 parallel_scan->ps_offset);
652
653 while (1)
654 {
655 SpinLockAcquire(&btscan->btps_mutex);
656
657 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
658 {
659 /* We're done with this parallel index scan */
660 status = false;
661 }
662 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
663 btscan->btps_nextScanPage == P_NONE)
664 {
665 /* End this parallel index scan */
666 status = false;
667 endscan = true;
668 }
669 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
670 {
671 Assert(so->numArrayKeys);
672
673 if (first)
674 {
675 /* Can start scheduled primitive scan right away, so do so */
676 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
677 for (int i = 0; i < so->numArrayKeys; i++)
678 {
679 BTArrayKeyInfo *array = &so->arrayKeys[i];
680 ScanKey skey = &so->keyData[array->scan_key];
681
682 array->cur_elem = btscan->btps_arrElems[i];
683 skey->sk_argument = array->elem_values[array->cur_elem];
684 }
685 exit_loop = true;
686 }
687 else
688 {
689 /*
690 * Don't attempt to seize the scan when it requires another
691 * primitive index scan, since caller's backend cannot start
692 * it right now
693 */
694 status = false;
695 }
696
697 /*
698 * Either way, update backend local state to indicate that a
699 * pending primitive scan is required
700 */
701 so->needPrimScan = true;
702 so->scanBehind = false;
703 so->oppositeDirCheck = false;
704 }
705 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
706 {
707 /*
708 * We have successfully seized control of the scan for the purpose
709 * of advancing it to a new page!
710 */
711 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
712 Assert(btscan->btps_nextScanPage != P_NONE);
713 *next_scan_page = btscan->btps_nextScanPage;
714 *last_curr_page = btscan->btps_lastCurrPage;
715 exit_loop = true;
716 }
717 SpinLockRelease(&btscan->btps_mutex);
718 if (exit_loop || !status)
719 break;
720 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
721 }
722 ConditionVariableCancelSleep();
723
724 /* When the scan has reached the rightmost (or leftmost) page, end it */
725 if (endscan)
726 _bt_parallel_done(scan);
727
728 return status;
729}
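The loop above is the seize protocol: take the mutex, act on the current BTPS_* state, and otherwise sleep on the condition variable and retry. nbtree uses a spinlock plus ConditionVariableSleep(); the standalone pthread sketch below mirrors only that acquire/check/sleep/retry shape and is not PostgreSQL code:

#include <pthread.h>
#include <stdbool.h>

typedef enum {IDLE, ADVANCING, DONE} State;

typedef struct ParallelScan
{
	pthread_mutex_t mutex;
	pthread_cond_t cv;
	State		state;
	unsigned	next_page;
} ParallelScan;

/* Same shape as _bt_parallel_seize(): loop until we may advance or are done. */
bool
seize(ParallelScan *ps, unsigned *next_page)
{
	bool		ok = false;

	pthread_mutex_lock(&ps->mutex);
	while (ps->state == ADVANCING)	/* someone else is advancing the scan */
		pthread_cond_wait(&ps->cv, &ps->mutex);
	if (ps->state != DONE)
	{
		ps->state = ADVANCING;		/* we now own the advance */
		*next_page = ps->next_page;
		ok = true;
	}
	pthread_mutex_unlock(&ps->mutex);
	return ok;
}

/* Counterpart of _bt_parallel_release(): publish the new page, wake waiters. */
void
release(ParallelScan *ps, unsigned next_page)
{
	pthread_mutex_lock(&ps->mutex);
	ps->next_page = next_page;
	ps->state = IDLE;
	pthread_mutex_unlock(&ps->mutex);
	pthread_cond_broadcast(&ps->cv);
}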
730
731/*
732 * _bt_parallel_release() -- Complete the process of advancing the scan to a
733 * new page. We now have the new value btps_nextScanPage; another backend
734 * can now begin advancing the scan.
735 *
736 * Callers whose scan uses array keys must save their curr_page argument so
737 * that it can be passed to _bt_parallel_primscan_schedule, should caller
738 * determine that another primitive index scan is required.
739 *
740 * If caller's next_scan_page is P_NONE, the scan has reached the index's
741 * rightmost/leftmost page. This is treated as reaching the end of the scan
742 * within _bt_parallel_seize.
743 *
744 * Note: unlike the serial case, parallel scans don't need to remember both
745 * sibling links. next_scan_page is whichever link is next given the scan's
746 * direction. That's all we'll ever need, since the direction of a parallel
747 * scan can never change.
748 */
749void
750_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
751 BlockNumber curr_page)
752{
753 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
754 BTParallelScanDesc btscan;
755
756 Assert(BlockNumberIsValid(next_scan_page));
757
758 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
759 parallel_scan->ps_offset);
760
761 SpinLockAcquire(&btscan->btps_mutex);
762 btscan->btps_nextScanPage = next_scan_page;
763 btscan->btps_lastCurrPage = curr_page;
764 btscan->btps_pageStatus = BTPARALLEL_IDLE;
765 SpinLockRelease(&btscan->btps_mutex);
766 ConditionVariableSignal(&btscan->btps_cv);
767}
768
769/*
770 * _bt_parallel_done() -- Mark the parallel scan as complete.
771 *
772 * When there are no pages left to scan, this function should be called to
773 * notify other workers. Otherwise, they might wait forever for the scan to
774 * advance to the next page.
775 */
776void
777_bt_parallel_done(IndexScanDesc scan)
778{
779 BTScanOpaque so = (BTScanOpaque) scan->opaque;
780 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
781 BTParallelScanDesc btscan;
782 bool status_changed = false;
783
784 Assert(!BTScanPosIsValid(so->currPos));
785
786 /* Do nothing, for non-parallel scans */
787 if (parallel_scan == NULL)
788 return;
789
790 /*
791 * Should not mark parallel scan done when there's still a pending
792 * primitive index scan
793 */
794 if (so->needPrimScan)
795 return;
796
797 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
798 parallel_scan->ps_offset);
799
800 /*
801 * Mark the parallel scan as done, unless some other process did so
802 * already
803 */
804 SpinLockAcquire(&btscan->btps_mutex);
805 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
806 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
807 {
808 btscan->btps_pageStatus = BTPARALLEL_DONE;
809 status_changed = true;
810 }
811 SpinLockRelease(&btscan->btps_mutex);
812
813 /* wake up all the workers associated with this parallel scan */
814 if (status_changed)
815 ConditionVariableBroadcast(&btscan->btps_cv);
816}
817
818/*
819 * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
820 *
821 * Caller passes the curr_page most recently passed to _bt_parallel_release
822 * by its backend. Caller successfully schedules the next primitive index scan
823 * if the shared parallel state hasn't been seized since caller's backend last
824 * advanced the scan.
825 */
826void
827_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
828{
829 BTScanOpaque so = (BTScanOpaque) scan->opaque;
830 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
831 BTParallelScanDesc btscan;
832
833 Assert(so->numArrayKeys);
834
835 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
836 parallel_scan->ps_offset);
837
838 SpinLockAcquire(&btscan->btps_mutex);
839 if (btscan->btps_lastCurrPage == curr_page &&
840 btscan->btps_pageStatus == BTPARALLEL_IDLE)
841 {
842 btscan->btps_nextScanPage = InvalidBlockNumber;
843 btscan->btps_lastCurrPage = InvalidBlockNumber;
844 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
845
846 /* Serialize scan's current array keys */
847 for (int i = 0; i < so->numArrayKeys; i++)
848 {
849 BTArrayKeyInfo *array = &so->arrayKeys[i];
850
851 btscan->btps_arrElems[i] = array->cur_elem;
852 }
853 }
854 SpinLockRelease(&btscan->btps_mutex);
855}
856
857/*
858 * Bulk deletion of all index entries pointing to a set of heap tuples.
859 * The set of target tuples is specified via a callback routine that tells
860 * whether any given heap tuple (identified by ItemPointer) is being deleted.
861 *
862 * Result: a palloc'd struct containing statistical info for VACUUM displays.
863 */
864IndexBulkDeleteResult *
865btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
866 IndexBulkDeleteCallback callback, void *callback_state)
867{
868 Relation rel = info->index;
869 BTCycleId cycleid;
870
871 /* allocate stats if first time through, else re-use existing struct */
872 if (stats == NULL)
873 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
874
875 /* Establish the vacuum cycle ID to use for this scan */
876 /* The ENSURE stuff ensures we clean up shared memory on failure */
877 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
878 {
879 cycleid = _bt_start_vacuum(rel);
880
881 btvacuumscan(info, stats, callback, callback_state, cycleid);
882 }
883 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
884 _bt_end_vacuum(rel);
885
886 return stats;
887}
888
889/*
890 * Post-VACUUM cleanup.
891 *
892 * Result: a palloc'd struct containing statistical info for VACUUM displays.
893 */
894IndexBulkDeleteResult *
895btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
896{
897 BlockNumber num_delpages;
898
899 /* No-op in ANALYZE ONLY mode */
900 if (info->analyze_only)
901 return stats;
902
903 /*
904 * If btbulkdelete was called, we need not do anything (we just maintain
905 * the information used within _bt_vacuum_needs_cleanup() by calling
906 * _bt_set_cleanup_info() below).
907 *
908 * If btbulkdelete was _not_ called, then we have a choice to make: we
909 * must decide whether or not a btvacuumscan() call is needed now (i.e.
910 * whether the ongoing VACUUM operation can entirely avoid a physical scan
911 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
912 * now.
913 */
914 if (stats == NULL)
915 {
916 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
917 if (!_bt_vacuum_needs_cleanup(info->index))
918 return NULL;
919
920 /*
921 * Since we aren't going to actually delete any leaf items, there's no
922 * need to go through all the vacuum-cycle-ID pushups here.
923 *
924 * Posting list tuples are a source of inaccuracy for cleanup-only
925 * scans. btvacuumscan() will assume that the number of index tuples
926 * from each page can be used as num_index_tuples, even though
927 * num_index_tuples is supposed to represent the number of TIDs in the
928 * index. This naive approach can underestimate the number of tuples
929 * in the index significantly.
930 *
931 * We handle the problem by making num_index_tuples an estimate in
932 * cleanup-only case.
933 */
934 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
935 btvacuumscan(info, stats, NULL, NULL, 0);
936 stats->estimated_count = true;
937 }
938
939 /*
940 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
941 *
942 * num_delpages is the number of deleted pages now in the index that were
943 * not safe to place in the FSM to be recycled just yet. num_delpages is
944 * greater than 0 only when _bt_pagedel() actually deleted pages during
945 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
946 * have failed to place any newly deleted pages in the FSM just moments
947 * ago. (Actually, there are edge cases where recycling of the current
948 * VACUUM's newly deleted pages does not even become safe by the time the
949 * next VACUUM comes around. See nbtree/README.)
950 */
951 Assert(stats->pages_deleted >= stats->pages_free);
952 num_delpages = stats->pages_deleted - stats->pages_free;
953 _bt_set_cleanup_info(info->index, num_delpages);
954
955 /*
956 * It's quite possible for us to be fooled by concurrent page splits into
957 * double-counting some index tuples, so disbelieve any total that exceeds
958 * the underlying heap's count ... if we know that accurately. Otherwise
959 * this might just make matters worse.
960 */
961 if (!info->estimated_count)
962 {
963 if (stats->num_index_tuples > info->num_heap_tuples)
964 stats->num_index_tuples = info->num_heap_tuples;
965 }
966
967 return stats;
968}
969
970/*
971 * btvacuumscan --- scan the index for VACUUMing purposes
972 *
973 * This combines the functions of looking for leaf tuples that are deletable
974 * according to the vacuum callback, looking for empty pages that can be
975 * deleted, and looking for old deleted pages that can be recycled. Both
976 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
977 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
978 *
979 * The caller is responsible for initially allocating/zeroing a stats struct
980 * and for obtaining a vacuum cycle ID if necessary.
981 */
982static void
983btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
984 IndexBulkDeleteCallback callback, void *callback_state,
985 BTCycleId cycleid)
986{
987 Relation rel = info->index;
988 BTVacState vstate;
989 BlockNumber num_pages;
990 BlockNumber scanblkno;
991 bool needLock;
992
993 /*
994 * Reset fields that track information about the entire index now. This
995 * avoids double-counting in the case where a single VACUUM command
996 * requires multiple scans of the index.
997 *
998 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
999 * since they track information about the VACUUM command, and so must last
1000 * across each call to btvacuumscan().
1001 *
1002 * (Note that pages_free is treated as state about the whole index, not
1003 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1004 * calls are idempotent, and get repeated for the same deleted pages in
1005 * some scenarios. The point for us is to track the number of recyclable
1006 * pages in the index at the end of the VACUUM command.)
1007 */
1008 stats->num_pages = 0;
1009 stats->num_index_tuples = 0;
1010 stats->pages_deleted = 0;
1011 stats->pages_free = 0;
1012
1013 /* Set up info to pass down to btvacuumpage */
1014 vstate.info = info;
1015 vstate.stats = stats;
1016 vstate.callback = callback;
1017 vstate.callback_state = callback_state;
1018 vstate.cycleid = cycleid;
1019
1020 /* Create a temporary memory context to run _bt_pagedel in */
1022 "_bt_pagedel",
1024
1025 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1026 vstate.bufsize = 0;
1027 vstate.maxbufsize = 0;
1028 vstate.pendingpages = NULL;
1029 vstate.npendingpages = 0;
1030 /* Consider applying _bt_pendingfsm_finalize optimization */
1031 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1032
1033 /*
1034 * The outer loop iterates over all index pages except the metapage, in
1035 * physical order (we hope the kernel will cooperate in providing
1036 * read-ahead for speed). It is critical that we visit all leaf pages,
1037 * including ones added after we start the scan, else we might fail to
1038 * delete some deletable tuples. Hence, we must repeatedly check the
1039 * relation length. We must acquire the relation-extension lock while
1040 * doing so to avoid a race condition: if someone else is extending the
1041 * relation, there is a window where bufmgr/smgr have created a new
1042 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1043 * we manage to scan such a page here, we'll improperly assume it can be
1044 * recycled. Taking the lock synchronizes things enough to prevent a
1045 * problem: either num_pages won't include the new page, or _bt_getbuf
1046 * already has write lock on the buffer and it will be fully initialized
1047 * before we can examine it. Also, we need not worry if a page is added
1048 * immediately after we look; the page splitting code already has
1049 * write-lock on the left page before it adds a right page, so we must
1050 * already have processed any tuples due to be moved into such a page.
1051 *
1052 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1053 * think the use of the extension lock is still required.
1054 *
1055 * We can skip locking for new or temp relations, however, since no one
1056 * else could be accessing them.
1057 */
1058 needLock = !RELATION_IS_LOCAL(rel);
1059
1060 scanblkno = BTREE_METAPAGE + 1;
1061 for (;;)
1062 {
1063 /* Get the current relation length */
1064 if (needLock)
1065 LockRelationForExtension(rel, ExclusiveLock);
1066 num_pages = RelationGetNumberOfBlocks(rel);
1067 if (needLock)
1068 UnlockRelationForExtension(rel, ExclusiveLock);
1069
1070 if (info->report_progress)
1071 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1072 num_pages);
1073
1074 /* Quit if we've scanned the whole relation */
1075 if (scanblkno >= num_pages)
1076 break;
1077 /* Iterate over pages, then loop back to recheck length */
1078 for (; scanblkno < num_pages; scanblkno++)
1079 {
1080 btvacuumpage(&vstate, scanblkno);
1081 if (info->report_progress)
1082 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1083 scanblkno);
1084 }
1085 }
1086
1087 /* Set statistics num_pages field to final size of index */
1088 stats->num_pages = num_pages;
1089
1090 MemoryContextDelete(vstate.pagedelcontext);
1091
1092 /*
1093 * If there were any calls to _bt_pagedel() during scan of the index then
1094 * see if any of the resulting pages can be placed in the FSM now. When
1095 * it's not safe we'll have to leave it up to a future VACUUM operation.
1096 *
1097 * Finally, if we placed any pages in the FSM (either just now or during
1098 * the scan), forcibly update the upper-level FSM pages to ensure that
1099 * searchers can find them.
1100 */
1101 _bt_pendingfsm_finalize(rel, &vstate);
1102 if (stats->pages_free > 0)
1103 IndexFreeSpaceMapVacuum(rel);
1104}
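The outer loop's shape, fetch the current relation length, vacuum any not-yet-scanned blocks, and repeat until the length stops growing past scanblkno, can be shown standalone (illustrative only; the real code also takes the relation extension lock while measuring the length):

#include <stdio.h>

/* Stand-in for RelationGetNumberOfBlocks(): the relation may keep growing. */
static unsigned
current_length(void)
{
	static unsigned calls = 0;
	static const unsigned sizes[] = {4, 6, 6};	/* grows once, then stable */

	return sizes[calls < 2 ? calls++ : 2];
}

int
main(void)
{
	unsigned	scanblkno = 1;	/* skip the metapage */

	/* Same shape as btvacuumscan(): rescan the tail until no new pages appear. */
	for (;;)
	{
		unsigned	num_pages = current_length();

		if (scanblkno >= num_pages)
			break;
		for (; scanblkno < num_pages; scanblkno++)
			printf("vacuum block %u\n", scanblkno);
	}
	return 0;
}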
1105
1106/*
1107 * btvacuumpage --- VACUUM one page
1108 *
1109 * This processes a single page for btvacuumscan(). In some cases we must
1110 * backtrack to re-examine and VACUUM pages that were the scanblkno during
1111 * a previous call here. This is how we handle page splits (that happened
1112 * after our cycleid was acquired) whose right half page happened to reuse
1113 * a block that we might have processed at some point before it was
1114 * recycled (i.e. before the page split).
1115 */
1116static void
1117btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1118{
1119 IndexVacuumInfo *info = vstate->info;
1120 IndexBulkDeleteResult *stats = vstate->stats;
1121 IndexBulkDeleteCallback callback = vstate->callback;
1122 void *callback_state = vstate->callback_state;
1123 Relation rel = info->index;
1124 Relation heaprel = info->heaprel;
1125 bool attempt_pagedel;
1126 BlockNumber blkno,
1127 backtrack_to;
1128 Buffer buf;
1129 Page page;
1130 BTPageOpaque opaque;
1131
1132 blkno = scanblkno;
1133
1134backtrack:
1135
1136 attempt_pagedel = false;
1137 backtrack_to = P_NONE;
1138
1139 /* call vacuum_delay_point while not holding any buffer lock */
1140 vacuum_delay_point(false);
1141
1142 /*
1143 * We can't use _bt_getbuf() here because it always applies
1144 * _bt_checkpage(), which will barf on an all-zero page. We want to
1145 * recycle all-zero pages, not fail. Also, we want to use a nondefault
1146 * buffer access strategy.
1147 */
1148 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1149 info->strategy);
1150 _bt_lockbuf(rel, buf, BT_READ);
1151 page = BufferGetPage(buf);
1152 opaque = NULL;
1153 if (!PageIsNew(page))
1154 {
1155 _bt_checkpage(rel, buf);
1156 opaque = BTPageGetOpaque(page);
1157 }
1158
1159 Assert(blkno <= scanblkno);
1160 if (blkno != scanblkno)
1161 {
1162 /*
1163 * We're backtracking.
1164 *
1165 * We followed a right link to a sibling leaf page (a page that
1166 * happens to be from a block located before scanblkno). The only
1167 * case we want to do anything with is a live leaf page having the
1168 * current vacuum cycle ID.
1169 *
1170 * The page had better be in a state that's consistent with what we
1171 * expect. Check for conditions that imply corruption in passing. It
1172 * can't be half-dead because only an interrupted VACUUM process can
1173 * leave pages in that state, so we'd definitely have dealt with it
1174 * back when the page was the scanblkno page (half-dead pages are
1175 * always marked fully deleted by _bt_pagedel(), barring corruption).
1176 */
1177 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1178 {
1179 Assert(false);
1180 ereport(LOG,
1181 (errcode(ERRCODE_INDEX_CORRUPTED),
1182 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1183 blkno, scanblkno, RelationGetRelationName(rel))));
1184 _bt_relbuf(rel, buf);
1185 return;
1186 }
1187
1188 /*
1189 * We may have already processed the page in an earlier call, when the
1190 * page was scanblkno. This happens when the leaf page split occurred
1191 * after the scan began, but before the right sibling page became the
1192 * scanblkno.
1193 *
1194 * Page may also have been deleted by current btvacuumpage() call,
1195 * since _bt_pagedel() sometimes deletes the right sibling page of
1196 * scanblkno in passing (it does so after we decided where to
1197 * backtrack to). We don't need to process this page as a deleted
1198 * page a second time now (in fact, it would be wrong to count it as a
1199 * deleted page in the bulk delete statistics a second time).
1200 */
1201 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1202 {
1203 /* Done with current scanblkno (and all lower split pages) */
1204 _bt_relbuf(rel, buf);
1205 return;
1206 }
1207 }
1208
1209 if (!opaque || BTPageIsRecyclable(page, heaprel))
1210 {
1211 /* Okay to recycle this page (which could be leaf or internal) */
1212 RecordFreeIndexPage(rel, blkno);
1213 stats->pages_deleted++;
1214 stats->pages_free++;
1215 }
1216 else if (P_ISDELETED(opaque))
1217 {
1218 /*
1219 * Already deleted page (which could be leaf or internal). Can't
1220 * recycle yet.
1221 */
1222 stats->pages_deleted++;
1223 }
1224 else if (P_ISHALFDEAD(opaque))
1225 {
1226 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1227 attempt_pagedel = true;
1228
1229 /*
1230 * _bt_pagedel() will increment both pages_newly_deleted and
1231 * pages_deleted stats in all cases (barring corruption)
1232 */
1233 }
1234 else if (P_ISLEAF(opaque))
1235 {
1237 int ndeletable;
1239 int nupdatable;
1240 OffsetNumber offnum,
1241 minoff,
1242 maxoff;
1243 int nhtidsdead,
1244 nhtidslive;
1245
1246 /*
1247 * Trade in the initial read lock for a full cleanup lock on this
1248 * page. We must get such a lock on every leaf page over the course
1249 * of the vacuum scan, whether or not it actually contains any
1250 * deletable tuples --- see nbtree/README.
1251 */
1252 _bt_upgradelockbufcleanup(rel, buf);
1253
1254 /*
1255 * Check whether we need to backtrack to earlier pages. What we are
1256 * concerned about is a page split that happened since we started the
1257 * vacuum scan. If the split moved tuples on the right half of the
1258 * split (i.e. the tuples that sort high) to a block that we already
1259 * passed over, then we might have missed the tuples. We need to
1260 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1261 * or deleting scanblkno page below!)
1262 */
1263 if (vstate->cycleid != 0 &&
1264 opaque->btpo_cycleid == vstate->cycleid &&
1265 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1266 !P_RIGHTMOST(opaque) &&
1267 opaque->btpo_next < scanblkno)
1268 backtrack_to = opaque->btpo_next;
1269
1270 ndeletable = 0;
1271 nupdatable = 0;
1272 minoff = P_FIRSTDATAKEY(opaque);
1273 maxoff = PageGetMaxOffsetNumber(page);
1274 nhtidsdead = 0;
1275 nhtidslive = 0;
1276 if (callback)
1277 {
1278 /* btbulkdelete callback tells us what to delete (or update) */
1279 for (offnum = minoff;
1280 offnum <= maxoff;
1281 offnum = OffsetNumberNext(offnum))
1282 {
1283 IndexTuple itup;
1284
1285 itup = (IndexTuple) PageGetItem(page,
1286 PageGetItemId(page, offnum));
1287
1288 Assert(!BTreeTupleIsPivot(itup));
1289 if (!BTreeTupleIsPosting(itup))
1290 {
1291 /* Regular tuple, standard table TID representation */
1292 if (callback(&itup->t_tid, callback_state))
1293 {
1294 deletable[ndeletable++] = offnum;
1295 nhtidsdead++;
1296 }
1297 else
1298 nhtidslive++;
1299 }
1300 else
1301 {
1302 BTVacuumPosting vacposting;
1303 int nremaining;
1304
1305 /* Posting list tuple */
1306 vacposting = btreevacuumposting(vstate, itup, offnum,
1307 &nremaining);
1308 if (vacposting == NULL)
1309 {
1310 /*
1311 * All table TIDs from the posting tuple remain, so no
1312 * delete or update required
1313 */
1314 Assert(nremaining == BTreeTupleGetNPosting(itup));
1315 }
1316 else if (nremaining > 0)
1317 {
1318
1319 /*
1320 * Store metadata about posting list tuple in
1321 * updatable array for entire page. Existing tuple
1322 * will be updated during the later call to
1323 * _bt_delitems_vacuum().
1324 */
1325 Assert(nremaining < BTreeTupleGetNPosting(itup));
1326 updatable[nupdatable++] = vacposting;
1327 nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1328 }
1329 else
1330 {
1331 /*
1332 * All table TIDs from the posting list must be
1333 * deleted. We'll delete the index tuple completely
1334 * (no update required).
1335 */
1336 Assert(nremaining == 0);
1337 deletable[ndeletable++] = offnum;
1338 nhtidsdead += BTreeTupleGetNPosting(itup);
1339 pfree(vacposting);
1340 }
1341
1342 nhtidslive += nremaining;
1343 }
1344 }
1345 }
1346
1347 /*
1348 * Apply any needed deletes or updates. We issue just one
1349 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1350 */
1351 if (ndeletable > 0 || nupdatable > 0)
1352 {
1353 Assert(nhtidsdead >= ndeletable + nupdatable);
1354 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1355 nupdatable);
1356
1357 stats->tuples_removed += nhtidsdead;
1358 /* must recompute maxoff */
1359 maxoff = PageGetMaxOffsetNumber(page);
1360
1361 /* can't leak memory here */
1362 for (int i = 0; i < nupdatable; i++)
1363 pfree(updatable[i]);
1364 }
1365 else
1366 {
1367 /*
1368 * If the leaf page has been split during this vacuum cycle, it
1369 * seems worth expending a write to clear btpo_cycleid even if we
1370 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1371 * takes care of this.) This ensures we won't process the page
1372 * again.
1373 *
1374 * We treat this like a hint-bit update because there's no need to
1375 * WAL-log it.
1376 */
1377 Assert(nhtidsdead == 0);
1378 if (vstate->cycleid != 0 &&
1379 opaque->btpo_cycleid == vstate->cycleid)
1380 {
1381 opaque->btpo_cycleid = 0;
1382 MarkBufferDirtyHint(buf, true);
1383 }
1384 }
1385
1386 /*
1387 * If the leaf page is now empty, try to delete it; else count the
1388 * live tuples (live table TIDs in posting lists are counted as
1389 * separate live tuples). We don't delete when backtracking, though,
1390 * since that would require teaching _bt_pagedel() about backtracking
1391 * (doesn't seem worth adding more complexity to deal with that).
1392 *
1393 * We don't count the number of live TIDs during cleanup-only calls to
1394 * btvacuumscan (i.e. when callback is not set). We count the number
1395 * of index tuples directly instead. This avoids the expense of
1396 * directly examining all of the tuples on each page. VACUUM will
1397 * treat num_index_tuples as an estimate in cleanup-only case, so it
1398 * doesn't matter that this underestimates num_index_tuples
1399 * significantly in some cases.
1400 */
1401 if (minoff > maxoff)
1402 attempt_pagedel = (blkno == scanblkno);
1403 else if (callback)
1404 stats->num_index_tuples += nhtidslive;
1405 else
1406 stats->num_index_tuples += maxoff - minoff + 1;
1407
1408 Assert(!attempt_pagedel || nhtidslive == 0);
1409 }
1410
1411 if (attempt_pagedel)
1412 {
1413 MemoryContext oldcontext;
1414
1415 /* Run pagedel in a temp context to avoid memory leakage */
1416 MemoryContextReset(vstate->pagedelcontext);
1417 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1418
1419 /*
1420 * _bt_pagedel maintains the bulk delete stats on our behalf;
1421 * pages_newly_deleted and pages_deleted are likely to be incremented
1422 * during call
1423 */
1424 Assert(blkno == scanblkno);
1425 _bt_pagedel(rel, buf, vstate);
1426
1427 MemoryContextSwitchTo(oldcontext);
1428 /* pagedel released buffer, so we shouldn't */
1429 }
1430 else
1431 _bt_relbuf(rel, buf);
1432
1433 if (backtrack_to != P_NONE)
1434 {
1435 blkno = backtrack_to;
1436 goto backtrack;
1437 }
1438}
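The backtracking trigger is the narrow test near the top of the leaf-page branch: the page was split during our own vacuum cycle, it is not the end of its split group or the rightmost page, and its right sibling lives on a block number we have already passed. A standalone sketch of that decision (illustrative, with simplified flags):

#include <stdbool.h>
#include <stdio.h>

typedef struct PageOpaque
{
	unsigned	btpo_next;		/* right sibling block */
	unsigned	btpo_cycleid;	/* vacuum cycle that split this page, or 0 */
	bool		split_end;		/* rightmost page of its split group */
	bool		rightmost;
} PageOpaque;

/*
 * Mirror of btvacuumpage()'s test: only backtrack when this leaf was split
 * during our own cycle and its right half landed on a block we already passed.
 */
static bool
needs_backtrack(const PageOpaque *o, unsigned scanblkno, unsigned cycleid)
{
	return cycleid != 0 &&
		o->btpo_cycleid == cycleid &&
		!o->split_end &&
		!o->rightmost &&
		o->btpo_next < scanblkno;
}

int
main(void)
{
	PageOpaque	o = {.btpo_next = 7, .btpo_cycleid = 42,
					 .split_end = false, .rightmost = false};

	printf("%d\n", needs_backtrack(&o, 20, 42));	/* 1: block 7 already passed */
	printf("%d\n", needs_backtrack(&o, 5, 42));		/* 0: sibling still ahead */
	return 0;
}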
1439
1440/*
1441 * btreevacuumposting --- determine TIDs still needed in posting list
1442 *
1443 * Returns metadata describing how to build replacement tuple without the TIDs
1444 * that VACUUM needs to delete. Returned value is NULL in the common case
1445 * where no changes are needed to caller's posting list tuple (we avoid
1446 * allocating memory here as an optimization).
1447 *
1448 * The number of TIDs that should remain in the posting list tuple is set for
1449 * caller in *nremaining.
1450 */
1451static BTVacuumPosting
1452btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1453 OffsetNumber updatedoffset, int *nremaining)
1454{
1455 int live = 0;
1456 int nitem = BTreeTupleGetNPosting(posting);
1457 ItemPointer items = BTreeTupleGetPosting(posting);
1458 BTVacuumPosting vacposting = NULL;
1459
1460 for (int i = 0; i < nitem; i++)
1461 {
1462 if (!vstate->callback(items + i, vstate->callback_state))
1463 {
1464 /* Live table TID */
1465 live++;
1466 }
1467 else if (vacposting == NULL)
1468 {
1469 /*
1470 * First dead table TID encountered.
1471 *
1472 * It's now clear that we need to delete one or more dead table
1473 * TIDs, so start maintaining metadata describing how to update
1474 * existing posting list tuple.
1475 */
1476 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1477 nitem * sizeof(uint16));
1478
1479 vacposting->itup = posting;
1480 vacposting->updatedoffset = updatedoffset;
1481 vacposting->ndeletedtids = 0;
1482 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1483 }
1484 else
1485 {
1486 /* Second or subsequent dead table TID */
1487 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1488 }
1489 }
1490
1491 *nremaining = live;
1492 return vacposting;
1493}
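btreevacuumposting() walks the posting list once, counting survivors and recording the array offsets of dead TIDs so the caller can build a replacement tuple. A standalone sketch of that single pass (illustrative, with a toy callback):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the VACUUM callback: TIDs divisible by 3 are dead. */
static bool
is_dead(int tid)
{
	return tid % 3 == 0;
}

int
main(void)
{
	int			tids[] = {10, 12, 14, 15, 17};
	int			nitem = 5;
	int			deletetids[5];
	int			ndeleted = 0;
	int			live = 0;

	/* Same shape as btreevacuumposting(): count survivors, record dead slots. */
	for (int i = 0; i < nitem; i++)
	{
		if (!is_dead(tids[i]))
			live++;
		else
			deletetids[ndeleted++] = i;
	}

	printf("live=%d dead offsets:", live);
	for (int i = 0; i < ndeleted; i++)
		printf(" %d", deletetids[i]);
	printf("\n");
	return 0;
}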
1494
1495/*
1496 * btcanreturn() -- Check whether btree indexes support index-only scans.
1497 *
1498 * btrees always do, so this is trivial.
1499 */
1500bool
1501btcanreturn(Relation index, int attno)
1502{
1503 return true;
1504}
1505
1506/*
1507 * btgettreeheight() -- Compute tree height for use by btcostestimate().
1508 */
1509int
1510btgettreeheight(Relation rel)
1511{
1512 return _bt_getrootheight(rel);
1513}
1514
1515CompareType
1516bttranslatestrategy(StrategyNumber strategy, Oid opfamily, Oid opcintype)
1517{
1518 switch (strategy)
1519 {
1520 case BTLessStrategyNumber:
1521 return COMPARE_LT;
1522 case BTLessEqualStrategyNumber:
1523 return COMPARE_LE;
1524 case BTEqualStrategyNumber:
1525 return COMPARE_EQ;
1526 case BTGreaterEqualStrategyNumber:
1527 return COMPARE_GE;
1528 case BTGreaterStrategyNumber:
1529 return COMPARE_GT;
1530 default:
1531 return COMPARE_INVALID;
1532 }
1533}
1534
1535StrategyNumber
1536bttranslatecmptype(CompareType cmptype, Oid opfamily, Oid opcintype)
1537{
1538 switch (cmptype)
1539 {
1540 case COMPARE_LT:
1541 return BTLessStrategyNumber;
1542 case COMPARE_LE:
1543 return BTLessEqualStrategyNumber;
1544 case COMPARE_EQ:
1545 return BTEqualStrategyNumber;
1546 case COMPARE_GE:
1547 return BTGreaterEqualStrategyNumber;
1548 case COMPARE_GT:
1549 return BTGreaterStrategyNumber;
1550 default:
1551 return InvalidStrategy;
1552 }
1553}
void pgstat_progress_update_param(int index, int64 val)
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static Datum values[MAXATTR]
Definition: bootstrap.c:151
int Buffer
Definition: buf.h:23
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4895
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:4927
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:793
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:396
@ RBM_NORMAL
Definition: bufmgr.h:45
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
PageData * Page
Definition: bufpage.h:82
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
#define OffsetToPointer(base, offset)
Definition: c.h:729
#define Assert(condition)
Definition: c.h:815
int64_t int64
Definition: c.h:485
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:420
uint16_t uint16
Definition: c.h:487
size_t Size
Definition: c.h:562
CompareType
Definition: cmptype.h:32
@ COMPARE_LE
Definition: cmptype.h:35
@ COMPARE_INVALID
Definition: cmptype.h:33
@ COMPARE_GT
Definition: cmptype.h:38
@ COMPARE_EQ
Definition: cmptype.h:36
@ COMPARE_GE
Definition: cmptype.h:37
@ COMPARE_LT
Definition: cmptype.h:34
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void ConditionVariableSignal(ConditionVariable *cv)
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:853
#define LOG
Definition: elog.h:31
#define ereport(elevel,...)
Definition: elog.h:149
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
#define PG_FUNCTION_ARGS
Definition: fmgr.h:193
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:89
IndexUniqueCheck
Definition: genam.h:118
void IndexFreeSpaceMapVacuum(Relation rel)
Definition: indexfsm.c:71
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
IndexTupleData * IndexTuple
Definition: itup.h:53
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:419
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:469
#define ExclusiveLock
Definition: lockdefs.h:42
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:102
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1802
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1154
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1109
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:2996
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2954
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:827
bool btcanreturn(Relation index, int attno)
Definition: nbtree.c:1501
BTPS_State
Definition: nbtree.c:53
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:56
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:55
@ BTPARALLEL_NOT_INITIALIZED
Definition: nbtree.c:54
@ BTPARALLEL_IDLE
Definition: nbtree.c:57
@ BTPARALLEL_DONE
Definition: nbtree.c:58
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:608
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:315
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily, Oid opcintype)
Definition: nbtree.c:1536
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:777
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: nbtree.c:895
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition: nbtree.c:1452
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: nbtree.c:209
void btparallelrescan(IndexScanDesc scan)
Definition: nbtree.c:565
struct BTParallelScanDescData BTParallelScanDescData
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: nbtree.c:185
void btbuildempty(Relation index)
Definition: nbtree.c:162
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
Definition: nbtree.c:1117
int btgettreeheight(Relation rel)
Definition: nbtree.c:1510
void btinitparallelscan(void *target)
Definition: nbtree.c:550
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: nbtree.c:865
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:983
Size btestimateparallelscan(int nkeys, int norderbys)
Definition: nbtree.c:540
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily, Oid opcintype)
Definition: nbtree.c:1516
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: nbtree.c:269
void btmarkpos(IndexScanDesc scan)
Definition: nbtree.c:456
void btendscan(IndexScanDesc scan)
Definition: nbtree.c:420
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: nbtree.c:363
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:83
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:750
Datum bthandler(PG_FUNCTION_ARGS)
Definition: nbtree.c:101
void btrestrpos(IndexScanDesc scan)
Definition: nbtree.c:482
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:224
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:998
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:537
#define BTNProcs
Definition: nbtree.h:717
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1015
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
uint16 BTCycleId
Definition: nbtree.h:29
#define P_NONE
Definition: nbtree.h:212
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_READ
Definition: nbtree.h:724
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:291
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1021
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1009
#define BTP_SPLIT_END
Definition: nbtree.h:81
#define BTOPTIONS_PROC
Definition: nbtree.h:716
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1078
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1448
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:295
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2641
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:2669
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:2333
bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:743
char * btbuildphasename(int64 phasenum)
Definition: nbtutils.c:2765
bytea * btoptions(Datum reloptions, bool validate)
Definition: nbtutils.c:2719
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition: nbtutils.c:2742
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:3297
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:413
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:2584
bool btvalidate(Oid opclassoid)
Definition: nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: nbtvalidate.c:284
#define makeNode(_type_)
Definition: nodes.h:155
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:125
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:124
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:655
#define RelationGetDescr(relation)
Definition: rel.h:538
#define RelationGetRelationName(relation)
Definition: rel.h:546
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
ScanDirection
Definition: sdir.h:25
@ ForwardScanDirection
Definition: sdir.h:28
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:6822
ScanKeyData * ScanKey
Definition: skey.h:75
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTMaxStrategyNumber
Definition: stratnum.h:35
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
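The btree strategy numbers above are what callers place in a ScanKey to say which comparison they want. A minimal hypothetical sketch, assuming an int4 first index column; the attribute number, the F_INT4EQ procedure and the constant 42 are invented for illustration.

#include "postgres.h"
#include "access/skey.h"
#include "access/stratnum.h"
#include "utils/fmgroids.h"

/* Hypothetical sketch: build an equality key on the first index attribute. */
static void
init_equality_key_sketch(ScanKey skey)
{
    ScanKeyInit(skey,
                1,                      /* first index attribute */
                BTEqualStrategyNumber,  /* "=" in every btree operator class */
                F_INT4EQ,               /* equality proc for an int4 column */
                Int32GetDatum(42));
}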
Datum * elem_values
Definition: nbtree.h:1033
BlockNumber btpo_next
Definition: nbtree.h:65
uint16 btpo_flags
Definition: nbtree.h:67
BTCycleId btpo_cycleid
Definition: nbtree.h:68
slock_t btps_mutex
Definition: nbtree.c:73
BTPS_State btps_pageStatus
Definition: nbtree.c:70
BlockNumber btps_lastCurrPage
Definition: nbtree.c:68
ConditionVariable btps_cv
Definition: nbtree.c:74
BlockNumber btps_nextScanPage
Definition: nbtree.c:67
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.c:80
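The shared fields above are read and written only while holding btps_mutex, with btps_cv used to wait for state changes. The fragment below is a deliberately simplified, hypothetical sketch of that pattern: it ignores the BTPARALLEL_DONE and BTPARALLEL_NEED_PRIMSCAN cases, assumes the WAIT_EVENT_BTREE_PAGE wait event, and only makes sense inside nbtree.c, where BTParallelScanDescData is defined.

/* Hypothetical sketch: become the backend that advances the parallel scan. */
static void
seize_scan_sketch(BTParallelScanDesc btscan)
{
    bool        advancing = false;

    while (!advancing)
    {
        SpinLockAcquire(&btscan->btps_mutex);
        if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /* nobody else is advancing; this backend takes over */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            advancing = true;
        }
        SpinLockRelease(&btscan->btps_mutex);

        if (!advancing)
            ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* this backend now owns btps_nextScanPage/btps_lastCurrPage until release */
}

Note that the spinlock is held only for the brief status check; sleeping on the condition variable always happens with the mutex released.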
bool needPrimScan
Definition: nbtree.h:1045
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1048
char * markTuples
Definition: nbtree.h:1062
FmgrInfo * orderProcs
Definition: nbtree.h:1049
BTScanPosData currPos
Definition: nbtree.h:1074
int * killedItems
Definition: nbtree.h:1053
char * currTuples
Definition: nbtree.h:1061
bool oppositeDirCheck
Definition: nbtree.h:1047
BTScanPosData markPos
Definition: nbtree.h:1075
ScanKey keyData
Definition: nbtree.h:1041
MemoryContext arrayContext
Definition: nbtree.h:1050
bool moreRight
Definition: nbtree.h:980
Buffer buf
Definition: nbtree.h:958
int nextTupleOffset
Definition: nbtree.h:973
bool moreLeft
Definition: nbtree.h:979
int lastItem
Definition: nbtree.h:990
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:993
int itemIndex
Definition: nbtree.h:991
ScanDirection dir
Definition: nbtree.h:967
ItemPointerData heapTid
Definition: nbtree.h:951
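The itemIndex/lastItem window and the per-item heapTid above describe the TIDs a scan has saved from the current page. Below is a hypothetical sketch of a getbitmap-style consumer of that window, using tbm_add_tuples() (whose signature appears further down); it shows the general shape only, not the exact btgetbitmap() code.

#include "postgres.h"
#include "access/nbtree.h"
#include "nodes/tidbitmap.h"

/* Hypothetical sketch: push the saved TIDs of one scan position into a bitmap. */
static int64
drain_pos_sketch(BTScanPos pos, TIDBitmap *tbm)
{
    int64       ntids = 0;

    for (int i = pos->itemIndex; i <= pos->lastItem; i++)
    {
        /* btree matches are exact, so no recheck is requested */
        tbm_add_tuples(tbm, &pos->items[i].heapTid, 1, false);
        ntids++;
    }
    return ntids;
}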
IndexBulkDeleteResult * stats
Definition: nbtree.h:333
BTCycleId cycleid
Definition: nbtree.h:336
BTPendingFSM * pendingpages
Definition: nbtree.h:344
int npendingpages
Definition: nbtree.h:345
IndexBulkDeleteCallback callback
Definition: nbtree.h:334
MemoryContext pagedelcontext
Definition: nbtree.h:337
IndexVacuumInfo * info
Definition: nbtree.h:332
int bufsize
Definition: nbtree.h:342
int maxbufsize
Definition: nbtree.h:343
void * callback_state
Definition: nbtree.h:335
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:916
uint16 ndeletedtids
Definition: nbtree.h:915
IndexTuple itup
Definition: nbtree.h:911
OffsetNumber updatedoffset
Definition: nbtree.h:912
ambuildphasename_function ambuildphasename
Definition: amapi.h:297
ambuildempty_function ambuildempty
Definition: amapi.h:287
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:291
bool amclusterable
Definition: amapi.h:261
amoptions_function amoptions
Definition: amapi.h:295
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:309
amrestrpos_function amrestrpos
Definition: amapi.h:306
aminsert_function aminsert
Definition: amapi.h:288
amendscan_function amendscan
Definition: amapi.h:304
amtranslate_strategy_function amtranslatestrategy
Definition: amapi.h:314
uint16 amoptsprocnum
Definition: amapi.h:241
amparallelrescan_function amparallelrescan
Definition: amapi.h:311
Oid amkeytype
Definition: amapi.h:277
bool ampredlocks
Definition: amapi.h:263
uint16 amsupport
Definition: amapi.h:239
amtranslate_cmptype_function amtranslatecmptype
Definition: amapi.h:315
amcostestimate_function amcostestimate
Definition: amapi.h:293
bool amcanorderbyop
Definition: amapi.h:245
amadjustmembers_function amadjustmembers
Definition: amapi.h:299
ambuild_function ambuild
Definition: amapi.h:286
bool amstorage
Definition: amapi.h:259
uint16 amstrategies
Definition: amapi.h:237
bool amoptionalkey
Definition: amapi.h:253
amgettuple_function amgettuple
Definition: amapi.h:302
amcanreturn_function amcanreturn
Definition: amapi.h:292
bool amcanunique
Definition: amapi.h:249
amgetbitmap_function amgetbitmap
Definition: amapi.h:303
amproperty_function amproperty
Definition: amapi.h:296
ambulkdelete_function ambulkdelete
Definition: amapi.h:290
bool amsearcharray
Definition: amapi.h:255
bool amsummarizing
Definition: amapi.h:273
amvalidate_function amvalidate
Definition: amapi.h:298
ammarkpos_function ammarkpos
Definition: amapi.h:305
bool amcanmulticol
Definition: amapi.h:251
bool amusemaintenanceworkmem
Definition: amapi.h:271
ambeginscan_function ambeginscan
Definition: amapi.h:300
bool amcanparallel
Definition: amapi.h:265
amrescan_function amrescan
Definition: amapi.h:301
bool amcanorder
Definition: amapi.h:243
bool amcanbuildparallel
Definition: amapi.h:267
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:310
uint8 amparallelvacuumoptions
Definition: amapi.h:275
aminsertcleanup_function aminsertcleanup
Definition: amapi.h:289
bool amcanbackward
Definition: amapi.h:247
amgettreeheight_function amgettreeheight
Definition: amapi.h:294
bool amcaninclude
Definition: amapi.h:269
bool amsearchnulls
Definition: amapi.h:257
bool estimated_count
Definition: genam.h:80
BlockNumber pages_deleted
Definition: genam.h:84
BlockNumber pages_free
Definition: genam.h:85
BlockNumber num_pages
Definition: genam.h:79
double tuples_removed
Definition: genam.h:82
double num_index_tuples
Definition: genam.h:81
struct ScanKeyData * keyData
Definition: relscan.h:139
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:183
bool kill_prior_tuple
Definition: relscan.h:145
struct TupleDescData * xs_itupdesc
Definition: relscan.h:160
ItemPointerData xs_heaptid
Definition: relscan.h:164
ItemPointerData t_tid
Definition: itup.h:37
Relation index
Definition: genam.h:48
double num_heap_tuples
Definition: genam.h:54
bool analyze_only
Definition: genam.h:50
BufferAccessStrategy strategy
Definition: genam.h:55
Relation heaprel
Definition: genam.h:49
bool report_progress
Definition: genam.h:51
bool estimated_count
Definition: genam.h:52
Datum sk_argument
Definition: skey.h:72
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:377
void vacuum_delay_point(bool is_analyze)
Definition: vacuum.c:2387
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:48
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition: vacuum.h:55