PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtree.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/read_stream.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"
Include dependency graph for nbtree.c:

Go to the source code of this file.

Data Structures

struct  BTParallelScanDescData
 

Typedefs

typedef struct BTParallelScanDescData BTParallelScanDescData
 
typedef struct BTParallelScanDescData *BTParallelScanDesc
 

Enumerations

enum  BTPS_State {
  BTPARALLEL_NOT_INITIALIZED , BTPARALLEL_NEED_PRIMSCAN , BTPARALLEL_ADVANCING , BTPARALLEL_IDLE ,
  BTPARALLEL_DONE
}
 

Functions

static bool _bt_start_prim_scan (IndexScanDesc scan)
 
static void _bt_parallel_serialize_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void _bt_parallel_restore_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
 
static BlockNumber btvacuumpage (BTVacState *vstate, Buffer buf)
 
static BTVacuumPosting btreevacuumposting (BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
 
Datum bthandler (PG_FUNCTION_ARGS)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
void btparallelrescan (IndexScanDesc scan)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 

Typedef Documentation

◆ BTParallelScanDesc

Definition at line 95 of file nbtree.c.

◆ BTParallelScanDescData

Enumeration Type Documentation

◆ BTPS_State

Enumerator
BTPARALLEL_NOT_INITIALIZED 
BTPARALLEL_NEED_PRIMSCAN 
BTPARALLEL_ADVANCING 
BTPARALLEL_IDLE 
BTPARALLEL_DONE 

Definition at line 56 of file nbtree.c.

57{
BTPS_State
Definition nbtree.c:57
@ BTPARALLEL_ADVANCING
Definition nbtree.c:60
@ BTPARALLEL_NEED_PRIMSCAN
Definition nbtree.c:59
@ BTPARALLEL_NOT_INITIALIZED
Definition nbtree.c:58
@ BTPARALLEL_IDLE
Definition nbtree.c:61
@ BTPARALLEL_DONE
Definition nbtree.c:62

Function Documentation

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 1038 of file nbtree.c.

1039{
1041 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1043 bool status_changed = false;
1044
1045 Assert(!BTScanPosIsValid(so->currPos));
1046
1047 /* Do nothing, for non-parallel scans */
1048 if (parallel_scan == NULL)
1049 return;
1050
1051 /*
1052 * Should not mark parallel scan done when there's still a pending
1053 * primitive index scan
1054 */
1055 if (so->needPrimScan)
1056 return;
1057
1058 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1059 parallel_scan->ps_offset_am);
1060
1061 /*
1062 * Mark the parallel scan as done, unless some other process did so
1063 * already
1064 */
1065 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1066 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1067 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1068 {
1069 btscan->btps_pageStatus = BTPARALLEL_DONE;
1070 status_changed = true;
1071 }
1072 LWLockRelease(&btscan->btps_lock);
1073
1074 /* wake up all the workers associated with this parallel scan */
1075 if (status_changed)
1077}
#define OffsetToPointer(base, offset)
Definition c.h:855
#define Assert(condition)
Definition c.h:943
void ConditionVariableBroadcast(ConditionVariable *cv)
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_EXCLUSIVE
Definition lwlock.h:104
struct BTParallelScanDescData * BTParallelScanDesc
Definition nbtree.c:95
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
static int fb(int x)
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:204

References Assert, BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), fb(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 1088 of file nbtree.c.

1089{
1090 Relation rel = scan->indexRelation;
1092 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1094
1095 Assert(so->numArrayKeys);
1096
1097 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1098 parallel_scan->ps_offset_am);
1099
1100 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1101 if (btscan->btps_lastCurrPage == curr_page &&
1102 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1103 {
1104 btscan->btps_nextScanPage = InvalidBlockNumber;
1105 btscan->btps_lastCurrPage = InvalidBlockNumber;
1106 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1107
1108 /* Serialize scan's current array keys */
1110 }
1111 LWLockRelease(&btscan->btps_lock);
1112}
#define InvalidBlockNumber
Definition block.h:33
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:720
Relation indexRelation
Definition relscan.h:150

References _bt_parallel_serialize_arrays(), Assert, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, fb(), IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 1011 of file nbtree.c.

1013{
1014 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1016
1018
1019 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1020 parallel_scan->ps_offset_am);
1021
1022 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1023 btscan->btps_nextScanPage = next_scan_page;
1024 btscan->btps_lastCurrPage = curr_page;
1025 btscan->btps_pageStatus = BTPARALLEL_IDLE;
1026 LWLockRelease(&btscan->btps_lock);
1028}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
void ConditionVariableSignal(ConditionVariable *cv)

References Assert, BlockNumberIsValid(), BTPARALLEL_IDLE, ConditionVariableSignal(), fb(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_restore_arrays()

static void _bt_parallel_restore_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 763 of file nbtree.c.

765{
766 char *datumshared;
767
768 /* Space for serialized datums begins immediately after btps_arrElems[] */
769 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
770 for (int i = 0; i < so->numArrayKeys; i++)
771 {
772 BTArrayKeyInfo *array = &so->arrayKeys[i];
773 ScanKey skey = &so->keyData[array->scan_key];
774 bool isnull;
775
776 if (array->num_elems != -1)
777 {
778 /* Restore SAOP array using its saved cur_elem */
779 Assert(!(skey->sk_flags & SK_BT_SKIP));
780 array->cur_elem = btscan->btps_arrElems[i];
781 skey->sk_argument = array->elem_values[array->cur_elem];
782 continue;
783 }
784
785 /* Restore skip array by restoring its key directly */
786 if (!array->attbyval && skey->sk_argument)
787 pfree(DatumGetPointer(skey->sk_argument));
788 skey->sk_argument = (Datum) 0;
789 memcpy(&skey->sk_flags, datumshared, sizeof(int));
790 datumshared += sizeof(int);
791
792 Assert(skey->sk_flags & SK_BT_SKIP);
793
794 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
795 {
796 /* No sk_argument datum to restore */
797 continue;
798 }
799
800 skey->sk_argument = datumRestore(&datumshared, &isnull);
801 if (isnull)
802 {
803 Assert(skey->sk_argument == 0);
804 Assert(skey->sk_flags & SK_SEARCHNULL);
805 Assert(skey->sk_flags & SK_ISNULL);
806 }
807 }
808}
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
Datum datumRestore(char **start_address, bool *isnull)
Definition datum.c:559
int i
Definition isn.c:77
void pfree(void *pointer)
Definition mcxt.c:1616
#define SK_BT_SKIP
Definition nbtree.h:1106
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define SK_BT_MINVAL
Definition nbtree.h:1109
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define SK_SEARCHNULL
Definition skey.h:121
#define SK_ISNULL
Definition skey.h:115
Datum * elem_values
Definition nbtree.h:1041

References Assert, BTArrayKeyInfo::attbyval, BTArrayKeyInfo::cur_elem, DatumGetPointer(), datumRestore(), BTArrayKeyInfo::elem_values, fb(), i, memcpy(), BTArrayKeyInfo::num_elems, pfree(), BTArrayKeyInfo::scan_key, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, SK_ISNULL, and SK_SEARCHNULL.

Referenced by _bt_parallel_seize().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber *next_scan_page,
BlockNumber *last_curr_page,
bool  first 
)

Definition at line 873 of file nbtree.c.

875{
876 Relation rel = scan->indexRelation;
878 bool exit_loop = false,
879 status = true,
880 endscan = false;
881 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
883
886
887 /*
888 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
889 * call to _bt_readnextpage treats this backend similarly to a serial
890 * backend that steps from *last_curr_page to *next_scan_page (unless this
891 * backend's so->currPos is initialized by _bt_readfirstpage before then).
892 */
893 BTScanPosInvalidate(so->currPos);
894 so->currPos.moreLeft = so->currPos.moreRight = true;
895
896 if (first)
897 {
898 /*
899 * Initialize array related state when called from _bt_first, assuming
900 * that this will be the first primitive index scan for the scan
901 */
902 so->needPrimScan = false;
903 so->scanBehind = false;
904 so->oppositeDirCheck = false;
905 }
906 else
907 {
908 /*
909 * Don't attempt to seize the scan when it requires another primitive
910 * index scan, since caller's backend cannot start it right now
911 */
912 if (so->needPrimScan)
913 return false;
914 }
915
916 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
917 parallel_scan->ps_offset_am);
918
919 while (1)
920 {
921 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
922
923 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
924 {
925 /* We're done with this parallel index scan */
926 status = false;
927 }
928 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
929 btscan->btps_nextScanPage == P_NONE)
930 {
931 /* End this parallel index scan */
932 status = false;
933 endscan = true;
934 }
935 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
936 {
937 Assert(so->numArrayKeys);
938
939 if (first)
940 {
941 /* Can start scheduled primitive scan right away, so do so */
942 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
943
944 /* Restore scan's array keys from serialized values */
946 exit_loop = true;
947 }
948 else
949 {
950 /*
951 * Don't attempt to seize the scan when it requires another
952 * primitive index scan, since caller's backend cannot start
953 * it right now
954 */
955 status = false;
956 }
957
958 /*
959 * Either way, update backend local state to indicate that a
960 * pending primitive scan is required
961 */
962 so->needPrimScan = true;
963 so->scanBehind = false;
964 so->oppositeDirCheck = false;
965 }
966 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
967 {
968 /*
969 * We have successfully seized control of the scan for the purpose
970 * of advancing it to a new page!
971 */
972 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
973 Assert(btscan->btps_nextScanPage != P_NONE);
974 *next_scan_page = btscan->btps_nextScanPage;
975 *last_curr_page = btscan->btps_lastCurrPage;
976 exit_loop = true;
977 }
978 LWLockRelease(&btscan->btps_lock);
979 if (exit_loop || !status)
980 break;
982 }
984
985 /* When the scan has reached the rightmost (or leftmost) page, end it */
986 if (endscan)
987 _bt_parallel_done(scan);
988
989 return status;
990}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1038
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:763
#define P_NONE
Definition nbtree.h:213
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert, BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), fb(), IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_first(), and _bt_readnextpage().

◆ _bt_parallel_serialize_arrays()

static void _bt_parallel_serialize_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 720 of file nbtree.c.

722{
723 char *datumshared;
724
725 /* Space for serialized datums begins immediately after btps_arrElems[] */
726 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
727 for (int i = 0; i < so->numArrayKeys; i++)
728 {
729 BTArrayKeyInfo *array = &so->arrayKeys[i];
730 ScanKey skey = &so->keyData[array->scan_key];
731
732 if (array->num_elems != -1)
733 {
734 /* Save SAOP array's cur_elem (no need to copy key/datum) */
735 Assert(!(skey->sk_flags & SK_BT_SKIP));
736 btscan->btps_arrElems[i] = array->cur_elem;
737 continue;
738 }
739
740 /* Save all mutable state associated with skip array's key */
741 Assert(skey->sk_flags & SK_BT_SKIP);
742 memcpy(datumshared, &skey->sk_flags, sizeof(int));
743 datumshared += sizeof(int);
744
745 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
746 {
747 /* No sk_argument datum to serialize */
748 Assert(skey->sk_argument == 0);
749 continue;
750 }
751
752 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
753 array->attbyval, array->attlen, &datumshared);
754 }
755}
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition datum.c:497

References Assert, BTArrayKeyInfo::attbyval, BTArrayKeyInfo::attlen, BTArrayKeyInfo::cur_elem, datumSerialize(), fb(), i, memcpy(), BTArrayKeyInfo::num_elems, BTArrayKeyInfo::scan_key, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, and SK_ISNULL.

Referenced by _bt_parallel_primscan_schedule().

◆ _bt_start_prim_scan()

static bool _bt_start_prim_scan ( IndexScanDesc  scan)
static

Definition at line 656 of file nbtree.c.

657{
659
660 Assert(so->numArrayKeys);
661
662 so->scanBehind = so->oppositeDirCheck = false; /* reset */
663
664 /*
665 * Array keys are advanced within _bt_checkkeys when the scan reaches the
666 * leaf level (more precisely, they're advanced when the scan reaches the
667 * end of each distinct set of array elements). This process avoids
668 * repeat access to leaf pages (across multiple primitive index scans) by
669 * advancing the scan's array keys when it allows the primitive index scan
670 * to find nearby matching tuples (or when it eliminates ranges of array
671 * key space that can't possibly be satisfied by any index tuple).
672 *
673 * _bt_checkkeys sets a simple flag variable to schedule another primitive
674 * index scan. The flag tells us what to do.
675 *
676 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
677 * various cases where that won't happen. For example, if the index is
678 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
679 * We also don't expect a call to _bt_checkkeys during searches for a
680 * non-existent value that happens to be lower/higher than any existing
681 * value in the index.
682 *
683 * We don't require special handling for these cases -- we don't need to
684 * be explicitly instructed to _not_ perform another primitive index scan.
685 * It's up to code under the control of _bt_first to always set the flag
686 * when another primitive index scan will be required.
687 *
688 * This works correctly, even with the tricky cases listed above, which
689 * all involve access to leaf pages "near the boundaries of the key space"
690 * (whether it's from a leftmost/rightmost page, or an imaginary empty
691 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
692 * index scan for one set of array keys, then it also won't be reached for
693 * any later set ("later" in terms of the direction that we scan the index
694 * and advance the arrays). The array keys won't have advanced in these
695 * cases, but that's the correct behavior (even _bt_advance_array_keys
696 * won't always advance the arrays at the point they become "exhausted").
697 */
698 if (so->needPrimScan)
699 {
700 /*
701 * Flag was set -- must call _bt_first again, which will reset the
702 * scan's needPrimScan flag
703 */
704 return true;
705 }
706
707 /* The top-level index scan ran out of tuples in this scan direction */
708 if (scan->parallel_scan != NULL)
709 _bt_parallel_done(scan);
710
711 return false;
712}

References _bt_parallel_done(), Assert, fb(), IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by btgetbitmap(), and btgettuple().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 339 of file nbtree.c.

340{
341 IndexScanDesc scan;
343
344 /* no order by operators allowed */
345 Assert(norderbys == 0);
346
347 /* get the scan */
348 scan = RelationGetIndexScan(rel, nkeys, norderbys);
349
350 /* allocate private workspace */
352 BTScanPosInvalidate(so->currPos);
353 BTScanPosInvalidate(so->markPos);
354 if (scan->numberOfKeys > 0)
355 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
356 else
357 so->keyData = NULL;
358
359 so->skipScan = false;
360 so->needPrimScan = false;
361 so->scanBehind = false;
362 so->oppositeDirCheck = false;
363 so->arrayKeys = NULL;
364 so->orderProcs = NULL;
365 so->arrayContext = NULL;
366
367 so->killedItems = NULL; /* until needed */
368 so->numKilled = 0;
369
370 /*
371 * We don't know yet whether the scan will be index-only, so we do not
372 * allocate the tuple workspace arrays until btrescan. However, we set up
373 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374 */
375 so->currTuples = so->markTuples = NULL;
376
377 scan->xs_itupdesc = RelationGetDescr(rel);
378
379 scan->opaque = so;
380
381 return scan;
382}
#define palloc_object(type)
Definition fe_memutils.h:74
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
void * palloc(Size size)
Definition mcxt.c:1387
#define RelationGetDescr(relation)
Definition rel.h:542
ScanKeyData * ScanKey
Definition skey.h:75
struct TupleDescData * xs_itupdesc
Definition relscan.h:181

References Assert, BTScanPosInvalidate, fb(), IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), palloc_object, RelationGetDescr, RelationGetIndexScan(), and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 183 of file nbtree.c.

184{
185 bool allequalimage = _bt_allequalimage(index, false);
186 BulkWriteState *bulkstate;
188
190
191 /* Construct metapage. */
192 metabuf = smgr_bulk_get_buf(bulkstate);
193 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
194 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
195
196 smgr_bulk_finish(bulkstate);
197}
PageData * Page
Definition bufpage.h:81
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
#define BTREE_METAPAGE
Definition nbtree.h:149
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1175
@ INIT_FORKNUM
Definition relpath.h:61
Definition type.h:96

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, fb(), INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 1122 of file nbtree.c.

1124{
1125 Relation rel = info->index;
1126 BTCycleId cycleid;
1127
1128 /* allocate stats if first time through, else re-use existing struct */
1129 if (stats == NULL)
1131
1132 /* Establish the vacuum cycle ID to use for this scan */
1133 /* The ENSURE stuff ensures we clean up shared memory on failure */
1135 {
1136 cycleid = _bt_start_vacuum(rel);
1137
1138 btvacuumscan(info, stats, callback, callback_state, cycleid);
1139 }
1141 _bt_end_vacuum(rel);
1142
1143 return stats;
1144}
#define palloc0_object(type)
Definition fe_memutils.h:75
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:52
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition nbtree.c:1240
uint16 BTCycleId
Definition nbtree.h:30
void _bt_end_vacuum(Relation rel)
Definition nbtutils.c:530
void _bt_end_vacuum_callback(int code, Datum arg)
Definition nbtutils.c:558
BTCycleId _bt_start_vacuum(Relation rel)
Definition nbtutils.c:473
#define PointerGetDatum(X)
Definition postgres.h:354
Relation index
Definition genam.h:54
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), fb(), IndexVacuumInfo::index, palloc0_object, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum.

Referenced by bthandler().

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1802 of file nbtree.c.

1803{
1804 return true;
1805}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 455 of file nbtree.c.

456{
458
459 /* we aren't holding any read locks, but gotta drop the pins */
460 if (BTScanPosIsValid(so->currPos))
461 {
462 /* Before leaving current page, deal with any killed items */
463 if (so->numKilled > 0)
464 _bt_killitems(scan);
465 BTScanPosUnpinIfPinned(so->currPos);
466 }
467
468 so->markItemIndex = -1;
469 BTScanPosUnpinIfPinned(so->markPos);
470
471 /* No need to invalidate positions, the RAM is about to be freed. */
472
473 /* Release storage */
474 if (so->keyData != NULL)
475 pfree(so->keyData);
476 /* so->arrayKeys and so->orderProcs are in arrayContext */
477 if (so->arrayContext != NULL)
478 MemoryContextDelete(so->arrayContext);
479 if (so->killedItems != NULL)
480 pfree(so->killedItems);
481 if (so->currTuples != NULL)
482 pfree(so->currTuples);
483 /* so->markTuples should not be pfree'd, see btrescan */
484 pfree(so);
485}
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:191

References _bt_killitems(), BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), MemoryContextDelete(), IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 575 of file nbtree.c.

576{
580
581 /*
582 * Pessimistically assume that every input scan key will be output with
583 * its own SAOP array
584 */
586 sizeof(int) * nkeys;
587
588 /* Single column indexes cannot possibly use a skip array */
589 if (nkeyatts == 1)
590 return estnbtreeshared;
591
592 /*
593 * Pessimistically assume that all attributes prior to the least
594 * significant attribute require a skip array (and an associated key)
595 */
596 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
597 sizeof(Datum));
598 for (int attnum = 1; attnum < nkeyatts; attnum++)
599 {
600 CompactAttribute *attr;
601
602 /*
603 * We make the conservative assumption that every index column will
604 * also require a skip array.
605 *
606 * Every skip array must have space to store its scan key's sk_flags.
607 * We also need space for each skip array's unused btps_arrElems slot
608 * (we need to be able to subscript btps_arrElems using a simple
609 * so->arrayKeys[]-wise offset for any subsequent SAOP arrays).
610 */
611 estnbtreeshared = add_size(estnbtreeshared, sizeof(int) * 2);
612
613 /* Consider space required to store a datum of opclass input type */
614 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
615 if (attr->attbyval)
616 {
617 /* This index attribute stores pass-by-value datums */
619 true, attr->attlen);
620
622 continue;
623 }
624
625 /*
626 * This index attribute stores pass-by-reference datums.
627 *
628 * Assume that serializing this array will use just as much space as a
629 * pass-by-value datum, in addition to space for the largest possible
630 * whole index tuple (this is not just a per-datum portion of the
631 * largest possible tuple because that'd be almost as large anyway).
632 *
633 * This is quite conservative, but it's not clear how we could do much
634 * better. The executor requires an up-front storage request size
635 * that reliably covers the scan's high watermark memory usage. We
636 * can't be sure of the real high watermark until the scan is over.
637 */
640 }
641
642 return estnbtreeshared;
643}
int16_t int16
Definition c.h:619
size_t Size
Definition c.h:689
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition datum.c:450
#define BTMaxItemSize
Definition nbtree.h:165
int16 attnum
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:535
Size add_size(Size s1, Size s2)
Definition shmem.c:1048
TupleDesc rd_att
Definition rel.h:112
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:195

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), fb(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap tbm 
)

Definition at line 291 of file nbtree.c.

292{
294 int64 ntids = 0;
295 ItemPointer heapTid;
296
297 Assert(scan->heapRelation == NULL);
298
299 /* Each loop iteration performs another primitive index scan */
300 do
301 {
302 /* Fetch the first page & tuple */
304 {
305 /* Save tuple ID, and continue scanning */
306 heapTid = &scan->xs_heaptid;
307 tbm_add_tuples(tbm, heapTid, 1, false);
308 ntids++;
309
310 for (;;)
311 {
312 /*
313 * Advance to next tuple within page. This is the same as the
314 * easy case in _bt_next().
315 */
316 if (++so->currPos.itemIndex > so->currPos.lastItem)
317 {
318 /* let _bt_next do the heavy lifting */
319 if (!_bt_next(scan, ForwardScanDirection))
320 break;
321 }
322
323 /* Save tuple ID, and continue scanning */
324 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
325 tbm_add_tuples(tbm, heapTid, 1, false);
326 ntids++;
327 }
328 }
329 /* Now see if we need another primitive index scan */
330 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
331
332 return ntids;
333}
int64_t int64
Definition c.h:621
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition nbtree.c:656
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:883
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
@ ForwardScanDirection
Definition sdir.h:28
ItemPointerData xs_heaptid
Definition relscan.h:185
Relation heapRelation
Definition relscan.h:149
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition tidbitmap.c:367

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert, fb(), ForwardScanDirection, IndexScanDescData::heapRelation, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1811 of file nbtree.c.

1812{
1813 return _bt_getrootheight(rel);
1814}
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:680

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 230 of file nbtree.c.

231{
233 bool res;
234
235 Assert(scan->heapRelation != NULL);
236
237 /* btree indexes are never lossy */
238 scan->xs_recheck = false;
239
240 /* Each loop iteration performs another primitive index scan */
241 do
242 {
243 /*
244 * If we've already initialized this scan, we can just advance it in
245 * the appropriate direction. If we haven't done so yet, we call
246 * _bt_first() to get the first item in the scan.
247 */
248 if (!BTScanPosIsValid(so->currPos))
249 res = _bt_first(scan, dir);
250 else
251 {
252 /*
253 * Check to see if we should kill the previously-fetched tuple.
254 */
255 if (scan->kill_prior_tuple)
256 {
257 /*
258 * Yes, remember it for later. (We'll deal with all such
259 * tuples at once right before leaving the index page.) The
260 * test for numKilled overrun is not just paranoia: if the
261 * caller reverses direction in the indexscan then the same
262 * item might get entered multiple times. It's not worth
263 * trying to optimize that, so we don't detect it, but instead
264 * just forget any excess entries.
265 */
266 if (so->killedItems == NULL)
267 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
268 if (so->numKilled < MaxTIDsPerBTreePage)
269 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
270 }
271
272 /*
273 * Now continue the scan.
274 */
275 res = _bt_next(scan, dir);
276 }
277
278 /* If we have a tuple, return it ... */
279 if (res)
280 break;
281 /* ... otherwise see if we need another primitive index scan */
282 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
283
284 return res;
285}
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
bool kill_prior_tuple
Definition relscan.h:160

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert, BTScanPosIsValid, fb(), IndexScanDescData::heapRelation, IndexScanDescData::kill_prior_tuple, MaxTIDsPerBTreePage, IndexScanDescData::opaque, palloc_array, and IndexScanDescData::xs_recheck.

Referenced by bthandler().

◆ bthandler()

Datum bthandler ( PG_FUNCTION_ARGS  )

Definition at line 118 of file nbtree.c.

119{
120 static const IndexAmRoutine amroutine = {
122 .amstrategies = BTMaxStrategyNumber,
123 .amsupport = BTNProcs,
124 .amoptsprocnum = BTOPTIONS_PROC,
125 .amcanorder = true,
126 .amcanorderbyop = false,
127 .amcanhash = false,
128 .amconsistentequality = true,
129 .amconsistentordering = true,
130 .amcanbackward = true,
131 .amcanunique = true,
132 .amcanmulticol = true,
133 .amoptionalkey = true,
134 .amsearcharray = true,
135 .amsearchnulls = true,
136 .amstorage = false,
137 .amclusterable = true,
138 .ampredlocks = true,
139 .amcanparallel = true,
140 .amcanbuildparallel = true,
141 .amcaninclude = true,
142 .amusemaintenanceworkmem = false,
143 .amsummarizing = false,
144 .amparallelvacuumoptions =
146 .amkeytype = InvalidOid,
147
148 .ambuild = btbuild,
149 .ambuildempty = btbuildempty,
150 .aminsert = btinsert,
151 .aminsertcleanup = NULL,
152 .ambulkdelete = btbulkdelete,
153 .amvacuumcleanup = btvacuumcleanup,
154 .amcanreturn = btcanreturn,
155 .amcostestimate = btcostestimate,
156 .amgettreeheight = btgettreeheight,
157 .amoptions = btoptions,
158 .amproperty = btproperty,
159 .ambuildphasename = btbuildphasename,
160 .amvalidate = btvalidate,
161 .amadjustmembers = btadjustmembers,
162 .ambeginscan = btbeginscan,
163 .amrescan = btrescan,
164 .amgettuple = btgettuple,
165 .amgetbitmap = btgetbitmap,
166 .amendscan = btendscan,
167 .ammarkpos = btmarkpos,
168 .amrestrpos = btrestrpos,
169 .amestimateparallelscan = btestimateparallelscan,
170 .aminitparallelscan = btinitparallelscan,
171 .amparallelrescan = btparallelrescan,
172 .amtranslatestrategy = bttranslatestrategy,
173 .amtranslatecmptype = bttranslatecmptype,
174 };
175
177}
#define PG_RETURN_POINTER(x)
Definition fmgr.h:363
bool btcanreturn(Relation index, int attno)
Definition nbtree.c:1802
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition nbtree.c:1837
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:339
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition nbtree.c:575
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition nbtree.c:1152
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition nbtree.c:1817
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition nbtree.c:230
void btparallelrescan(IndexScanDesc scan)
Definition nbtree.c:830
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition nbtree.c:206
void btbuildempty(Relation index)
Definition nbtree.c:183
int btgettreeheight(Relation rel)
Definition nbtree.c:1811
void btinitparallelscan(void *target)
Definition nbtree.c:814
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition nbtree.c:1122
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition nbtree.c:291
void btmarkpos(IndexScanDesc scan)
Definition nbtree.c:491
void btendscan(IndexScanDesc scan)
Definition nbtree.c:455
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition nbtree.c:388
void btrestrpos(IndexScanDesc scan)
Definition nbtree.c:517
#define BTNProcs
Definition nbtree.h:723
#define BTOPTIONS_PROC
Definition nbtree.h:721
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition nbtsort.c:299
char * btbuildphasename(int64 phasenum)
Definition nbtutils.c:644
bytea * btoptions(Datum reloptions, bool validate)
Definition nbtutils.c:598
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition nbtutils.c:621
bool btvalidate(Oid opclassoid)
Definition nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
#define InvalidOid
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition selfuncs.c:7703
#define BTMaxStrategyNumber
Definition stratnum.h:35
NodeTag type
Definition amapi.h:234
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition vacuum.h:47
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition vacuum.h:54

References btadjustmembers(), btbeginscan(), btbuild(), btbuildempty(), btbuildphasename(), btbulkdelete(), btcanreturn(), btcostestimate(), btendscan(), btestimateparallelscan(), btgetbitmap(), btgettreeheight(), btgettuple(), btinitparallelscan(), btinsert(), btmarkpos(), BTMaxStrategyNumber, BTNProcs, btoptions(), BTOPTIONS_PROC, btparallelrescan(), btproperty(), btrescan(), btrestrpos(), bttranslatecmptype(), bttranslatestrategy(), btvacuumcleanup(), btvalidate(), fb(), InvalidOid, PG_RETURN_POINTER, IndexAmRoutine::type, VACUUM_OPTION_PARALLEL_BULKDEL, and VACUUM_OPTION_PARALLEL_COND_CLEANUP.

◆ btinitparallelscan()

void btinitparallelscan ( void target)

Definition at line 814 of file nbtree.c.

815{
817
818 LWLockInitialize(&bt_target->btps_lock,
820 bt_target->btps_nextScanPage = InvalidBlockNumber;
821 bt_target->btps_lastCurrPage = InvalidBlockNumber;
822 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
824}
void ConditionVariableInit(ConditionVariable *cv)
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670

References BTPARALLEL_NOT_INITIALIZED, ConditionVariableInit(), fb(), InvalidBlockNumber, and LWLockInitialize().

Referenced by bthandler().

◆ btinsert()

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 *
 * Returns the uniqueness-check result from _bt_doinsert; the palloc'd index
 * tuple is freed here before returning.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 bool indexUnchanged,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

	pfree(itup);

	return result;
}
static Datum values[MAXATTR]
Definition bootstrap.c:190
uint32 result
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:105
ItemPointerData t_tid
Definition itup.h:37

References _bt_doinsert(), fb(), index_form_tuple(), pfree(), RelationGetDescr, result, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}
}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btparallelrescan()

/*
 *	btparallelrescan() -- reset parallel scan
 *
 * Resets the shared parallel-scan state so the scan can be restarted.
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												  parallel_scan->ps_offset_am);

	/*
	 * In theory, we don't need to acquire the LWLock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
	btscan->btps_nextScanPage = InvalidBlockNumber;
	btscan->btps_lastCurrPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	LWLockRelease(&btscan->btps_lock);
}

References Assert, BTPARALLEL_NOT_INITIALIZED, fb(), InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btreevacuumposting()

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build a replacement tuple without the
 * TIDs that VACUUM needs to delete.  Returned value is NULL in the common
 * case where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
				   OffsetNumber updatedoffset, int *nremaining)
{
	int			live = 0;
	int			nitem = BTreeTupleGetNPosting(posting);
	ItemPointer items = BTreeTupleGetPosting(posting);
	BTVacuumPosting vacposting = NULL;

	for (int i = 0; i < nitem; i++)
	{
		if (!vstate->callback(items + i, vstate->callback_state))
		{
			/* Live table TID */
			live++;
		}
		else if (vacposting == NULL)
		{
			/*
			 * First dead table TID encountered.
			 *
			 * It's now clear that we need to delete one or more dead table
			 * TIDs, so start maintaining metadata describing how to update
			 * existing posting list tuple.
			 */
			vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
								nitem * sizeof(uint16));

			vacposting->itup = posting;
			vacposting->updatedoffset = updatedoffset;
			vacposting->ndeletedtids = 0;
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
		else
		{
			/* Second or subsequent dead table TID */
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
	}

	*nremaining = live;
	return vacposting;
}
uint16_t uint16
Definition c.h:623
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538
static ItemArray items

References BTreeTupleGetNPosting(), BTreeTupleGetPosting(), fb(), i, items, and palloc().

Referenced by btvacuumpage().

◆ btrescan()

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	/*
	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
	 *
	 * We cannot safely drop leaf page pins during index-only scans due to a
	 * race condition involving VACUUM setting pages all-visible in the VM.
	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
	 *
	 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
	 * Pins cannot be held for more than an instant during bitmap scans either
	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
	 *
	 * See nbtree/README section on making concurrent TID recycling safe.
	 *
	 * Note: so->dropPin should never change across rescans.
	 */
	so->dropPin = (!scan->xs_want_itup &&
				   IsMVCCLikeSnapshot(scan->xs_snapshot) &&
				   scan->heapRelation != NULL);

	so->markItemIndex = -1;
	so->needPrimScan = false;
	so->scanBehind = false;
	so->oppositeDirCheck = false;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys
	 */
	if (scankey && scan->numberOfKeys > 0)
		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
	so->numArrayKeys = 0;		/* ditto */
}
#define IsMVCCLikeSnapshot(snapshot)
Definition snapmgr.h:74
struct ScanKeyData * keyData
Definition relscan.h:154
struct SnapshotData * xs_snapshot
Definition relscan.h:151

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), IndexScanDescData::heapRelation, IsMVCCLikeSnapshot, IndexScanDescData::keyData, memcpy(), IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
			/* Reset the scan's array keys (see _bt_steppage for why) */
			if (so->numArrayKeys)
			{
				_bt_start_array_keys(scan, so->currPos.dir);
				so->needPrimScan = false;
			}
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5670
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), IncrBufferRefCount(), items, memcpy(), and IndexScanDescData::opaque.

Referenced by bthandler().

◆ bttranslatecmptype()

/*
 * bttranslatecmptype() -- map a generic CompareType to the corresponding
 *		btree strategy number.
 *
 * Returns InvalidStrategy for comparison types btree has no operator for.
 */
StrategyNumber
bttranslatecmptype(CompareType cmptype, Oid opfamily)
{
	switch (cmptype)
	{
		case COMPARE_LT:
			return BTLessStrategyNumber;
		case COMPARE_LE:
			return BTLessEqualStrategyNumber;
		case COMPARE_EQ:
			return BTEqualStrategyNumber;
		case COMPARE_GE:
			return BTGreaterEqualStrategyNumber;
		case COMPARE_GT:
			return BTGreaterStrategyNumber;
		default:
			return InvalidStrategy;
	}
}
@ COMPARE_LE
Definition cmptype.h:35
@ COMPARE_GT
Definition cmptype.h:38
@ COMPARE_EQ
Definition cmptype.h:36
@ COMPARE_GE
Definition cmptype.h:37
@ COMPARE_LT
Definition cmptype.h:34
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTEqualStrategyNumber
Definition stratnum.h:31
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

/*
 * bttranslatestrategy() -- map a btree strategy number to the generic
 *		CompareType it implements.
 *
 * Returns COMPARE_INVALID for strategy numbers outside btree's five.
 */
CompareType
bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
{
	switch (strategy)
	{
		case BTLessStrategyNumber:
			return COMPARE_LT;
		case BTLessEqualStrategyNumber:
			return COMPARE_LE;
		case BTEqualStrategyNumber:
			return COMPARE_EQ;
		case BTGreaterEqualStrategyNumber:
			return COMPARE_GE;
		case BTGreaterStrategyNumber:
			return COMPARE_GT;
		default:
			return COMPARE_INVALID;
	}
}
@ COMPARE_INVALID
Definition cmptype.h:33

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().

◆ btvacuumcleanup()

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	BlockNumber num_delpages;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything (we just maintain
	 * the information used within _bt_vacuum_needs_cleanup() by calling
	 * _bt_set_cleanup_info() below).
	 *
	 * If btbulkdelete was _not_ called, then we have a choice to make: we
	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
	 * now.
	 */
	if (stats == NULL)
	{
		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
		if (!_bt_vacuum_needs_cleanup(info->index))
			return NULL;

		/*
		 * Since we aren't going to actually delete any leaf items, there's no
		 * need to go through all the vacuum-cycle-ID pushups here.
		 *
		 * Posting list tuples are a source of inaccuracy for cleanup-only
		 * scans.  btvacuumscan() will assume that the number of index tuples
		 * from each page can be used as num_index_tuples, even though
		 * num_index_tuples is supposed to represent the number of TIDs in the
		 * index.  This naive approach can underestimate the number of tuples
		 * in the index significantly.
		 *
		 * We handle the problem by making num_index_tuples an estimate in
		 * cleanup-only case.
		 */
		stats = palloc0_object(IndexBulkDeleteResult);
		btvacuumscan(info, stats, NULL, NULL, 0);
		stats->estimated_count = true;
	}

	/*
	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
	 *
	 * num_delpages is the number of deleted pages now in the index that were
	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
	 * greater than 0 only when _bt_pagedel() actually deleted pages during
	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
	 * have failed to place any newly deleted pages in the FSM just moments
	 * ago.  (Actually, there are edge cases where recycling of the current
	 * VACUUM's newly deleted pages does not even become safe by the time the
	 * next VACUUM comes around.  See nbtree/README.)
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, num_delpages);

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}
uint32 BlockNumber
Definition block.h:31
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition nbtpage.c:233
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition nbtpage.c:180
BlockNumber pages_deleted
Definition genam.h:90
BlockNumber pages_free
Definition genam.h:91
double num_index_tuples
Definition genam.h:87
double num_heap_tuples
Definition genam.h:60
bool analyze_only
Definition genam.h:56
bool estimated_count
Definition genam.h:58

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, fb(), IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0_object.

Referenced by bthandler().

◆ btvacuumpage()

/*
 * btvacuumpage --- VACUUM one page of the index
 *
 * Processes a single page for btvacuumscan().  In some cases we must
 * backtrack (via the "backtrack" label) to re-examine a right sibling page
 * from an earlier block, to handle page splits that occurred after the
 * vacuum scan began.
 *
 * Returns the block number of the scanblkno page (never a backtracked page).
 */
static BlockNumber
btvacuumpage(BTVacState *vstate, Buffer buf)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	Relation	heaprel = info->heaprel;
	bool		attempt_pagedel;
	BlockNumber blkno,
				backtrack_to,
				scanblkno = BufferGetBlockNumber(buf);
	Page		page;
	BTPageOpaque opaque;

	blkno = scanblkno;

backtrack:

	attempt_pagedel = false;
	backtrack_to = P_NONE;

	_bt_lockbuf(rel, buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = NULL;
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = BTPageGetOpaque(page);
	}

	Assert(blkno <= scanblkno);
	if (blkno != scanblkno)
	{
		/*
		 * We're backtracking.
		 *
		 * We followed a right link to a sibling leaf page (a page that
		 * happens to be from a block located before scanblkno).  The only
		 * case we want to do anything with is a live leaf page having the
		 * current vacuum cycle ID.
		 *
		 * The page had better be in a state that's consistent with what we
		 * expect.  Check for conditions that imply corruption in passing.  It
		 * can't be half-dead because only an interrupted VACUUM process can
		 * leave pages in that state, so we'd definitely have dealt with it
		 * back when the page was the scanblkno page (half-dead pages are
		 * always marked fully deleted by _bt_pagedel(), barring corruption).
		 */
		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
		{
			Assert(false);
			ereport(LOG,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
									 blkno, scanblkno, RelationGetRelationName(rel))));
			_bt_relbuf(rel, buf);
			return scanblkno;
		}

		/*
		 * We may have already processed the page in an earlier call, when the
		 * page was scanblkno.  This happens when the leaf page split occurred
		 * after the scan began, but before the right sibling page became the
		 * scanblkno.
		 *
		 * Page may also have been deleted by current btvacuumpage() call,
		 * since _bt_pagedel() sometimes deletes the right sibling page of
		 * scanblkno in passing (it does so after we decided where to
		 * backtrack to).  We don't need to process this page as a deleted
		 * page a second time now (in fact, it would be wrong to count it as a
		 * deleted page in the bulk delete statistics a second time).
		 */
		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
		{
			/* Done with current scanblkno (and all lower split pages) */
			_bt_relbuf(rel, buf);
			return scanblkno;
		}
	}

	if (!opaque || BTPageIsRecyclable(page, heaprel))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_deleted++;
		stats->pages_free++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
		attempt_pagedel = true;

		/*
		 * _bt_pagedel() will increment both pages_newly_deleted and
		 * pages_deleted stats in all cases (barring corruption)
		 */
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxIndexTuplesPerPage];
		int			ndeletable;
		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
		int			nupdatable;
		OffsetNumber offnum,
					minoff,
					maxoff;
		int			nhtidsdead,
					nhtidslive;

		/*
		 * Trade in the initial read lock for a full cleanup lock on this
		 * page.  We must get such a lock on every leaf page over the course
		 * of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		_bt_upgradelockbufcleanup(rel, buf);

		/*
		 * Check whether we need to backtrack to earlier pages.  What we are
		 * concerned about is a page split that happened since we started the
		 * vacuum scan.  If the split moved tuples on the right half of the
		 * split (i.e. the tuples that sort high) to a block that we already
		 * passed over, then we might have missed the tuples.  We need to
		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
		 * or deleting scanblkno page below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < scanblkno)
			backtrack_to = opaque->btpo_next;

		ndeletable = 0;
		nupdatable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		nhtidsdead = 0;
		nhtidslive = 0;
		if (callback)
		{
			/* btbulkdelete callback tells us what to delete (or update) */
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				Assert(!BTreeTupleIsPivot(itup));
				if (!BTreeTupleIsPosting(itup))
				{
					/* Regular tuple, standard table TID representation */
					if (callback(&itup->t_tid, callback_state))
					{
						deletable[ndeletable++] = offnum;
						nhtidsdead++;
					}
					else
						nhtidslive++;
				}
				else
				{
					BTVacuumPosting vacposting;
					int			nremaining;

					/* Posting list tuple */
					vacposting = btreevacuumposting(vstate, itup, offnum,
													&nremaining);
					if (vacposting == NULL)
					{
						/*
						 * All table TIDs from the posting tuple remain, so no
						 * delete or update required
						 */
						Assert(nremaining == BTreeTupleGetNPosting(itup));
					}
					else if (nremaining > 0)
					{

						/*
						 * Store metadata about posting list tuple in
						 * updatable array for entire page.  Existing tuple
						 * will be updated during the later call to
						 * _bt_delitems_vacuum().
						 */
						Assert(nremaining < BTreeTupleGetNPosting(itup));
						updatable[nupdatable++] = vacposting;
						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
					}
					else
					{
						/*
						 * All table TIDs from the posting list must be
						 * deleted.  We'll delete the index tuple completely
						 * (no update required).
						 */
						Assert(nremaining == 0);
						deletable[ndeletable++] = offnum;
						nhtidsdead += BTreeTupleGetNPosting(itup);
						pfree(vacposting);
					}

					nhtidslive += nremaining;
				}
			}
		}

		/*
		 * Apply any needed deletes or updates.  We issue just one
		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0 || nupdatable > 0)
		{
			Assert(nhtidsdead >= ndeletable + nupdatable);
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
								nupdatable);

			stats->tuples_removed += nhtidsdead;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);

			/* can't leak memory here */
			for (int i = 0; i < nupdatable; i++)
				pfree(updatable[i]);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			Assert(nhtidsdead == 0);
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples (live table TIDs in posting lists are counted as
		 * separate live tuples).  We don't delete when backtracking, though,
		 * since that would require teaching _bt_pagedel() about backtracking
		 * (doesn't seem worth adding more complexity to deal with that).
		 *
		 * We don't count the number of live TIDs during cleanup-only calls to
		 * btvacuumscan (i.e. when callback is not set).  We count the number
		 * of index tuples directly instead.  This avoids the expense of
		 * directly examining all of the tuples on each page.  VACUUM will
		 * treat num_index_tuples as an estimate in cleanup-only case, so it
		 * doesn't matter that this underestimates num_index_tuples
		 * significantly in some cases.
		 */
		if (minoff > maxoff)
			attempt_pagedel = (blkno == scanblkno);
		else if (callback)
			stats->num_index_tuples += nhtidslive;
		else
			stats->num_index_tuples += maxoff - minoff + 1;

		Assert(!attempt_pagedel || nhtidslive == 0);
	}

	if (attempt_pagedel)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * _bt_pagedel maintains the bulk delete stats on our behalf;
		 * pages_newly_deleted and pages_deleted are likely to be incremented
		 * during call
		 */
		Assert(blkno == scanblkno);
		_bt_pagedel(rel, buf, vstate);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	if (backtrack_to != P_NONE)
	{
		blkno = backtrack_to;

		/* check for vacuum delay while not holding any buffer lock */
		vacuum_delay_point(false);

		/*
		 * We can't use _bt_getbuf() here because it always applies
		 * _bt_checkpage(), which will barf on an all-zero page.  We want to
		 * recycle all-zero pages, not fail.  Also, we want to use a
		 * nondefault buffer access strategy.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								 info->strategy);
		goto backtrack;
	}

	return scanblkno;
}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4446
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5821
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
@ RBM_NORMAL
Definition bufmgr.h:46
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:268
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:378
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:396
int errcode(int sqlerrcode)
Definition elog.c:875
#define LOG
Definition elog.h:32
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define ereport(elevel,...)
Definition elog.h:152
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition genam.h:95
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition indexfsm.c:52
IndexTupleData * IndexTuple
Definition itup.h:53
#define MaxIndexTuplesPerPage
Definition itup.h:181
void MemoryContextReset(MemoryContext context)
Definition mcxt.c:403
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1044
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition nbtpage.c:1832
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition nbtpage.c:1182
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:802
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition nbtpage.c:1137
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1067
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition nbtree.c:1753
#define P_ISHALFDEAD(opaque)
Definition nbtree.h:225
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_ISDELETED(opaque)
Definition nbtree.h:223
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define BT_READ
Definition nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
#define BTP_SPLIT_END
Definition nbtree.h:82
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define RelationGetRelationName(relation)
Definition rel.h:550
@ MAIN_FORKNUM
Definition relpath.h:58
BlockNumber btpo_next
Definition nbtree.h:66
uint16 btpo_flags
Definition nbtree.h:68
BTCycleId btpo_cycleid
Definition nbtree.h:69
double tuples_removed
Definition genam.h:88
BufferAccessStrategy strategy
Definition genam.h:61
Relation heaprel
Definition genam.h:55
void vacuum_delay_point(bool is_analyze)
Definition vacuum.c:2433

References _bt_checkpage(), _bt_delitems_vacuum(), _bt_lockbuf(), _bt_pagedel(), _bt_relbuf(), _bt_upgradelockbufcleanup(), Assert, BT_READ, BTP_SPLIT_END, BTPageGetOpaque, BTPageIsRecyclable(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), btreevacuumposting(), buf, BufferGetBlockNumber(), BufferGetPage(), callback(), ereport, errcode(), errmsg_internal(), fb(), IndexVacuumInfo::heaprel, i, IndexVacuumInfo::index, LOG, MAIN_FORKNUM, MarkBufferDirtyHint(), MaxIndexTuplesPerPage, MemoryContextReset(), MemoryContextSwitchTo(), IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageIsNew(), IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, pfree(), RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), RelationGetRelationName, IndexVacuumInfo::strategy, IndexTupleData::t_tid, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().

Referenced by btvacuumscan().

◆ btvacuumscan()

static void btvacuumscan ( IndexVacuumInfo * info,
IndexBulkDeleteResult * stats,
IndexBulkDeleteCallback  callback,
void * callback_state,
BTCycleId  cycleid 
)
static

Definition at line 1240 of file nbtree.c.

1243{
1244 Relation rel = info->index;
1245 BTVacState vstate;
1246 BlockNumber num_pages;
1247 bool needLock;
1248 BlockRangeReadStreamPrivate p;
1249 ReadStream *stream = NULL;
1250
1251 /*
1252 * Reset fields that track information about the entire index now. This
1253 * avoids double-counting in the case where a single VACUUM command
1254 * requires multiple scans of the index.
1255 *
1256 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1257 * since they track information about the VACUUM command, and so must last
1258 * across each call to btvacuumscan().
1259 *
1260 * (Note that pages_free is treated as state about the whole index, not
1261 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1262 * calls are idempotent, and get repeated for the same deleted pages in
1263 * some scenarios. The point for us is to track the number of recyclable
1264 * pages in the index at the end of the VACUUM command.)
1265 */
1266 stats->num_pages = 0;
1267 stats->num_index_tuples = 0;
1268 stats->pages_deleted = 0;
1269 stats->pages_free = 0;
1270
1271 /* Set up info to pass down to btvacuumpage */
1272 vstate.info = info;
1273 vstate.stats = stats;
1274 vstate.callback = callback;
1275 vstate.callback_state = callback_state;
1276 vstate.cycleid = cycleid;
1277
1278 /* Create a temporary memory context to run _bt_pagedel in */
1279 vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1280 "_bt_pagedel",
1281 ALLOCSET_DEFAULT_SIZES);
1282
1283 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1284 vstate.bufsize = 0;
1285 vstate.maxbufsize = 0;
1286 vstate.pendingpages = NULL;
1287 vstate.npendingpages = 0;
1288 /* Consider applying _bt_pendingfsm_finalize optimization */
1289 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1290
1291 /*
1292 * The outer loop iterates over all index pages except the metapage, in
1293 * physical order (we hope the kernel will cooperate in providing
1294 * read-ahead for speed). It is critical that we visit all leaf pages,
1295 * including ones added after we start the scan, else we might fail to
1296 * delete some deletable tuples. Hence, we must repeatedly check the
1297 * relation length. We must acquire the relation-extension lock while
1298 * doing so to avoid a race condition: if someone else is extending the
1299 * relation, there is a window where bufmgr/smgr have created a new
1300 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1301 * we manage to scan such a page here, we'll improperly assume it can be
1302 * recycled. Taking the lock synchronizes things enough to prevent a
1303 * problem: either num_pages won't include the new page, or _bt_getbuf
1304 * already has write lock on the buffer and it will be fully initialized
1305 * before we can examine it. Also, we need not worry if a page is added
1306 * immediately after we look; the page splitting code already has
1307 * write-lock on the left page before it adds a right page, so we must
1308 * already have processed any tuples due to be moved into such a page.
1309 *
1310 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1311 * think the use of the extension lock is still required.
1312 *
1313 * We can skip locking for new or temp relations, however, since no one
1314 * else could be accessing them.
1315 */
1316 needLock = !RELATION_IS_LOCAL(rel);
1317
1318 p.current_blocknum = BTREE_METAPAGE + 1;
1319
1320 /*
1321 * It is safe to use batchmode as block_range_read_stream_cb takes no
1322 * locks.
1323 */
1324 stream = read_stream_begin_relation(READ_STREAM_FULL |
1325 READ_STREAM_MAINTENANCE |
1326 READ_STREAM_USE_BATCHING,
1327 info->strategy,
1328 rel,
1329 MAIN_FORKNUM,
1330 block_range_read_stream_cb,
1331 &p,
1332 0);
1333 for (;;)
1334 {
1335 /* Get the current relation length */
1336 if (needLock)
1337 LockRelationForExtension(rel, ExclusiveLock);
1338 num_pages = RelationGetNumberOfBlocks(rel);
1339 if (needLock)
1340 UnlockRelationForExtension(rel, ExclusiveLock);
1341
1342 if (info->report_progress)
1343 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1344 num_pages);
1345
1346 /* Quit if we've scanned the whole relation */
1347 if (p.current_blocknum >= num_pages)
1348 break;
1349
1350 p.last_exclusive = num_pages;
1351
1352 /* Iterate over pages, then loop back to recheck relation length */
1353 while (true)
1354 {
1355 BlockNumber current_block;
1356 Buffer buf;
1357
1358 /* call vacuum_delay_point while not holding any buffer lock */
1359 vacuum_delay_point(false);
1360
1361 buf = read_stream_next_buffer(stream, NULL);
1362
1363 if (!BufferIsValid(buf))
1364 break;
1365
1366 current_block = btvacuumpage(&vstate, buf);
1367
1368 if (info->report_progress)
1369 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1370 current_block);
1371 }
1372
1373 /*
1374 * We have to reset the read stream to use it again. After returning
1375 * InvalidBuffer, the read stream API won't invoke our callback again
1376 * until the stream has been reset.
1377 */
1378 read_stream_reset(stream);
1379 }
1380
1381 read_stream_end(stream);
1382
1383 /* Set statistics num_pages field to final size of index */
1384 stats->num_pages = num_pages;
1385
1386 MemoryContextDelete(vstate.pagedelcontext);
1387
1388 /*
1389 * If there were any calls to _bt_pagedel() during scan of the index then
1390 * see if any of the resulting pages can be placed in the FSM now. When
1391 * it's not safe we'll have to leave it up to a future VACUUM operation.
1392 *
1393 * Finally, if we placed any pages in the FSM (either just now or during
1394 * the scan), forcibly update the upper-level FSM pages to ensure that
1395 * searchers can find them.
1396 */
1397 _bt_pendingfsm_finalize(rel, &vstate);
1398 if (stats->pages_free > 0)
1399 IndexFreeSpaceMapVacuum(rel);
1400}
void pgstat_progress_update_param(int index, int64 val)
int Buffer
Definition buf.h:23
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:309
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
void IndexFreeSpaceMapVacuum(Relation rel)
Definition indexfsm.c:71
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition nbtpage.c:3033
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition nbtpage.c:2991
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition nbtree.c:1415
#define PROGRESS_SCAN_BLOCKS_DONE
Definition progress.h:151
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition progress.h:150
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition rel.h:659
BlockNumber num_pages
Definition genam.h:85
bool report_progress
Definition genam.h:57

References _bt_pendingfsm_finalize(), _bt_pendingfsm_init(), ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, block_range_read_stream_cb(), BTREE_METAPAGE, btvacuumpage(), buf, BufferIsValid(), callback(), BlockRangeReadStreamPrivate::current_blocknum, CurrentMemoryContext, ExclusiveLock, fb(), IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), BlockRangeReadStreamPrivate::last_exclusive, LockRelationForExtension(), MAIN_FORKNUM, MemoryContextDelete(), IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, pgstat_progress_update_param(), PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, read_stream_begin_relation(), read_stream_end(), READ_STREAM_FULL, READ_STREAM_MAINTENANCE, read_stream_next_buffer(), read_stream_reset(), READ_STREAM_USE_BATCHING, RELATION_IS_LOCAL, RelationGetNumberOfBlocks, IndexVacuumInfo::report_progress, IndexVacuumInfo::strategy, UnlockRelationForExtension(), and vacuum_delay_point().

Referenced by btbulkdelete(), and btvacuumcleanup().